1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e201c85SYohann // 49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause 59e201c85SYohann // 69e201c85SYohann // This file is part of CEED: http://github.com/ceed 79e201c85SYohann 89e201c85SYohann /// @file 99e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 119e201c85SYohann 129e201c85SYohann //------------------------------------------------------------------------------ 139e201c85SYohann // Load matrices for basis actions 149e201c85SYohann //------------------------------------------------------------------------------ 159e201c85SYohann template <int P, int Q> 16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 172b730f8bSJeremy L Thompson for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 189e201c85SYohann } 199e201c85SYohann 209e201c85SYohann //------------------------------------------------------------------------------ 213a2968d6SJeremy L Thompson // AtPoints 223a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 233a2968d6SJeremy L Thompson 243a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 253a2968d6SJeremy L Thompson // L-vector -> single point 263a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 273a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 283a2968d6SJeremy L Thompson inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 293a2968d6SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 303a2968d6SJeremy L Thompson const CeedInt ind = indices[p + elem * NUM_PTS]; 313a2968d6SJeremy L Thompson 323a2968d6SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 333a2968d6SJeremy L Thompson r_u[comp] = d_u[ind + comp * COMP_STRIDE]; 343a2968d6SJeremy L Thompson } 353a2968d6SJeremy L Thompson } 363a2968d6SJeremy L Thompson 373a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 383a2968d6SJeremy L Thompson // Single point -> L-vector 393a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 403a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 413a2968d6SJeremy L Thompson inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 423a2968d6SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { 433a2968d6SJeremy L Thompson if (p < points_in_elem) { 443a2968d6SJeremy L Thompson const CeedInt ind = indices[p + elem * NUM_PTS]; 453a2968d6SJeremy L Thompson 463a2968d6SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 473a2968d6SJeremy L Thompson d_u[ind + comp * COMP_STRIDE] += r_u[comp]; 483a2968d6SJeremy L Thompson } 493a2968d6SJeremy L Thompson } 503a2968d6SJeremy L Thompson } 513a2968d6SJeremy L Thompson 523a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 539e201c85SYohann // 1D 549e201c85SYohann //------------------------------------------------------------------------------ 559e201c85SYohann 569e201c85SYohann //------------------------------------------------------------------------------ 570183ed61SJeremy L Thompson // Set E-vector value 580183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 590183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 600183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 610183ed61SJeremy L Thompson const CeedInt target_comp = n / P_1D; 620183ed61SJeremy L Thompson const CeedInt target_node = n % P_1D; 630183ed61SJeremy L Thompson 640183ed61SJeremy L Thompson if (data.t_id_x == target_node) { 650183ed61SJeremy L Thompson r_v[target_comp] = value; 660183ed61SJeremy L Thompson } 670183ed61SJeremy L Thompson } 680183ed61SJeremy L Thompson 690183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 709e201c85SYohann // L-vector -> E-vector, offsets provided 719e201c85SYohann //------------------------------------------------------------------------------ 726b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 74672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 756b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 769e201c85SYohann const CeedInt node = data.t_id_x; 776b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D]; 78672b0f2aSSebastian Grimberg 79672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 809e201c85SYohann } 819e201c85SYohann } 829e201c85SYohann 839e201c85SYohann //------------------------------------------------------------------------------ 849e201c85SYohann // L-vector -> E-vector, strided 859e201c85SYohann //------------------------------------------------------------------------------ 866b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 886b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 899e201c85SYohann const CeedInt node = data.t_id_x; 909e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 91672b0f2aSSebastian Grimberg 92672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 939e201c85SYohann } 949e201c85SYohann } 959e201c85SYohann 969e201c85SYohann //------------------------------------------------------------------------------ 979e201c85SYohann // E-vector -> L-vector, offsets provided 989e201c85SYohann //------------------------------------------------------------------------------ 996b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 100f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 101672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 1026b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 1039e201c85SYohann const CeedInt node = data.t_id_x; 1046b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D]; 105672b0f2aSSebastian Grimberg 106672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 1079e201c85SYohann } 1089e201c85SYohann } 1099e201c85SYohann 1100183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 1110183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 1120183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 1130183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 1140183ed61SJeremy L Thompson const CeedInt target_comp = n / P_1D; 1150183ed61SJeremy L Thompson const CeedInt target_node = n % P_1D; 1160183ed61SJeremy L Thompson 1170183ed61SJeremy L Thompson if (data.t_id_x == target_node) { 1180183ed61SJeremy L Thompson const CeedInt ind = indices[target_node + elem * P_1D]; 1190183ed61SJeremy L Thompson 1200183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 1210183ed61SJeremy L Thompson } 1220183ed61SJeremy L Thompson } 1230183ed61SJeremy L Thompson 1249e201c85SYohann //------------------------------------------------------------------------------ 125692716b7SZach Atkins // E-vector -> L-vector, full assembly 126692716b7SZach Atkins //------------------------------------------------------------------------------ 127692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D> 128692716b7SZach Atkins inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 129692716b7SZach Atkins const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 130692716b7SZach Atkins const CeedInt in_comp = in / P_1D; 131692716b7SZach Atkins const CeedInt in_node = in % P_1D; 132692716b7SZach Atkins const CeedInt e_vec_size = P_1D * NUM_COMP; 133692716b7SZach Atkins 134692716b7SZach Atkins if (data.t_id_x < P_1D) { 135692716b7SZach Atkins const CeedInt out_node = data.t_id_x; 136692716b7SZach Atkins 137692716b7SZach Atkins for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138692716b7SZach Atkins d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp]; 139692716b7SZach Atkins } 140692716b7SZach Atkins } 141692716b7SZach Atkins } 142692716b7SZach Atkins 143692716b7SZach Atkins //------------------------------------------------------------------------------ 1445daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly 1455daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 1465daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D> 1475daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, 1485daefc96SJeremy L Thompson const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 1495daefc96SJeremy L Thompson if (data.t_id_x < Q_1D) { 1505daefc96SJeremy L Thompson const CeedInt ind = data.t_id_x + elem * Q_1D; 1515daefc96SJeremy L Thompson 1525daefc96SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { 1535daefc96SJeremy L Thompson d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp]; 1545daefc96SJeremy L Thompson } 1555daefc96SJeremy L Thompson } 1565daefc96SJeremy L Thompson } 1575daefc96SJeremy L Thompson 1585daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 1599e201c85SYohann // E-vector -> L-vector, strided 1609e201c85SYohann //------------------------------------------------------------------------------ 1616b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 162f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 163672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 1646b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 1659e201c85SYohann const CeedInt node = data.t_id_x; 1669e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 167672b0f2aSSebastian Grimberg 168672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 1699e201c85SYohann } 1709e201c85SYohann } 1719e201c85SYohann 1729e201c85SYohann //------------------------------------------------------------------------------ 1739e201c85SYohann // 2D 1749e201c85SYohann //------------------------------------------------------------------------------ 1759e201c85SYohann 1769e201c85SYohann //------------------------------------------------------------------------------ 1770183ed61SJeremy L Thompson // Set E-vector value 1780183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 1790183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 1800183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 1810183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D); 1820183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 1830183ed61SJeremy L Thompson const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 1840183ed61SJeremy L Thompson 1850183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 1860183ed61SJeremy L Thompson r_v[target_comp] = value; 1870183ed61SJeremy L Thompson } 1880183ed61SJeremy L Thompson } 1890183ed61SJeremy L Thompson 1900183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 1919e201c85SYohann // L-vector -> E-vector, offsets provided 1929e201c85SYohann //------------------------------------------------------------------------------ 1936b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 194f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 195672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 1966b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 1976b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 1986b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 199672b0f2aSSebastian Grimberg 200672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 2019e201c85SYohann } 2029e201c85SYohann } 2039e201c85SYohann 2049e201c85SYohann //------------------------------------------------------------------------------ 2059e201c85SYohann // L-vector -> E-vector, strided 2069e201c85SYohann //------------------------------------------------------------------------------ 2076b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 208f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 2096b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2106b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 2119e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 212672b0f2aSSebastian Grimberg 213672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 2149e201c85SYohann } 2159e201c85SYohann } 2169e201c85SYohann 2179e201c85SYohann //------------------------------------------------------------------------------ 2189e201c85SYohann // E-vector -> L-vector, offsets provided 2199e201c85SYohann //------------------------------------------------------------------------------ 2206b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 221f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 222672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 2236b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2246b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 2256b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 226672b0f2aSSebastian Grimberg 227672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 2289e201c85SYohann } 2299e201c85SYohann } 2309e201c85SYohann 2310183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 2320183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 2330183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 2340183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 2350183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D); 2360183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 2370183ed61SJeremy L Thompson const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 2380183ed61SJeremy L Thompson 2390183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 2400183ed61SJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 2410183ed61SJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 2420183ed61SJeremy L Thompson 2430183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 2440183ed61SJeremy L Thompson } 2450183ed61SJeremy L Thompson } 2460183ed61SJeremy L Thompson 2479e201c85SYohann //------------------------------------------------------------------------------ 248692716b7SZach Atkins // E-vector -> L-vector, full assembly 249692716b7SZach Atkins //------------------------------------------------------------------------------ 250692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D> 251692716b7SZach Atkins inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 252692716b7SZach Atkins const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 253692716b7SZach Atkins const CeedInt elem_size = P_1D * P_1D; 254692716b7SZach Atkins const CeedInt in_comp = in / elem_size; 255692716b7SZach Atkins const CeedInt in_node_x = in % P_1D; 256692716b7SZach Atkins const CeedInt in_node_y = (in % elem_size) / P_1D; 257692716b7SZach Atkins const CeedInt e_vec_size = elem_size * NUM_COMP; 258692716b7SZach Atkins 259692716b7SZach Atkins if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 260692716b7SZach Atkins const CeedInt in_node = in_node_x + in_node_y * P_1D; 261692716b7SZach Atkins const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D; 262692716b7SZach Atkins 263692716b7SZach Atkins for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 264692716b7SZach Atkins const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; 265692716b7SZach Atkins 266692716b7SZach Atkins d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp]; 267692716b7SZach Atkins } 268692716b7SZach Atkins } 269692716b7SZach Atkins } 270692716b7SZach Atkins 271692716b7SZach Atkins //------------------------------------------------------------------------------ 2725daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly 2735daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 2745daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D> 2755daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, 2765daefc96SJeremy L Thompson const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 2775daefc96SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 2785daefc96SJeremy L Thompson const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D; 2795daefc96SJeremy L Thompson 2805daefc96SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { 2815daefc96SJeremy L Thompson d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp]; 2825daefc96SJeremy L Thompson } 2835daefc96SJeremy L Thompson } 2845daefc96SJeremy L Thompson } 2855daefc96SJeremy L Thompson 2865daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 2879e201c85SYohann // E-vector -> L-vector, strided 2889e201c85SYohann //------------------------------------------------------------------------------ 2896b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 290f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 291672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 2926b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2936b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 2949e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 295672b0f2aSSebastian Grimberg 296672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 2979e201c85SYohann } 2989e201c85SYohann } 2999e201c85SYohann 3009e201c85SYohann //------------------------------------------------------------------------------ 3019e201c85SYohann // 3D 3029e201c85SYohann //------------------------------------------------------------------------------ 3039e201c85SYohann 3049e201c85SYohann //------------------------------------------------------------------------------ 3050183ed61SJeremy L Thompson // Set E-vector value 3060183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 3070183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 3080183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 3090183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 3100183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 3110183ed61SJeremy L Thompson const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 3120183ed61SJeremy L Thompson const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 3130183ed61SJeremy L Thompson 3140183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 3150183ed61SJeremy L Thompson r_v[target_node_z + target_comp * P_1D] = value; 3160183ed61SJeremy L Thompson } 3170183ed61SJeremy L Thompson } 3180183ed61SJeremy L Thompson 3190183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 3209e201c85SYohann // L-vector -> E-vector, offsets provided 3219e201c85SYohann //------------------------------------------------------------------------------ 3226b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 323f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 324672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 3256b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 3266b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 3276b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 3286b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 329672b0f2aSSebastian Grimberg 3306b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; 3319e201c85SYohann } 3329e201c85SYohann } 333688b5473SJeremy L Thompson } 3349e201c85SYohann 3359e201c85SYohann //------------------------------------------------------------------------------ 3369e201c85SYohann // L-vector -> E-vector, strided 3379e201c85SYohann //------------------------------------------------------------------------------ 3386b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 339f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 3406b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 3416b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 3426b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 3439e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 344672b0f2aSSebastian Grimberg 3456b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; 3469e201c85SYohann } 3479e201c85SYohann } 348688b5473SJeremy L Thompson } 3499e201c85SYohann 3509e201c85SYohann //------------------------------------------------------------------------------ 3519e201c85SYohann // E-vector -> Q-vector, offests provided 3529e201c85SYohann //------------------------------------------------------------------------------ 3536b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D> 354f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 355f815fac9SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 356f815fac9SJeremy L Thompson CeedScalar *__restrict__ r_u) { 3576b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 3586b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 3596b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 360672b0f2aSSebastian Grimberg 361672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 3629e201c85SYohann } 3639e201c85SYohann } 3649e201c85SYohann 3659e201c85SYohann //------------------------------------------------------------------------------ 3669e201c85SYohann // E-vector -> Q-vector, strided 3679e201c85SYohann //------------------------------------------------------------------------------ 3686b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 369f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 370672b0f2aSSebastian Grimberg CeedScalar *__restrict__ r_u) { 3716b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 3726b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 3739e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 374672b0f2aSSebastian Grimberg 375672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 3769e201c85SYohann } 3779e201c85SYohann } 3789e201c85SYohann 3799e201c85SYohann //------------------------------------------------------------------------------ 3809e201c85SYohann // E-vector -> L-vector, offsets provided 3819e201c85SYohann //------------------------------------------------------------------------------ 3826b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 383f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 384672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 3856b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 3866b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 3876b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 3886b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 389672b0f2aSSebastian Grimberg 3906b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); 3919e201c85SYohann } 3929e201c85SYohann } 393688b5473SJeremy L Thompson } 3949e201c85SYohann 3950183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 3960183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 3970183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 3980183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 3990183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 4000183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 4010183ed61SJeremy L Thompson const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 4020183ed61SJeremy L Thompson const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 4030183ed61SJeremy L Thompson 4040183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 4050183ed61SJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; 4060183ed61SJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 4070183ed61SJeremy L Thompson 4080183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); 4090183ed61SJeremy L Thompson } 4100183ed61SJeremy L Thompson } 4110183ed61SJeremy L Thompson 4129e201c85SYohann //------------------------------------------------------------------------------ 413692716b7SZach Atkins // E-vector -> L-vector, full assembly 414692716b7SZach Atkins //------------------------------------------------------------------------------ 415692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D> 416692716b7SZach Atkins inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 417692716b7SZach Atkins const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 418692716b7SZach Atkins const CeedInt elem_size = P_1D * P_1D * P_1D; 419692716b7SZach Atkins const CeedInt in_comp = in / elem_size; 420692716b7SZach Atkins const CeedInt in_node_x = in % P_1D; 421692716b7SZach Atkins const CeedInt in_node_y = (in % (P_1D * P_1D)) / P_1D; 422692716b7SZach Atkins const CeedInt in_node_z = (in % elem_size) / (P_1D * P_1D); 423692716b7SZach Atkins const CeedInt e_vec_size = elem_size * NUM_COMP; 424692716b7SZach Atkins 425692716b7SZach Atkins if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 426692716b7SZach Atkins const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D; 427692716b7SZach Atkins for (CeedInt z = 0; z < P_1D; z++) { 428692716b7SZach Atkins const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 429692716b7SZach Atkins 430692716b7SZach Atkins for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 431692716b7SZach Atkins const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; 432692716b7SZach Atkins 433692716b7SZach Atkins d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D]; 434692716b7SZach Atkins } 435692716b7SZach Atkins } 436692716b7SZach Atkins } 437692716b7SZach Atkins } 438692716b7SZach Atkins 439692716b7SZach Atkins //------------------------------------------------------------------------------ 4405daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly 4415daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 4425daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D> 4435daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, 4445daefc96SJeremy L Thompson const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 4455daefc96SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 4465daefc96SJeremy L Thompson for (CeedInt z = 0; z < Q_1D; z++) { 4475daefc96SJeremy L Thompson const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D; 4485daefc96SJeremy L Thompson 4495daefc96SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { 4505daefc96SJeremy L Thompson d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D]; 4515daefc96SJeremy L Thompson } 4525daefc96SJeremy L Thompson } 4535daefc96SJeremy L Thompson } 4545daefc96SJeremy L Thompson } 4555daefc96SJeremy L Thompson 4565daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 4579e201c85SYohann // E-vector -> L-vector, strided 4589e201c85SYohann //------------------------------------------------------------------------------ 4596b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 460f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 461672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 4626b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 4636b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 4646b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 4659e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 466672b0f2aSSebastian Grimberg 4676b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; 4689e201c85SYohann } 4699e201c85SYohann } 470688b5473SJeremy L Thompson } 4719e201c85SYohann 4729e201c85SYohann //------------------------------------------------------------------------------ 4739e201c85SYohann // 3D collocated derivatives computation 4749e201c85SYohann //------------------------------------------------------------------------------ 4756b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 476f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 4772b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4786b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 479672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 480d6c19ee8SJeremy L Thompson __syncthreads(); 4816b92dc4bSJeremy L Thompson data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; 4829e201c85SYohann __syncthreads(); 4839e201c85SYohann // X derivative 484672b0f2aSSebastian Grimberg r_V[comp + 0 * NUM_COMP] = 0.0; 4856b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4866b92dc4bSJeremy L Thompson r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 487688b5473SJeremy L Thompson } 4889e201c85SYohann // Y derivative 489672b0f2aSSebastian Grimberg r_V[comp + 1 * NUM_COMP] = 0.0; 4906b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4916b92dc4bSJeremy L Thompson r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 492688b5473SJeremy L Thompson } 4939e201c85SYohann // Z derivative 494672b0f2aSSebastian Grimberg r_V[comp + 2 * NUM_COMP] = 0.0; 4956b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4966b92dc4bSJeremy L Thompson r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; 497688b5473SJeremy L Thompson } 4989e201c85SYohann } 4999e201c85SYohann } 5009e201c85SYohann } 5019e201c85SYohann 5029e201c85SYohann //------------------------------------------------------------------------------ 5039e201c85SYohann // 3D collocated derivatives transpose 5049e201c85SYohann //------------------------------------------------------------------------------ 5056b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 506f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 5072b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5086b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 509672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5109e201c85SYohann // X derivative 511d6c19ee8SJeremy L Thompson __syncthreads(); 512672b0f2aSSebastian Grimberg data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 5139e201c85SYohann __syncthreads(); 5146b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 5156b92dc4bSJeremy L Thompson r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 516688b5473SJeremy L Thompson } 5179e201c85SYohann // Y derivative 518d6c19ee8SJeremy L Thompson __syncthreads(); 519672b0f2aSSebastian Grimberg data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 5209e201c85SYohann __syncthreads(); 5216b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 5226b92dc4bSJeremy L Thompson r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 523688b5473SJeremy L Thompson } 5249e201c85SYohann // Z derivative 5256b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 5266b92dc4bSJeremy L Thompson r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; 527688b5473SJeremy L Thompson } 5289e201c85SYohann } 5299e201c85SYohann } 5309e201c85SYohann } 531