1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e201c85SYohann // 49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause 59e201c85SYohann // 69e201c85SYohann // This file is part of CEED: http://github.com/ceed 79e201c85SYohann 89e201c85SYohann /// @file 99e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 119e201c85SYohann 129e201c85SYohann //------------------------------------------------------------------------------ 139e201c85SYohann // Load matrices for basis actions 149e201c85SYohann //------------------------------------------------------------------------------ 159e201c85SYohann template <int P, int Q> 16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 172b730f8bSJeremy L Thompson for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 189e201c85SYohann } 199e201c85SYohann 209e201c85SYohann //------------------------------------------------------------------------------ 213a2968d6SJeremy L Thompson // AtPoints 223a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 233a2968d6SJeremy L Thompson 243a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 253a2968d6SJeremy L Thompson // L-vector -> single point 263a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 273a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 283a2968d6SJeremy L Thompson inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 293a2968d6SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 303a2968d6SJeremy L Thompson const CeedInt ind = indices[p + elem * NUM_PTS]; 313a2968d6SJeremy L Thompson 323a2968d6SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 333a2968d6SJeremy L Thompson r_u[comp] = d_u[ind + comp * COMP_STRIDE]; 343a2968d6SJeremy L Thompson } 353a2968d6SJeremy L Thompson } 363a2968d6SJeremy L Thompson 373a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 383a2968d6SJeremy L Thompson // Single point -> L-vector 393a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 403a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 413a2968d6SJeremy L Thompson inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 423a2968d6SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { 433a2968d6SJeremy L Thompson if (p < points_in_elem) { 443a2968d6SJeremy L Thompson const CeedInt ind = indices[p + elem * NUM_PTS]; 453a2968d6SJeremy L Thompson 463a2968d6SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 473a2968d6SJeremy L Thompson d_u[ind + comp * COMP_STRIDE] += r_u[comp]; 483a2968d6SJeremy L Thompson } 493a2968d6SJeremy L Thompson } 503a2968d6SJeremy L Thompson } 513a2968d6SJeremy L Thompson 523a2968d6SJeremy L Thompson //------------------------------------------------------------------------------ 539e201c85SYohann // 1D 549e201c85SYohann //------------------------------------------------------------------------------ 559e201c85SYohann 569e201c85SYohann //------------------------------------------------------------------------------ 57*0183ed61SJeremy L Thompson // Set E-vector value 58*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 59*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 60*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 61*0183ed61SJeremy L Thompson const CeedInt target_comp = n / P_1D; 62*0183ed61SJeremy L Thompson const CeedInt target_node = n % P_1D; 63*0183ed61SJeremy L Thompson 64*0183ed61SJeremy L Thompson if (data.t_id_x == target_node) { 65*0183ed61SJeremy L Thompson r_v[target_comp] = value; 66*0183ed61SJeremy L Thompson } 67*0183ed61SJeremy L Thompson } 68*0183ed61SJeremy L Thompson 69*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 709e201c85SYohann // L-vector -> E-vector, offsets provided 719e201c85SYohann //------------------------------------------------------------------------------ 726b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 74672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 756b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 769e201c85SYohann const CeedInt node = data.t_id_x; 776b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D]; 78672b0f2aSSebastian Grimberg 79672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 809e201c85SYohann } 819e201c85SYohann } 829e201c85SYohann 839e201c85SYohann //------------------------------------------------------------------------------ 849e201c85SYohann // L-vector -> E-vector, strided 859e201c85SYohann //------------------------------------------------------------------------------ 866b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 886b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 899e201c85SYohann const CeedInt node = data.t_id_x; 909e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 91672b0f2aSSebastian Grimberg 92672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 939e201c85SYohann } 949e201c85SYohann } 959e201c85SYohann 969e201c85SYohann //------------------------------------------------------------------------------ 979e201c85SYohann // E-vector -> L-vector, offsets provided 989e201c85SYohann //------------------------------------------------------------------------------ 996b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 100f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 101672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 1026b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 1039e201c85SYohann const CeedInt node = data.t_id_x; 1046b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D]; 105672b0f2aSSebastian Grimberg 106672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 1079e201c85SYohann } 1089e201c85SYohann } 1099e201c85SYohann 110*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 111*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 112*0183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 113*0183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 114*0183ed61SJeremy L Thompson const CeedInt target_comp = n / P_1D; 115*0183ed61SJeremy L Thompson const CeedInt target_node = n % P_1D; 116*0183ed61SJeremy L Thompson 117*0183ed61SJeremy L Thompson if (data.t_id_x == target_node) { 118*0183ed61SJeremy L Thompson const CeedInt ind = indices[target_node + elem * P_1D]; 119*0183ed61SJeremy L Thompson 120*0183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 121*0183ed61SJeremy L Thompson } 122*0183ed61SJeremy L Thompson } 123*0183ed61SJeremy L Thompson 1249e201c85SYohann //------------------------------------------------------------------------------ 1259e201c85SYohann // E-vector -> L-vector, strided 1269e201c85SYohann //------------------------------------------------------------------------------ 1276b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 128f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 129672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 1306b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D) { 1319e201c85SYohann const CeedInt node = data.t_id_x; 1329e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 133672b0f2aSSebastian Grimberg 134672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 1359e201c85SYohann } 1369e201c85SYohann } 1379e201c85SYohann 1389e201c85SYohann //------------------------------------------------------------------------------ 1399e201c85SYohann // 2D 1409e201c85SYohann //------------------------------------------------------------------------------ 1419e201c85SYohann 1429e201c85SYohann //------------------------------------------------------------------------------ 143*0183ed61SJeremy L Thompson // Set E-vector value 144*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 145*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 146*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 147*0183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D); 148*0183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 149*0183ed61SJeremy L Thompson const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 150*0183ed61SJeremy L Thompson 151*0183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 152*0183ed61SJeremy L Thompson r_v[target_comp] = value; 153*0183ed61SJeremy L Thompson } 154*0183ed61SJeremy L Thompson } 155*0183ed61SJeremy L Thompson 156*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 1579e201c85SYohann // L-vector -> E-vector, offsets provided 1589e201c85SYohann //------------------------------------------------------------------------------ 1596b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 160f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 161672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 1626b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 1636b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 1646b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 165672b0f2aSSebastian Grimberg 166672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 1679e201c85SYohann } 1689e201c85SYohann } 1699e201c85SYohann 1709e201c85SYohann //------------------------------------------------------------------------------ 1719e201c85SYohann // L-vector -> E-vector, strided 1729e201c85SYohann //------------------------------------------------------------------------------ 1736b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 174f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 1756b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 1766b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 1779e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 178672b0f2aSSebastian Grimberg 179672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 1809e201c85SYohann } 1819e201c85SYohann } 1829e201c85SYohann 1839e201c85SYohann //------------------------------------------------------------------------------ 1849e201c85SYohann // E-vector -> L-vector, offsets provided 1859e201c85SYohann //------------------------------------------------------------------------------ 1866b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 187f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 188672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 1896b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 1906b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 1916b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 192672b0f2aSSebastian Grimberg 193672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 1949e201c85SYohann } 1959e201c85SYohann } 1969e201c85SYohann 197*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 198*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 199*0183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 200*0183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 201*0183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D); 202*0183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 203*0183ed61SJeremy L Thompson const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 204*0183ed61SJeremy L Thompson 205*0183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 206*0183ed61SJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 207*0183ed61SJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D]; 208*0183ed61SJeremy L Thompson 209*0183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 210*0183ed61SJeremy L Thompson } 211*0183ed61SJeremy L Thompson } 212*0183ed61SJeremy L Thompson 2139e201c85SYohann //------------------------------------------------------------------------------ 2149e201c85SYohann // E-vector -> L-vector, strided 2159e201c85SYohann //------------------------------------------------------------------------------ 2166b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 217f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 218672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 2196b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2206b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 2219e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 222672b0f2aSSebastian Grimberg 223672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 2249e201c85SYohann } 2259e201c85SYohann } 2269e201c85SYohann 2279e201c85SYohann //------------------------------------------------------------------------------ 2289e201c85SYohann // 3D 2299e201c85SYohann //------------------------------------------------------------------------------ 2309e201c85SYohann 2319e201c85SYohann //------------------------------------------------------------------------------ 232*0183ed61SJeremy L Thompson // Set E-vector value 233*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 234*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D> 235*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 236*0183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 237*0183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 238*0183ed61SJeremy L Thompson const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 239*0183ed61SJeremy L Thompson const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 240*0183ed61SJeremy L Thompson 241*0183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 242*0183ed61SJeremy L Thompson r_v[target_node_z + target_comp * P_1D] = value; 243*0183ed61SJeremy L Thompson } 244*0183ed61SJeremy L Thompson } 245*0183ed61SJeremy L Thompson 246*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 2479e201c85SYohann // L-vector -> E-vector, offsets provided 2489e201c85SYohann //------------------------------------------------------------------------------ 2496b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 250f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 251672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 2526b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2536b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 2546b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 2556b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 256672b0f2aSSebastian Grimberg 2576b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; 2589e201c85SYohann } 2599e201c85SYohann } 260688b5473SJeremy L Thompson } 2619e201c85SYohann 2629e201c85SYohann //------------------------------------------------------------------------------ 2639e201c85SYohann // L-vector -> E-vector, strided 2649e201c85SYohann //------------------------------------------------------------------------------ 2656b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 266f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 2676b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 2686b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 2696b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 2709e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 271672b0f2aSSebastian Grimberg 2726b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; 2739e201c85SYohann } 2749e201c85SYohann } 275688b5473SJeremy L Thompson } 2769e201c85SYohann 2779e201c85SYohann //------------------------------------------------------------------------------ 2789e201c85SYohann // E-vector -> Q-vector, offests provided 2799e201c85SYohann //------------------------------------------------------------------------------ 2806b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D> 281f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 282f815fac9SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 283f815fac9SJeremy L Thompson CeedScalar *__restrict__ r_u) { 2846b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 2856b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 2866b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 287672b0f2aSSebastian Grimberg 288672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 2899e201c85SYohann } 2909e201c85SYohann } 2919e201c85SYohann 2929e201c85SYohann //------------------------------------------------------------------------------ 2939e201c85SYohann // E-vector -> Q-vector, strided 2949e201c85SYohann //------------------------------------------------------------------------------ 2956b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 296f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 297672b0f2aSSebastian Grimberg CeedScalar *__restrict__ r_u) { 2986b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 2996b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 3009e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 301672b0f2aSSebastian Grimberg 302672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 3039e201c85SYohann } 3049e201c85SYohann } 3059e201c85SYohann 3069e201c85SYohann //------------------------------------------------------------------------------ 3079e201c85SYohann // E-vector -> L-vector, offsets provided 3089e201c85SYohann //------------------------------------------------------------------------------ 3096b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 310f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 311672b0f2aSSebastian Grimberg const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 3126b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 3136b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 3146b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 3156b92dc4bSJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 316672b0f2aSSebastian Grimberg 3176b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); 3189e201c85SYohann } 3199e201c85SYohann } 320688b5473SJeremy L Thompson } 3219e201c85SYohann 322*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D> 323*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 324*0183ed61SJeremy L Thompson const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 325*0183ed61SJeremy L Thompson CeedScalar *__restrict__ d_v) { 326*0183ed61SJeremy L Thompson const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 327*0183ed61SJeremy L Thompson const CeedInt target_node_x = n % P_1D; 328*0183ed61SJeremy L Thompson const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 329*0183ed61SJeremy L Thompson const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 330*0183ed61SJeremy L Thompson 331*0183ed61SJeremy L Thompson if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 332*0183ed61SJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; 333*0183ed61SJeremy L Thompson const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 334*0183ed61SJeremy L Thompson 335*0183ed61SJeremy L Thompson atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); 336*0183ed61SJeremy L Thompson } 337*0183ed61SJeremy L Thompson } 338*0183ed61SJeremy L Thompson 3399e201c85SYohann //------------------------------------------------------------------------------ 3409e201c85SYohann // E-vector -> L-vector, strided 3419e201c85SYohann //------------------------------------------------------------------------------ 3426b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 343f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 344672b0f2aSSebastian Grimberg CeedScalar *__restrict__ d_v) { 3456b92dc4bSJeremy L Thompson if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 3466b92dc4bSJeremy L Thompson for (CeedInt z = 0; z < P_1D; z++) { 3476b92dc4bSJeremy L Thompson const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 3489e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 349672b0f2aSSebastian Grimberg 3506b92dc4bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; 3519e201c85SYohann } 3529e201c85SYohann } 353688b5473SJeremy L Thompson } 3549e201c85SYohann 3559e201c85SYohann //------------------------------------------------------------------------------ 3569e201c85SYohann // 3D collocated derivatives computation 3579e201c85SYohann //------------------------------------------------------------------------------ 3586b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 359f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 3602b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 3616b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 362672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 363d6c19ee8SJeremy L Thompson __syncthreads(); 3646b92dc4bSJeremy L Thompson data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; 3659e201c85SYohann __syncthreads(); 3669e201c85SYohann // X derivative 367672b0f2aSSebastian Grimberg r_V[comp + 0 * NUM_COMP] = 0.0; 3686b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3696b92dc4bSJeremy L Thompson r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 370688b5473SJeremy L Thompson } 3719e201c85SYohann // Y derivative 372672b0f2aSSebastian Grimberg r_V[comp + 1 * NUM_COMP] = 0.0; 3736b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3746b92dc4bSJeremy L Thompson r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 375688b5473SJeremy L Thompson } 3769e201c85SYohann // Z derivative 377672b0f2aSSebastian Grimberg r_V[comp + 2 * NUM_COMP] = 0.0; 3786b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3796b92dc4bSJeremy L Thompson r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; 380688b5473SJeremy L Thompson } 3819e201c85SYohann } 3829e201c85SYohann } 3839e201c85SYohann } 3849e201c85SYohann 3859e201c85SYohann //------------------------------------------------------------------------------ 3869e201c85SYohann // 3D collocated derivatives transpose 3879e201c85SYohann //------------------------------------------------------------------------------ 3886b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 389f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 3902b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 3916b92dc4bSJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 392672b0f2aSSebastian Grimberg for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3939e201c85SYohann // X derivative 394d6c19ee8SJeremy L Thompson __syncthreads(); 395672b0f2aSSebastian Grimberg data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 3969e201c85SYohann __syncthreads(); 3976b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3986b92dc4bSJeremy L Thompson r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 399688b5473SJeremy L Thompson } 4009e201c85SYohann // Y derivative 401d6c19ee8SJeremy L Thompson __syncthreads(); 402672b0f2aSSebastian Grimberg data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 4039e201c85SYohann __syncthreads(); 4046b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4056b92dc4bSJeremy L Thompson r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 406688b5473SJeremy L Thompson } 4079e201c85SYohann // Z derivative 4086b92dc4bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4096b92dc4bSJeremy L Thompson r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; 410688b5473SJeremy L Thompson } 4119e201c85SYohann } 4129e201c85SYohann } 4139e201c85SYohann } 414