1*9e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9e201c85SYohann // 4*9e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause 5*9e201c85SYohann // 6*9e201c85SYohann // This file is part of CEED: http://github.com/ceed 7*9e201c85SYohann 8*9e201c85SYohann /// @file 9*9e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source 10*9e201c85SYohann #ifndef _ceed_hip_gen_templates_h 11*9e201c85SYohann #define _ceed_hip_gen_templates_h 12*9e201c85SYohann 13*9e201c85SYohann #include <ceed/types.h> 14*9e201c85SYohann 15*9e201c85SYohann 16*9e201c85SYohann //------------------------------------------------------------------------------ 17*9e201c85SYohann // Load matrices for basis actions 18*9e201c85SYohann //------------------------------------------------------------------------------ 19*9e201c85SYohann template <int P, int Q> 20*9e201c85SYohann inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) { 21*9e201c85SYohann for (CeedInt i = data.t_id; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 22*9e201c85SYohann B[i] = d_B[i]; 23*9e201c85SYohann } 24*9e201c85SYohann 25*9e201c85SYohann //------------------------------------------------------------------------------ 26*9e201c85SYohann // 1D 27*9e201c85SYohann //------------------------------------------------------------------------------ 28*9e201c85SYohann 29*9e201c85SYohann //------------------------------------------------------------------------------ 30*9e201c85SYohann // L-vector -> E-vector, offsets provided 31*9e201c85SYohann //------------------------------------------------------------------------------ 32*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 33*9e201c85SYohann inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 34*9e201c85SYohann if (data.t_id_x < P1d) { 35*9e201c85SYohann const CeedInt node = data.t_id_x; 36*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d]; 37*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 38*9e201c85SYohann r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 39*9e201c85SYohann } 40*9e201c85SYohann } 41*9e201c85SYohann 42*9e201c85SYohann //------------------------------------------------------------------------------ 43*9e201c85SYohann // L-vector -> E-vector, strided 44*9e201c85SYohann //------------------------------------------------------------------------------ 45*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 46*9e201c85SYohann inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 47*9e201c85SYohann if (data.t_id_x < P1d) { 48*9e201c85SYohann const CeedInt node = data.t_id_x; 49*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 50*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 51*9e201c85SYohann r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 52*9e201c85SYohann } 53*9e201c85SYohann } 54*9e201c85SYohann 55*9e201c85SYohann //------------------------------------------------------------------------------ 56*9e201c85SYohann // E-vector -> L-vector, offsets provided 57*9e201c85SYohann //------------------------------------------------------------------------------ 58*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 59*9e201c85SYohann inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 60*9e201c85SYohann if (data.t_id_x < P1d) { 61*9e201c85SYohann const CeedInt node = data.t_id_x; 62*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d]; 63*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 64*9e201c85SYohann atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 65*9e201c85SYohann } 66*9e201c85SYohann } 67*9e201c85SYohann 68*9e201c85SYohann //------------------------------------------------------------------------------ 69*9e201c85SYohann // E-vector -> L-vector, strided 70*9e201c85SYohann //------------------------------------------------------------------------------ 71*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 72*9e201c85SYohann inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 73*9e201c85SYohann if (data.t_id_x < P1d) { 74*9e201c85SYohann const CeedInt node = data.t_id_x; 75*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 76*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 77*9e201c85SYohann d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 78*9e201c85SYohann } 79*9e201c85SYohann } 80*9e201c85SYohann 81*9e201c85SYohann //------------------------------------------------------------------------------ 82*9e201c85SYohann // 2D 83*9e201c85SYohann //------------------------------------------------------------------------------ 84*9e201c85SYohann 85*9e201c85SYohann //------------------------------------------------------------------------------ 86*9e201c85SYohann // L-vector -> E-vector, offsets provided 87*9e201c85SYohann //------------------------------------------------------------------------------ 88*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 89*9e201c85SYohann inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 90*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) { 91*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d; 92*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d*P1d]; 93*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 94*9e201c85SYohann r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 95*9e201c85SYohann } 96*9e201c85SYohann } 97*9e201c85SYohann 98*9e201c85SYohann //------------------------------------------------------------------------------ 99*9e201c85SYohann // L-vector -> E-vector, strided 100*9e201c85SYohann //------------------------------------------------------------------------------ 101*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 102*9e201c85SYohann inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 103*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) { 104*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d; 105*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 106*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 107*9e201c85SYohann r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 108*9e201c85SYohann } 109*9e201c85SYohann } 110*9e201c85SYohann 111*9e201c85SYohann //------------------------------------------------------------------------------ 112*9e201c85SYohann // E-vector -> L-vector, offsets provided 113*9e201c85SYohann //------------------------------------------------------------------------------ 114*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 115*9e201c85SYohann inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 116*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) { 117*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d; 118*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d*P1d]; 119*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 120*9e201c85SYohann atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 121*9e201c85SYohann } 122*9e201c85SYohann } 123*9e201c85SYohann 124*9e201c85SYohann //------------------------------------------------------------------------------ 125*9e201c85SYohann // E-vector -> L-vector, strided 126*9e201c85SYohann //------------------------------------------------------------------------------ 127*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 128*9e201c85SYohann inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 129*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) { 130*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d; 131*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 132*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 133*9e201c85SYohann d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 134*9e201c85SYohann } 135*9e201c85SYohann } 136*9e201c85SYohann 137*9e201c85SYohann //------------------------------------------------------------------------------ 138*9e201c85SYohann // 3D 139*9e201c85SYohann //------------------------------------------------------------------------------ 140*9e201c85SYohann 141*9e201c85SYohann //------------------------------------------------------------------------------ 142*9e201c85SYohann // L-vector -> E-vector, offsets provided 143*9e201c85SYohann //------------------------------------------------------------------------------ 144*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 145*9e201c85SYohann inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 146*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) 147*9e201c85SYohann for (CeedInt z = 0; z < P1d; ++z) { 148*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; 149*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 150*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 151*9e201c85SYohann r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 152*9e201c85SYohann } 153*9e201c85SYohann } 154*9e201c85SYohann 155*9e201c85SYohann //------------------------------------------------------------------------------ 156*9e201c85SYohann // L-vector -> E-vector, strided 157*9e201c85SYohann //------------------------------------------------------------------------------ 158*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 159*9e201c85SYohann inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 160*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) 161*9e201c85SYohann for (CeedInt z = 0; z < P1d; ++z) { 162*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; 163*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 164*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 165*9e201c85SYohann r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 166*9e201c85SYohann } 167*9e201c85SYohann } 168*9e201c85SYohann 169*9e201c85SYohann //------------------------------------------------------------------------------ 170*9e201c85SYohann // E-vector -> Q-vector, offests provided 171*9e201c85SYohann //------------------------------------------------------------------------------ 172*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int Q1d> 173*9e201c85SYohann inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 174*9e201c85SYohann if (data.t_id_x < Q1d && data.t_id_y < Q1d) { 175*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; 176*9e201c85SYohann const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 177*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 178*9e201c85SYohann r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 179*9e201c85SYohann } 180*9e201c85SYohann } 181*9e201c85SYohann 182*9e201c85SYohann //------------------------------------------------------------------------------ 183*9e201c85SYohann // E-vector -> Q-vector, strided 184*9e201c85SYohann //------------------------------------------------------------------------------ 185*9e201c85SYohann template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 186*9e201c85SYohann inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 187*9e201c85SYohann if (data.t_id_x < Q1d && data.t_id_y < Q1d) { 188*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; 189*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 190*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 191*9e201c85SYohann r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 192*9e201c85SYohann } 193*9e201c85SYohann } 194*9e201c85SYohann 195*9e201c85SYohann //------------------------------------------------------------------------------ 196*9e201c85SYohann // E-vector -> L-vector, offsets provided 197*9e201c85SYohann //------------------------------------------------------------------------------ 198*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d> 199*9e201c85SYohann inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 200*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) 201*9e201c85SYohann for (CeedInt z = 0; z < P1d; ++z) { 202*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; 203*9e201c85SYohann const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 204*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 205*9e201c85SYohann atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 206*9e201c85SYohann } 207*9e201c85SYohann } 208*9e201c85SYohann 209*9e201c85SYohann //------------------------------------------------------------------------------ 210*9e201c85SYohann // E-vector -> L-vector, strided 211*9e201c85SYohann //------------------------------------------------------------------------------ 212*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 213*9e201c85SYohann inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 214*9e201c85SYohann if (data.t_id_x < P1d && data.t_id_y < P1d) 215*9e201c85SYohann for (CeedInt z = 0; z < P1d; ++z) { 216*9e201c85SYohann const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; 217*9e201c85SYohann const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 218*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) 219*9e201c85SYohann d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 220*9e201c85SYohann } 221*9e201c85SYohann } 222*9e201c85SYohann 223*9e201c85SYohann //------------------------------------------------------------------------------ 224*9e201c85SYohann // 3D collocated derivatives computation 225*9e201c85SYohann //------------------------------------------------------------------------------ 226*9e201c85SYohann template <int NCOMP, int Q1d> 227*9e201c85SYohann inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 228*9e201c85SYohann if (data.t_id_x < Q1d && data.t_id_y < Q1d) { 229*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 230*9e201c85SYohann data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d]; 231*9e201c85SYohann __syncthreads(); 232*9e201c85SYohann // X derivative 233*9e201c85SYohann r_V[comp+0*NCOMP] = 0.0; 234*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 235*9e201c85SYohann r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) 236*9e201c85SYohann // Y derivative 237*9e201c85SYohann r_V[comp+1*NCOMP] = 0.0; 238*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 239*9e201c85SYohann r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) 240*9e201c85SYohann // Z derivative 241*9e201c85SYohann r_V[comp+2*NCOMP] = 0.0; 242*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 243*9e201c85SYohann r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 244*9e201c85SYohann __syncthreads(); 245*9e201c85SYohann } 246*9e201c85SYohann } 247*9e201c85SYohann } 248*9e201c85SYohann 249*9e201c85SYohann //------------------------------------------------------------------------------ 250*9e201c85SYohann // 3D collocated derivatives transpose 251*9e201c85SYohann //------------------------------------------------------------------------------ 252*9e201c85SYohann template <int NCOMP, int Q1d> 253*9e201c85SYohann inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 254*9e201c85SYohann if (data.t_id_x < Q1d && data.t_id_y < Q1d) { 255*9e201c85SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 256*9e201c85SYohann // X derivative 257*9e201c85SYohann data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP]; 258*9e201c85SYohann __syncthreads(); 259*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 260*9e201c85SYohann r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) 261*9e201c85SYohann __syncthreads(); 262*9e201c85SYohann // Y derivative 263*9e201c85SYohann data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP]; 264*9e201c85SYohann __syncthreads(); 265*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 266*9e201c85SYohann r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) 267*9e201c85SYohann __syncthreads(); 268*9e201c85SYohann // Z derivative 269*9e201c85SYohann for (CeedInt i = 0; i < Q1d; ++i) 270*9e201c85SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 271*9e201c85SYohann } 272*9e201c85SYohann } 273*9e201c85SYohann } 274*9e201c85SYohann 275*9e201c85SYohann #endif 276