// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#ifndef _ceed_cuda_gen_templates_h
#define _ceed_cuda_gen_templates_h

#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// All threads of the block cooperatively copy the P x Q basis matrix d_B into
// B, striding over the P*Q entries by the total block size.
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) {
    B[k] = d_B[k];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one node per thread into registers, indirecting through 'indices'.
// nnodes is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d) {
    const CeedInt n   = data.t_id_x;
    const CeedInt idx = indices[n + elem * P1d];
    for (CeedInt c = 0; c < NCOMP; ++c) {
      r_u[c] = d_u[idx + COMPSTRIDE * c];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (no indirection) variant of readDofsOffset1d.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d) {
    const CeedInt n   = data.t_id_x;
    const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c) {
      r_u[c] = d_u[idx + c * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add registers back to the L-vector; atomicAdd is required because
// distinct elements may map to the same L-vector node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d) {
    const CeedInt n   = data.t_id_x;
    const CeedInt idx = indices[n + elem * P1d];
    for (CeedInt c = 0; c < NCOMP; ++c) {
      atomicAdd(&d_v[idx + COMPSTRIDE * c], r_v[c]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided scatter; plain += (no atomics) — strided layouts give each node a
// unique destination per element.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d) {
    const CeedInt n   = data.t_id_x;
    const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c) {
      d_v[idx + c * STRIDES_COMP] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 2D gather: node index is built from the (t_id_x, t_id_y) thread coordinates.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
    const CeedInt idx = indices[n + elem * P1d * P1d];
    for (CeedInt c = 0; c < NCOMP; ++c) {
      r_u[c] = d_u[idx + COMPSTRIDE * c];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (no indirection) variant of readDofsOffset2d.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
    const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c) {
      r_u[c] = d_u[idx + c * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 2D scatter-add with indirection; atomicAdd handles shared L-vector nodes.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
    const CeedInt idx = indices[n + elem * P1d * P1d];
    for (CeedInt c = 0; c < NCOMP; ++c) {
      atomicAdd(&d_v[idx + COMPSTRIDE * c], r_v[c]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided 2D scatter; plain accumulation, no atomics.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
    const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c) {
      d_v[idx + c * STRIDES_COMP] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// TODO: remove "Dofs" and "Quads" in the following function names?
// - readDofsOffset3d -> readOffset3d ?
// - readDofsStrided3d -> readStrided3d ?
// - readSliceQuadsOffset3d -> readSliceOffset3d ?
// - readSliceQuadsStrided3d -> readSliceStrided3d ?
// - writeDofsOffset3d -> writeOffset3d ?
// - writeDofsStrided3d -> writeStrided3d ?
// Each active thread gathers a full z-column of nodes into registers,
// indirecting through 'indices'; r_u is laid out as [z + comp*P1d].
// nnodes is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n   = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt idx = indices[n + elem * P1d * P1d * P1d];
      for (CeedInt c = 0; c < NCOMP; ++c) {
        r_u[z + c * P1d] = d_u[idx + COMPSTRIDE * c];
      }
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (no indirection) variant of readDofsOffset3d; same register layout.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n   = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; ++c) {
        r_u[z + c * P1d] = d_u[idx + c * STRIDES_COMP];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Gather the z = q slice of quadrature-point values into registers,
// indirecting through 'indices'. nquads is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Strided (no indirection) variant of readSliceQuadsOffset3d.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add a z-column of register values back to the L-vector; atomicAdd is
// required because distinct elements may map to the same L-vector node.
// nnodes is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n   = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt idx = indices[n + elem * P1d * P1d * P1d];
      for (CeedInt c = 0; c < NCOMP; ++c) {
        atomicAdd(&d_v[idx + COMPSTRIDE * c], r_v[z + c * P1d]);
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided 3D scatter; plain accumulation, no atomics.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n   = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt idx = n * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; ++c) {
        d_v[idx + c * STRIDES_COMP] += r_v[z + c * P1d];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Collocated gradient at quadrature slice q: X/Y derivatives contract through
// shared memory (data.slice, row stride T_1D), Z derivative contracts in
// registers. r_U holds [q + comp*Q1d]; r_V receives [comp + dim*NCOMP].
//
// NOTE: the component loop runs for ALL threads of the block so that every
// __syncthreads() is reached in uniform control flow; only threads with
// t_id < Q1d touch data (blockDim may exceed Q1d, e.g. when sized by T_1D).
// The previous version placed the barriers inside the divergent guard, which
// is undefined behavior when some threads skip them.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's q-slice in shared memory
    if (data.t_id_x < Q1d && data.t_id_y < Q1d)
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
    __syncthreads();
    if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      // Z derivative (registers only, no shared memory)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites data.slice
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulates into r_V[q + comp*Q1d] (X/Y) and into
// the full z-column r_V[i + comp*Q1d] (partial Z contraction). r_V must be
// initialized by the caller since this only accumulates.
//
// NOTE: as in gradCollo3d, barriers are hoisted out of the divergent guard so
// every thread of the block reaches each __syncthreads().
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (data.t_id_x < Q1d && data.t_id_y < Q1d)
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (data.t_id_x < Q1d && data.t_id_y < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    if (data.t_id_x < Q1d && data.t_id_y < Q1d)
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      // Z derivative (registers only, no shared memory)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites data.slice
    __syncthreads();
  }
}

#endif