1*9e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9e201c85SYohann // 4*9e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause 5*9e201c85SYohann // 6*9e201c85SYohann // This file is part of CEED: http://github.com/ceed 7*9e201c85SYohann 8*9e201c85SYohann /// @file 9*9e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates 10*9e201c85SYohann #ifndef _ceed_cuda_shared_basis_tensor_templates_h 11*9e201c85SYohann #define _ceed_cuda_shared_basis_tensor_templates_h 12*9e201c85SYohann 13*9e201c85SYohann #include <ceed.h> 14*9e201c85SYohann 15*9e201c85SYohann //------------------------------------------------------------------------------ 16*9e201c85SYohann // 1D 17*9e201c85SYohann //------------------------------------------------------------------------------ 18*9e201c85SYohann 19*9e201c85SYohann //------------------------------------------------------------------------------ 20*9e201c85SYohann // 1D tensor contraction x 21*9e201c85SYohann //------------------------------------------------------------------------------ 22*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 23*9e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 24*9e201c85SYohann data.slice[data.t_id_x] = *U; 25*9e201c85SYohann __syncthreads(); 26*9e201c85SYohann *V = 0.0; 27*9e201c85SYohann if (data.t_id_x < Q_1D) { 28*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 29*9e201c85SYohann *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 30*9e201c85SYohann } 31*9e201c85SYohann } 32*9e201c85SYohann __syncthreads(); 33*9e201c85SYohann } 34*9e201c85SYohann 35*9e201c85SYohann //------------------------------------------------------------------------------ 36*9e201c85SYohann // 1D transpose tensor contraction x 37*9e201c85SYohann //------------------------------------------------------------------------------ 38*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 39*9e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 40*9e201c85SYohann data.slice[data.t_id_x] = *U; 41*9e201c85SYohann __syncthreads(); 42*9e201c85SYohann *V = 0.0; 43*9e201c85SYohann if (data.t_id_x < P_1D) { 44*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 45*9e201c85SYohann *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 46*9e201c85SYohann } 47*9e201c85SYohann } 48*9e201c85SYohann __syncthreads(); 49*9e201c85SYohann } 50*9e201c85SYohann 51*9e201c85SYohann //------------------------------------------------------------------------------ 52*9e201c85SYohann // 1D interpolate to quadrature points 53*9e201c85SYohann //------------------------------------------------------------------------------ 54*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 55*9e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 56*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 57*9e201c85SYohann ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 58*9e201c85SYohann } 59*9e201c85SYohann } 60*9e201c85SYohann 61*9e201c85SYohann //------------------------------------------------------------------------------ 62*9e201c85SYohann // 1D interpolate transpose 63*9e201c85SYohann //------------------------------------------------------------------------------ 64*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 65*9e201c85SYohann inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 66*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 67*9e201c85SYohann ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 68*9e201c85SYohann } 69*9e201c85SYohann } 70*9e201c85SYohann 71*9e201c85SYohann //------------------------------------------------------------------------------ 72*9e201c85SYohann // 1D derivatives at quadrature points 73*9e201c85SYohann //------------------------------------------------------------------------------ 74*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 75*9e201c85SYohann inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 76*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 77*9e201c85SYohann ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 78*9e201c85SYohann } 79*9e201c85SYohann } 80*9e201c85SYohann 81*9e201c85SYohann //------------------------------------------------------------------------------ 82*9e201c85SYohann // 1D derivatives transpose 83*9e201c85SYohann //------------------------------------------------------------------------------ 84*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 85*9e201c85SYohann inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 86*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87*9e201c85SYohann ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 88*9e201c85SYohann } 89*9e201c85SYohann } 90*9e201c85SYohann 91*9e201c85SYohann //------------------------------------------------------------------------------ 92*9e201c85SYohann // 1D quadrature weights 93*9e201c85SYohann //------------------------------------------------------------------------------ 94*9e201c85SYohann template <int Q_1D> 95*9e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96*9e201c85SYohann *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97*9e201c85SYohann } 98*9e201c85SYohann 99*9e201c85SYohann //------------------------------------------------------------------------------ 100*9e201c85SYohann // 2D 101*9e201c85SYohann //------------------------------------------------------------------------------ 102*9e201c85SYohann 103*9e201c85SYohann //------------------------------------------------------------------------------ 104*9e201c85SYohann // 2D tensor contraction x 105*9e201c85SYohann //------------------------------------------------------------------------------ 106*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 107*9e201c85SYohann inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 109*9e201c85SYohann __syncthreads(); 110*9e201c85SYohann *V = 0.0; 111*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 112*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 113*9e201c85SYohann *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 114*9e201c85SYohann } 115*9e201c85SYohann } 116*9e201c85SYohann __syncthreads(); 117*9e201c85SYohann } 118*9e201c85SYohann 119*9e201c85SYohann //------------------------------------------------------------------------------ 120*9e201c85SYohann // 2D tensor contract y 121*9e201c85SYohann //------------------------------------------------------------------------------ 122*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 123*9e201c85SYohann inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 125*9e201c85SYohann __syncthreads(); 126*9e201c85SYohann *V = 0.0; 127*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 128*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 129*9e201c85SYohann *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 130*9e201c85SYohann } 131*9e201c85SYohann } 132*9e201c85SYohann __syncthreads(); 133*9e201c85SYohann } 134*9e201c85SYohann 135*9e201c85SYohann //------------------------------------------------------------------------------ 136*9e201c85SYohann // 2D transpose tensor contract y 137*9e201c85SYohann //------------------------------------------------------------------------------ 138*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 139*9e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 141*9e201c85SYohann __syncthreads(); 142*9e201c85SYohann *V = 0.0; 143*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 144*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 145*9e201c85SYohann *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 146*9e201c85SYohann } 147*9e201c85SYohann } 148*9e201c85SYohann __syncthreads(); 149*9e201c85SYohann } 150*9e201c85SYohann 151*9e201c85SYohann //------------------------------------------------------------------------------ 152*9e201c85SYohann // 2D transpose tensor contract x 153*9e201c85SYohann //------------------------------------------------------------------------------ 154*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 155*9e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 157*9e201c85SYohann __syncthreads(); 158*9e201c85SYohann *V = 0.0; 159*9e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 160*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 161*9e201c85SYohann *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 162*9e201c85SYohann } 163*9e201c85SYohann } 164*9e201c85SYohann __syncthreads(); 165*9e201c85SYohann } 166*9e201c85SYohann 167*9e201c85SYohann //------------------------------------------------------------------------------ 168*9e201c85SYohann // 2D transpose tensor contract and add x 169*9e201c85SYohann //------------------------------------------------------------------------------ 170*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 171*9e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 173*9e201c85SYohann __syncthreads(); 174*9e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 175*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 176*9e201c85SYohann *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 177*9e201c85SYohann } 178*9e201c85SYohann } 179*9e201c85SYohann __syncthreads(); 180*9e201c85SYohann } 181*9e201c85SYohann 182*9e201c85SYohann //------------------------------------------------------------------------------ 183*9e201c85SYohann // 2D interpolate to quadrature points 184*9e201c85SYohann //------------------------------------------------------------------------------ 185*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 186*9e201c85SYohann inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 187*9e201c85SYohann CeedScalar r_t[1]; 188*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 189*9e201c85SYohann ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 190*9e201c85SYohann ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 191*9e201c85SYohann } 192*9e201c85SYohann } 193*9e201c85SYohann 194*9e201c85SYohann //------------------------------------------------------------------------------ 195*9e201c85SYohann // 2D interpolate transpose 196*9e201c85SYohann //------------------------------------------------------------------------------ 197*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 198*9e201c85SYohann inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 199*9e201c85SYohann CeedScalar r_t[1]; 200*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 201*9e201c85SYohann ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 202*9e201c85SYohann ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 203*9e201c85SYohann } 204*9e201c85SYohann } 205*9e201c85SYohann 206*9e201c85SYohann //------------------------------------------------------------------------------ 207*9e201c85SYohann // 2D derivatives at quadrature points 208*9e201c85SYohann //------------------------------------------------------------------------------ 209*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 210*9e201c85SYohann inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 211*9e201c85SYohann CeedScalar r_t[1]; 212*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 213*9e201c85SYohann ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t); 214*9e201c85SYohann ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0*NUM_COMP); 215*9e201c85SYohann ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 216*9e201c85SYohann ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1*NUM_COMP); 217*9e201c85SYohann } 218*9e201c85SYohann } 219*9e201c85SYohann 220*9e201c85SYohann //------------------------------------------------------------------------------ 221*9e201c85SYohann // 2D derivatives transpose 222*9e201c85SYohann //------------------------------------------------------------------------------ 223*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 224*9e201c85SYohann inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 225*9e201c85SYohann CeedScalar r_t[1]; 226*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 227*9e201c85SYohann ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0*NUM_COMP, c_B, r_t); 228*9e201c85SYohann ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp); 229*9e201c85SYohann ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1*NUM_COMP, c_G, r_t); 230*9e201c85SYohann ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 231*9e201c85SYohann } 232*9e201c85SYohann } 233*9e201c85SYohann 234*9e201c85SYohann //------------------------------------------------------------------------------ 235*9e201c85SYohann // 2D quadrature weights 236*9e201c85SYohann //------------------------------------------------------------------------------ 237*9e201c85SYohann template <int Q_1D> 238*9e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 239*9e201c85SYohann *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? 240*9e201c85SYohann q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; 241*9e201c85SYohann } 242*9e201c85SYohann 243*9e201c85SYohann //------------------------------------------------------------------------------ 244*9e201c85SYohann // 3D 245*9e201c85SYohann //------------------------------------------------------------------------------ 246*9e201c85SYohann 247*9e201c85SYohann //------------------------------------------------------------------------------ 248*9e201c85SYohann // 3D tensor contract x 249*9e201c85SYohann //------------------------------------------------------------------------------ 250*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 251*9e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 252*9e201c85SYohann CeedScalar r_B[P_1D]; 253*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 254*9e201c85SYohann r_B[i] = B[i + data.t_id_x*P_1D]; 255*9e201c85SYohann } 256*9e201c85SYohann 257*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 258*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 259*9e201c85SYohann __syncthreads(); 260*9e201c85SYohann V[k] = 0.0; 261*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 262*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 263*9e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 264*9e201c85SYohann } 265*9e201c85SYohann } 266*9e201c85SYohann __syncthreads(); 267*9e201c85SYohann } 268*9e201c85SYohann } 269*9e201c85SYohann 270*9e201c85SYohann //------------------------------------------------------------------------------ 271*9e201c85SYohann // 3D tensor contract y 272*9e201c85SYohann //------------------------------------------------------------------------------ 273*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 274*9e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 275*9e201c85SYohann CeedScalar r_B[P_1D]; 276*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 277*9e201c85SYohann r_B[i] = B[i + data.t_id_y*P_1D]; 278*9e201c85SYohann } 279*9e201c85SYohann 280*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 281*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 282*9e201c85SYohann __syncthreads(); 283*9e201c85SYohann V[k] = 0.0; 284*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 285*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 286*9e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 287*9e201c85SYohann } 288*9e201c85SYohann } 289*9e201c85SYohann __syncthreads(); 290*9e201c85SYohann } 291*9e201c85SYohann } 292*9e201c85SYohann 293*9e201c85SYohann //------------------------------------------------------------------------------ 294*9e201c85SYohann // 3D tensor contract z 295*9e201c85SYohann //------------------------------------------------------------------------------ 296*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 297*9e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 298*9e201c85SYohann for (CeedInt k = 0; k < Q_1D; k++) { 299*9e201c85SYohann V[k] = 0.0; 300*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 301*9e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 302*9e201c85SYohann V[k] += B[i + k*P_1D] * U[i]; // Contract z direction 303*9e201c85SYohann } 304*9e201c85SYohann } 305*9e201c85SYohann } 306*9e201c85SYohann } 307*9e201c85SYohann 308*9e201c85SYohann //------------------------------------------------------------------------------ 309*9e201c85SYohann // 3D transpose tensor contract z 310*9e201c85SYohann //------------------------------------------------------------------------------ 311*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 312*9e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 313*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 314*9e201c85SYohann V[k] = 0.0; 315*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 316*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 317*9e201c85SYohann V[k] += B[k + i*P_1D] * U[i]; // Contract z direction 318*9e201c85SYohann } 319*9e201c85SYohann } 320*9e201c85SYohann } 321*9e201c85SYohann } 322*9e201c85SYohann 323*9e201c85SYohann //------------------------------------------------------------------------------ 324*9e201c85SYohann // 3D transpose tensor contract y 325*9e201c85SYohann //------------------------------------------------------------------------------ 326*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 327*9e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 328*9e201c85SYohann CeedScalar r_B[Q_1D]; 329*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 330*9e201c85SYohann r_B[i] = B[data.t_id_y + i*P_1D]; 331*9e201c85SYohann } 332*9e201c85SYohann 333*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 334*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 335*9e201c85SYohann __syncthreads(); 336*9e201c85SYohann V[k] = 0.0; 337*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 338*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 339*9e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 340*9e201c85SYohann } 341*9e201c85SYohann } 342*9e201c85SYohann __syncthreads(); 343*9e201c85SYohann } 344*9e201c85SYohann } 345*9e201c85SYohann 346*9e201c85SYohann //------------------------------------------------------------------------------ 347*9e201c85SYohann // 3D transpose tensor contract y 348*9e201c85SYohann //------------------------------------------------------------------------------ 349*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 350*9e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 351*9e201c85SYohann CeedScalar r_B[Q_1D]; 352*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 353*9e201c85SYohann r_B[i] = B[data.t_id_y + i*P_1D]; 354*9e201c85SYohann } 355*9e201c85SYohann 356*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 357*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 358*9e201c85SYohann __syncthreads(); 359*9e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 360*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 361*9e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 362*9e201c85SYohann } 363*9e201c85SYohann } 364*9e201c85SYohann __syncthreads(); 365*9e201c85SYohann } 366*9e201c85SYohann } 367*9e201c85SYohann 368*9e201c85SYohann //------------------------------------------------------------------------------ 369*9e201c85SYohann // 3D transpose tensor contract x 370*9e201c85SYohann //------------------------------------------------------------------------------ 371*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 372*9e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 373*9e201c85SYohann CeedScalar r_B[Q_1D]; 374*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 375*9e201c85SYohann r_B[i] = B[data.t_id_x + i*P_1D]; 376*9e201c85SYohann } 377*9e201c85SYohann 378*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 379*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 380*9e201c85SYohann __syncthreads(); 381*9e201c85SYohann V[k] = 0.0; 382*9e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 383*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 384*9e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 385*9e201c85SYohann } 386*9e201c85SYohann } 387*9e201c85SYohann __syncthreads(); 388*9e201c85SYohann } 389*9e201c85SYohann } 390*9e201c85SYohann 391*9e201c85SYohann //------------------------------------------------------------------------------ 392*9e201c85SYohann // 3D transpose tensor contract add x 393*9e201c85SYohann //------------------------------------------------------------------------------ 394*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 395*9e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 396*9e201c85SYohann CeedScalar r_B[Q_1D]; 397*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 398*9e201c85SYohann r_B[i] = B[data.t_id_x + i*P_1D]; 399*9e201c85SYohann } 400*9e201c85SYohann 401*9e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 402*9e201c85SYohann data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 403*9e201c85SYohann __syncthreads(); 404*9e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 405*9e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 406*9e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 407*9e201c85SYohann } 408*9e201c85SYohann } 409*9e201c85SYohann __syncthreads(); 410*9e201c85SYohann } 411*9e201c85SYohann } 412*9e201c85SYohann 413*9e201c85SYohann //------------------------------------------------------------------------------ 414*9e201c85SYohann // 3D interpolate to quadrature points 415*9e201c85SYohann //------------------------------------------------------------------------------ 416*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 417*9e201c85SYohann inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 418*9e201c85SYohann CeedScalar r_t1[T_1D]; 419*9e201c85SYohann CeedScalar r_t2[T_1D]; 420*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 421*9e201c85SYohann ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 422*9e201c85SYohann ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 423*9e201c85SYohann ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D); 424*9e201c85SYohann } 425*9e201c85SYohann } 426*9e201c85SYohann 427*9e201c85SYohann //------------------------------------------------------------------------------ 428*9e201c85SYohann // 3D interpolate transpose 429*9e201c85SYohann //------------------------------------------------------------------------------ 430*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 431*9e201c85SYohann inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 432*9e201c85SYohann CeedScalar r_t1[T_1D]; 433*9e201c85SYohann CeedScalar r_t2[T_1D]; 434*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 435*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D, c_B, r_t1); 436*9e201c85SYohann ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 437*9e201c85SYohann ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 438*9e201c85SYohann } 439*9e201c85SYohann } 440*9e201c85SYohann 441*9e201c85SYohann //------------------------------------------------------------------------------ 442*9e201c85SYohann // 3D derivatives at quadrature points 443*9e201c85SYohann //------------------------------------------------------------------------------ 444*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 445*9e201c85SYohann inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 446*9e201c85SYohann CeedScalar r_t1[T_1D]; 447*9e201c85SYohann CeedScalar r_t2[T_1D]; 448*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 449*9e201c85SYohann ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_G, r_t1); 450*9e201c85SYohann ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 451*9e201c85SYohann ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); 452*9e201c85SYohann ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 453*9e201c85SYohann ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 454*9e201c85SYohann ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); 455*9e201c85SYohann ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 456*9e201c85SYohann ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 457*9e201c85SYohann ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); 458*9e201c85SYohann } 459*9e201c85SYohann } 460*9e201c85SYohann 461*9e201c85SYohann //------------------------------------------------------------------------------ 462*9e201c85SYohann // 3D derivatives transpose 463*9e201c85SYohann //------------------------------------------------------------------------------ 464*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 465*9e201c85SYohann inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 466*9e201c85SYohann CeedScalar r_t1[T_1D]; 467*9e201c85SYohann CeedScalar r_t2[T_1D]; 468*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 469*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1); 470*9e201c85SYohann ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 471*9e201c85SYohann ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*P_1D); 472*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1); 473*9e201c85SYohann ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 474*9e201c85SYohann ContractTransposeAddX3d<NUM_COMP,P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 475*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1); 476*9e201c85SYohann ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 477*9e201c85SYohann ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 478*9e201c85SYohann } 479*9e201c85SYohann } 480*9e201c85SYohann 481*9e201c85SYohann //------------------------------------------------------------------------------ 482*9e201c85SYohann // 3D derivatives at quadrature points 483*9e201c85SYohann //------------------------------------------------------------------------------ 484*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 485*9e201c85SYohann inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 486*9e201c85SYohann CeedScalar r_t1[T_1D]; 487*9e201c85SYohann CeedScalar r_t2[T_1D]; 488*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 489*9e201c85SYohann ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 490*9e201c85SYohann ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 491*9e201c85SYohann ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 492*9e201c85SYohann ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); 493*9e201c85SYohann ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); 494*9e201c85SYohann ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); 495*9e201c85SYohann } 496*9e201c85SYohann } 497*9e201c85SYohann 498*9e201c85SYohann //------------------------------------------------------------------------------ 499*9e201c85SYohann // 3D derivatives transpose 500*9e201c85SYohann //------------------------------------------------------------------------------ 501*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 502*9e201c85SYohann inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 503*9e201c85SYohann CeedScalar r_t1[T_1D]; 504*9e201c85SYohann CeedScalar r_t2[T_1D]; 505*9e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 506*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2); 507*9e201c85SYohann ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2); 508*9e201c85SYohann ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2); 509*9e201c85SYohann ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 510*9e201c85SYohann ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 511*9e201c85SYohann ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 512*9e201c85SYohann } 513*9e201c85SYohann } 514*9e201c85SYohann 515*9e201c85SYohann //------------------------------------------------------------------------------ 516*9e201c85SYohann // 3D quadrature weights 517*9e201c85SYohann //------------------------------------------------------------------------------ 518*9e201c85SYohann template <int Q_1D> 519*9e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 520*9e201c85SYohann const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 521*9e201c85SYohann const CeedScalar pw = quad ? q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; 522*9e201c85SYohann for (CeedInt q = 0; q < Q_1D; q++) { 523*9e201c85SYohann w[q] = quad ? pw*q_weight_1d[q] : 0.0; 524*9e201c85SYohann } 525*9e201c85SYohann } 526*9e201c85SYohann 527*9e201c85SYohann //------------------------------------------------------------------------------ 528*9e201c85SYohann 529*9e201c85SYohann #endif 530