1*9b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9b91271bSJeremy L Thompson // 4*9b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*9b91271bSJeremy L Thompson // 6*9b91271bSJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*9b91271bSJeremy L Thompson 8*9b91271bSJeremy L Thompson /// @file 9*9b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates 10*9b91271bSJeremy L Thompson #include <ceed/types.h> 11*9b91271bSJeremy L Thompson 12*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 13*9b91271bSJeremy L Thompson // 2D 14*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 15*9b91271bSJeremy L Thompson 16*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 17*9b91271bSJeremy L Thompson // 2D tensor contraction x 18*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 19*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20*9b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21*9b91271bSJeremy L Thompson CeedScalar *V) { 22*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 23*9b91271bSJeremy L Thompson __syncthreads(); 24*9b91271bSJeremy L Thompson *V = 0.0; 25*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 26*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 27*9b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 28*9b91271bSJeremy L Thompson } 29*9b91271bSJeremy L Thompson } 30*9b91271bSJeremy L Thompson __syncthreads(); 31*9b91271bSJeremy L Thompson } 32*9b91271bSJeremy L Thompson 33*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 34*9b91271bSJeremy L Thompson // 2D tensor contract y 35*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 36*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37*9b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38*9b91271bSJeremy L Thompson CeedScalar *V) { 39*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 40*9b91271bSJeremy L Thompson __syncthreads(); 41*9b91271bSJeremy L Thompson *V = 0.0; 42*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 43*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 44*9b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 45*9b91271bSJeremy L Thompson } 46*9b91271bSJeremy L Thompson } 47*9b91271bSJeremy L Thompson __syncthreads(); 48*9b91271bSJeremy L Thompson } 49*9b91271bSJeremy L Thompson 50*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 51*9b91271bSJeremy L Thompson // 2D transpose tensor contract y 52*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 53*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 57*9b91271bSJeremy L Thompson __syncthreads(); 58*9b91271bSJeremy L Thompson *V = 0.0; 59*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 60*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 61*9b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 62*9b91271bSJeremy L Thompson } 63*9b91271bSJeremy L Thompson } 64*9b91271bSJeremy L Thompson __syncthreads(); 65*9b91271bSJeremy L Thompson } 66*9b91271bSJeremy L Thompson 67*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 68*9b91271bSJeremy L Thompson // 2D transpose tensor contract x 69*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 70*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 74*9b91271bSJeremy L Thompson __syncthreads(); 75*9b91271bSJeremy L Thompson *V = 0.0; 76*9b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 77*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 78*9b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 79*9b91271bSJeremy L Thompson } 80*9b91271bSJeremy L Thompson } 81*9b91271bSJeremy L Thompson __syncthreads(); 82*9b91271bSJeremy L Thompson } 83*9b91271bSJeremy L Thompson 84*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 85*9b91271bSJeremy L Thompson // 2D transpose tensor contract and add x 86*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 87*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 91*9b91271bSJeremy L Thompson __syncthreads(); 92*9b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 93*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 94*9b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 95*9b91271bSJeremy L Thompson } 96*9b91271bSJeremy L Thompson } 97*9b91271bSJeremy L Thompson __syncthreads(); 98*9b91271bSJeremy L Thompson } 99*9b91271bSJeremy L Thompson 100*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 101*9b91271bSJeremy L Thompson // 2D pack/unpack quadrature values 102*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 103*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104*9b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105*9b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106*9b91271bSJeremy L Thompson 107*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 109*9b91271bSJeremy L Thompson __syncthreads(); 110*9b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 111*9b91271bSJeremy L Thompson __syncthreads(); 112*9b91271bSJeremy L Thompson } 113*9b91271bSJeremy L Thompson } 114*9b91271bSJeremy L Thompson 115*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116*9b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117*9b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118*9b91271bSJeremy L Thompson 119*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120*9b91271bSJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 121*9b91271bSJeremy L Thompson __syncthreads(); 122*9b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 123*9b91271bSJeremy L Thompson __syncthreads(); 124*9b91271bSJeremy L Thompson } 125*9b91271bSJeremy L Thompson } 126*9b91271bSJeremy L Thompson 127*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 128*9b91271bSJeremy L Thompson // 2D interpolate to quadrature points 129*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 130*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131*9b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 133*9b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134*9b91271bSJeremy L Thompson CeedScalar r_t[1]; 135*9b91271bSJeremy L Thompson 136*9b91271bSJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138*9b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139*9b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140*9b91271bSJeremy L Thompson } 141*9b91271bSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 142*9b91271bSJeremy L Thompson QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 143*9b91271bSJeremy L Thompson } 144*9b91271bSJeremy L Thompson 145*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 146*9b91271bSJeremy L Thompson // 2D interpolate transpose 147*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 148*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 149*9b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 150*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 151*9b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 152*9b91271bSJeremy L Thompson CeedScalar r_t[1]; 153*9b91271bSJeremy L Thompson 154*9b91271bSJeremy L Thompson QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 155*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 156*9b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 157*9b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 158*9b91271bSJeremy L Thompson } 159*9b91271bSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 160*9b91271bSJeremy L Thompson } 161*9b91271bSJeremy L Thompson 162*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 163*9b91271bSJeremy L Thompson // 2D derivatives at quadrature points 164*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 165*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 166*9b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 167*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 168*9b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 169*9b91271bSJeremy L Thompson CeedScalar r_t[1]; 170*9b91271bSJeremy L Thompson 171*9b91271bSJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 172*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 173*9b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 174*9b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 175*9b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 176*9b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 177*9b91271bSJeremy L Thompson } 178*9b91271bSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 179*9b91271bSJeremy L Thompson QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 180*9b91271bSJeremy L Thompson } 181*9b91271bSJeremy L Thompson 182*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 183*9b91271bSJeremy L Thompson // 2D derivatives transpose 184*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 185*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 187*9b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 188*9b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 189*9b91271bSJeremy L Thompson CeedScalar r_t[1]; 190*9b91271bSJeremy L Thompson 191*9b91271bSJeremy L Thompson QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 192*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 193*9b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 194*9b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 195*9b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 196*9b91271bSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 197*9b91271bSJeremy L Thompson } 198*9b91271bSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 199*9b91271bSJeremy L Thompson } 200*9b91271bSJeremy L Thompson 201*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 202*9b91271bSJeremy L Thompson // 2D quadrature weights 203*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 204*9b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 205*9b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 206*9b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 207*9b91271bSJeremy L Thompson 208*9b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 209*9b91271bSJeremy L Thompson } 210*9b91271bSJeremy L Thompson 211*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 212*9b91271bSJeremy L Thompson // 3D 213*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 214*9b91271bSJeremy L Thompson 215*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 216*9b91271bSJeremy L Thompson // 3D tensor contract x 217*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 218*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 219*9b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 220*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 221*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 222*9b91271bSJeremy L Thompson __syncthreads(); 223*9b91271bSJeremy L Thompson *V = 0.0; 224*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 225*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 226*9b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 227*9b91271bSJeremy L Thompson } 228*9b91271bSJeremy L Thompson } 229*9b91271bSJeremy L Thompson __syncthreads(); 230*9b91271bSJeremy L Thompson } 231*9b91271bSJeremy L Thompson 232*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 233*9b91271bSJeremy L Thompson // 3D tensor contract y 234*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 235*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 236*9b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 237*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 238*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 239*9b91271bSJeremy L Thompson __syncthreads(); 240*9b91271bSJeremy L Thompson *V = 0.0; 241*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 242*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 243*9b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 244*9b91271bSJeremy L Thompson } 245*9b91271bSJeremy L Thompson } 246*9b91271bSJeremy L Thompson __syncthreads(); 247*9b91271bSJeremy L Thompson } 248*9b91271bSJeremy L Thompson 249*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 250*9b91271bSJeremy L Thompson // 3D tensor contract z 251*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 252*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 253*9b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 254*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 255*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 256*9b91271bSJeremy L Thompson __syncthreads(); 257*9b91271bSJeremy L Thompson *V = 0.0; 258*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 259*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 260*9b91271bSJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 261*9b91271bSJeremy L Thompson } 262*9b91271bSJeremy L Thompson } 263*9b91271bSJeremy L Thompson __syncthreads(); 264*9b91271bSJeremy L Thompson } 265*9b91271bSJeremy L Thompson 266*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 267*9b91271bSJeremy L Thompson // 3D tensor contract z 268*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 269*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 270*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 271*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 272*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 273*9b91271bSJeremy L Thompson __syncthreads(); 274*9b91271bSJeremy L Thompson *V = 0.0; 275*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 276*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 277*9b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 278*9b91271bSJeremy L Thompson } 279*9b91271bSJeremy L Thompson } 280*9b91271bSJeremy L Thompson __syncthreads(); 281*9b91271bSJeremy L Thompson } 282*9b91271bSJeremy L Thompson 283*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 284*9b91271bSJeremy L Thompson // 3D transpose tensor contract z 285*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 286*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 287*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 288*9b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 289*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 290*9b91271bSJeremy L Thompson __syncthreads(); 291*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 292*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 293*9b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 294*9b91271bSJeremy L Thompson } 295*9b91271bSJeremy L Thompson } 296*9b91271bSJeremy L Thompson __syncthreads(); 297*9b91271bSJeremy L Thompson } 298*9b91271bSJeremy L Thompson 299*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 300*9b91271bSJeremy L Thompson // 3D transpose tensor contract y 301*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 302*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 303*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 304*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 305*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 306*9b91271bSJeremy L Thompson __syncthreads(); 307*9b91271bSJeremy L Thompson *V = 0.0; 308*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 309*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 310*9b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 311*9b91271bSJeremy L Thompson } 312*9b91271bSJeremy L Thompson } 313*9b91271bSJeremy L Thompson __syncthreads(); 314*9b91271bSJeremy L Thompson } 315*9b91271bSJeremy L Thompson 316*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 317*9b91271bSJeremy L Thompson // 3D transpose tensor contract y 318*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 319*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 320*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 321*9b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 322*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 323*9b91271bSJeremy L Thompson __syncthreads(); 324*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 325*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 326*9b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 327*9b91271bSJeremy L Thompson } 328*9b91271bSJeremy L Thompson } 329*9b91271bSJeremy L Thompson __syncthreads(); 330*9b91271bSJeremy L Thompson } 331*9b91271bSJeremy L Thompson 332*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 333*9b91271bSJeremy L Thompson // 3D transpose tensor contract x 334*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 335*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 336*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 337*9b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 338*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 339*9b91271bSJeremy L Thompson __syncthreads(); 340*9b91271bSJeremy L Thompson *V = 0.0; 341*9b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 342*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 343*9b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 344*9b91271bSJeremy L Thompson } 345*9b91271bSJeremy L Thompson } 346*9b91271bSJeremy L Thompson __syncthreads(); 347*9b91271bSJeremy L Thompson } 348*9b91271bSJeremy L Thompson 349*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 350*9b91271bSJeremy L Thompson // 3D transpose tensor contract add x 351*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 352*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 353*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 354*9b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 355*9b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 356*9b91271bSJeremy L Thompson __syncthreads(); 357*9b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 358*9b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 359*9b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 360*9b91271bSJeremy L Thompson } 361*9b91271bSJeremy L Thompson } 362*9b91271bSJeremy L Thompson __syncthreads(); 363*9b91271bSJeremy L Thompson } 364*9b91271bSJeremy L Thompson 365*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 366*9b91271bSJeremy L Thompson // 3D pack/unpack quadrature values 367*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 368*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 369*9b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 370*9b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 371*9b91271bSJeremy L Thompson 372*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 373*9b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 374*9b91271bSJeremy L Thompson __syncthreads(); 375*9b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 376*9b91271bSJeremy L Thompson __syncthreads(); 377*9b91271bSJeremy L Thompson } 378*9b91271bSJeremy L Thompson } 379*9b91271bSJeremy L Thompson 380*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 381*9b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 382*9b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 383*9b91271bSJeremy L Thompson 384*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 385*9b91271bSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 386*9b91271bSJeremy L Thompson __syncthreads(); 387*9b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 388*9b91271bSJeremy L Thompson __syncthreads(); 389*9b91271bSJeremy L Thompson } 390*9b91271bSJeremy L Thompson } 391*9b91271bSJeremy L Thompson 392*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 393*9b91271bSJeremy L Thompson // 3D interpolate to quadrature points 394*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 395*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 396*9b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 397*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 398*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 399*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 400*9b91271bSJeremy L Thompson 401*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 402*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 403*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 404*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 405*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 406*9b91271bSJeremy L Thompson } 407*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 408*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 409*9b91271bSJeremy L Thompson } 410*9b91271bSJeremy L Thompson 411*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 412*9b91271bSJeremy L Thompson // 3D interpolate transpose 413*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 414*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 415*9b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 416*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 417*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 418*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 419*9b91271bSJeremy L Thompson 420*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 421*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 422*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 423*9b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 424*9b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 425*9b91271bSJeremy L Thompson } 426*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 427*9b91271bSJeremy L Thompson } 428*9b91271bSJeremy L Thompson 429*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 430*9b91271bSJeremy L Thompson // 3D derivatives at quadrature points 431*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 432*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 433*9b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 434*9b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 435*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 436*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 437*9b91271bSJeremy L Thompson 438*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 439*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 440*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 441*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 442*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 443*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 444*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 445*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 446*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 447*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 448*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 449*9b91271bSJeremy L Thompson } 450*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 451*9b91271bSJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 452*9b91271bSJeremy L Thompson } 453*9b91271bSJeremy L Thompson 454*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 455*9b91271bSJeremy L Thompson // 3D derivatives transpose 456*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 457*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 458*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 459*9b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 460*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 461*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 462*9b91271bSJeremy L Thompson 463*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 464*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 465*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 466*9b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 467*9b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 468*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 469*9b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 470*9b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 471*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 472*9b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 473*9b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 474*9b91271bSJeremy L Thompson } 475*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 476*9b91271bSJeremy L Thompson } 477*9b91271bSJeremy L Thompson 478*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 479*9b91271bSJeremy L Thompson // 3D derivatives at quadrature points 480*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 481*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 482*9b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 483*9b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 484*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 485*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 486*9b91271bSJeremy L Thompson 487*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 488*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 489*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 490*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 491*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 492*9b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 493*9b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 494*9b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 495*9b91271bSJeremy L Thompson } 496*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 497*9b91271bSJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 498*9b91271bSJeremy L Thompson } 499*9b91271bSJeremy L Thompson 500*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 501*9b91271bSJeremy L Thompson // 3D derivatives transpose 502*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 503*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 504*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 505*9b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 506*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 507*9b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 508*9b91271bSJeremy L Thompson 509*9b91271bSJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 510*9b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 511*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 512*9b91271bSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 513*9b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 514*9b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 515*9b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 516*9b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 517*9b91271bSJeremy L Thompson } 518*9b91271bSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 519*9b91271bSJeremy L Thompson } 520*9b91271bSJeremy L Thompson 521*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 522*9b91271bSJeremy L Thompson // 3D quadrature weights 523*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 524*9b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 525*9b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 526*9b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 527*9b91271bSJeremy L Thompson 528*9b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 529*9b91271bSJeremy L Thompson } 530