1*bd882c8aSJames Wright // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*bd882c8aSJames Wright // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*bd882c8aSJames Wright // 4*bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause 5*bd882c8aSJames Wright // 6*bd882c8aSJames Wright // This file is part of CEED: http://github.com/ceed 7*bd882c8aSJames Wright 8*bd882c8aSJames Wright /// @file 9*bd882c8aSJames Wright /// Internal header for SYCL shared memory tensor product basis templates 10*bd882c8aSJames Wright #ifndef _ceed_sycl_shared_basis_tensor_templates_h 11*bd882c8aSJames Wright #define _ceed_sycl_shared_basis_tensor_templates_h 12*bd882c8aSJames Wright 13*bd882c8aSJames Wright #include <ceed.h> 14*bd882c8aSJames Wright 15*bd882c8aSJames Wright //------------------------------------------------------------------------------ 16*bd882c8aSJames Wright // 1D 17*bd882c8aSJames Wright //------------------------------------------------------------------------------ 18*bd882c8aSJames Wright 19*bd882c8aSJames Wright //------------------------------------------------------------------------------ 20*bd882c8aSJames Wright // 1D tensor contraction x 21*bd882c8aSJames Wright //------------------------------------------------------------------------------ 22*bd882c8aSJames Wright inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 23*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 24*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 25*bd882c8aSJames Wright 26*bd882c8aSJames Wright scratch[item_id_x] = *U; 27*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 28*bd882c8aSJames Wright 29*bd882c8aSJames Wright *V = 0.0; 30*bd882c8aSJames Wright if (item_id_x < Q_1D) { 31*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 32*bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 33*bd882c8aSJames Wright } 34*bd882c8aSJames Wright } 35*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 36*bd882c8aSJames Wright } 37*bd882c8aSJames Wright 38*bd882c8aSJames Wright //------------------------------------------------------------------------------ 39*bd882c8aSJames Wright // 1D transpose tensor contraction x 40*bd882c8aSJames Wright //------------------------------------------------------------------------------ 41*bd882c8aSJames Wright inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 42*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 43*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 44*bd882c8aSJames Wright 45*bd882c8aSJames Wright scratch[item_id_x] = *U; 46*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 47*bd882c8aSJames Wright 48*bd882c8aSJames Wright *V = 0.0; 49*bd882c8aSJames Wright if (item_id_x < P_1D) { 50*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 51*bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 52*bd882c8aSJames Wright } 53*bd882c8aSJames Wright } 54*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 55*bd882c8aSJames Wright } 56*bd882c8aSJames Wright 57*bd882c8aSJames Wright //------------------------------------------------------------------------------ 58*bd882c8aSJames Wright // 1D interpolate to quadrature points 59*bd882c8aSJames Wright //------------------------------------------------------------------------------ 60*bd882c8aSJames Wright inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 61*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 62*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 63*bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 64*bd882c8aSJames Wright } 65*bd882c8aSJames Wright } 66*bd882c8aSJames Wright 67*bd882c8aSJames Wright //------------------------------------------------------------------------------ 68*bd882c8aSJames Wright // 1D interpolate transpose 69*bd882c8aSJames Wright //------------------------------------------------------------------------------ 70*bd882c8aSJames Wright inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 71*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 72*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 73*bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 74*bd882c8aSJames Wright } 75*bd882c8aSJames Wright } 76*bd882c8aSJames Wright 77*bd882c8aSJames Wright //------------------------------------------------------------------------------ 78*bd882c8aSJames Wright // 1D derivatives at quadrature points 79*bd882c8aSJames Wright //------------------------------------------------------------------------------ 80*bd882c8aSJames Wright inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 81*bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 82*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 83*bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 84*bd882c8aSJames Wright } 85*bd882c8aSJames Wright } 86*bd882c8aSJames Wright 87*bd882c8aSJames Wright //------------------------------------------------------------------------------ 88*bd882c8aSJames Wright // 1D derivatives transpose 89*bd882c8aSJames Wright //------------------------------------------------------------------------------ 90*bd882c8aSJames Wright inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 91*bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 92*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 93*bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 94*bd882c8aSJames Wright } 95*bd882c8aSJames Wright } 96*bd882c8aSJames Wright 97*bd882c8aSJames Wright //------------------------------------------------------------------------------ 98*bd882c8aSJames Wright // 1D quadrature weights 99*bd882c8aSJames Wright //------------------------------------------------------------------------------ 100*bd882c8aSJames Wright inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 101*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 102*bd882c8aSJames Wright *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 103*bd882c8aSJames Wright } 104*bd882c8aSJames Wright 105*bd882c8aSJames Wright //------------------------------------------------------------------------------ 106*bd882c8aSJames Wright // 2D 107*bd882c8aSJames Wright //------------------------------------------------------------------------------ 108*bd882c8aSJames Wright 109*bd882c8aSJames Wright //------------------------------------------------------------------------------ 110*bd882c8aSJames Wright // 2D tensor contraction x 111*bd882c8aSJames Wright //------------------------------------------------------------------------------ 112*bd882c8aSJames Wright inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 113*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 114*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 115*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 116*bd882c8aSJames Wright 117*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 118*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 119*bd882c8aSJames Wright 120*bd882c8aSJames Wright *V = 0.0; 121*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 122*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 123*bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 124*bd882c8aSJames Wright } 125*bd882c8aSJames Wright } 126*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 127*bd882c8aSJames Wright } 128*bd882c8aSJames Wright 129*bd882c8aSJames Wright //------------------------------------------------------------------------------ 130*bd882c8aSJames Wright // 2D tensor contract y 131*bd882c8aSJames Wright //------------------------------------------------------------------------------ 132*bd882c8aSJames Wright inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 133*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 134*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 135*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 136*bd882c8aSJames Wright 137*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 138*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 139*bd882c8aSJames Wright 140*bd882c8aSJames Wright *V = 0.0; 141*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 142*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 143*bd882c8aSJames Wright *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 144*bd882c8aSJames Wright } 145*bd882c8aSJames Wright } 146*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 147*bd882c8aSJames Wright } 148*bd882c8aSJames Wright 149*bd882c8aSJames Wright //------------------------------------------------------------------------------ 150*bd882c8aSJames Wright // 2D transpose tensor contract y 151*bd882c8aSJames Wright //------------------------------------------------------------------------------ 152*bd882c8aSJames Wright inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 153*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 154*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 155*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 156*bd882c8aSJames Wright 157*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 158*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 159*bd882c8aSJames Wright 160*bd882c8aSJames Wright *V = 0.0; 161*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 162*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 163*bd882c8aSJames Wright *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 164*bd882c8aSJames Wright } 165*bd882c8aSJames Wright } 166*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 167*bd882c8aSJames Wright } 168*bd882c8aSJames Wright 169*bd882c8aSJames Wright //------------------------------------------------------------------------------ 170*bd882c8aSJames Wright // 2D transpose tensor contract x 171*bd882c8aSJames Wright //------------------------------------------------------------------------------ 172*bd882c8aSJames Wright inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 173*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 174*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 175*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 176*bd882c8aSJames Wright 177*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 178*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 179*bd882c8aSJames Wright 180*bd882c8aSJames Wright *V = 0.0; 181*bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 182*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 183*bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 184*bd882c8aSJames Wright } 185*bd882c8aSJames Wright } 186*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 187*bd882c8aSJames Wright } 188*bd882c8aSJames Wright 189*bd882c8aSJames Wright //------------------------------------------------------------------------------ 190*bd882c8aSJames Wright // 2D transpose tensor contract and add x 191*bd882c8aSJames Wright //------------------------------------------------------------------------------ 192*bd882c8aSJames Wright inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 193*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 194*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 195*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 196*bd882c8aSJames Wright 197*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 198*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 199*bd882c8aSJames Wright 200*bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 201*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 202*bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 203*bd882c8aSJames Wright } 204*bd882c8aSJames Wright } 205*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 206*bd882c8aSJames Wright } 207*bd882c8aSJames Wright 208*bd882c8aSJames Wright //------------------------------------------------------------------------------ 209*bd882c8aSJames Wright // 2D interpolate to quadrature points 210*bd882c8aSJames Wright //------------------------------------------------------------------------------ 211*bd882c8aSJames Wright inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 212*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 213*bd882c8aSJames Wright CeedScalar r_t[1]; 214*bd882c8aSJames Wright 215*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 216*bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 217*bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 218*bd882c8aSJames Wright } 219*bd882c8aSJames Wright } 220*bd882c8aSJames Wright 221*bd882c8aSJames Wright //------------------------------------------------------------------------------ 222*bd882c8aSJames Wright // 2D interpolate transpose 223*bd882c8aSJames Wright //------------------------------------------------------------------------------ 224*bd882c8aSJames Wright inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 225*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 226*bd882c8aSJames Wright CeedScalar r_t[1]; 227*bd882c8aSJames Wright 228*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 229*bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 230*bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 231*bd882c8aSJames Wright } 232*bd882c8aSJames Wright } 233*bd882c8aSJames Wright 234*bd882c8aSJames Wright //------------------------------------------------------------------------------ 235*bd882c8aSJames Wright // 2D derivatives at quadrature points 236*bd882c8aSJames Wright //------------------------------------------------------------------------------ 237*bd882c8aSJames Wright inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 238*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 239*bd882c8aSJames Wright local CeedScalar *restrict scratch) { 240*bd882c8aSJames Wright CeedScalar r_t[1]; 241*bd882c8aSJames Wright 242*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 243*bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 244*bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 245*bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 246*bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 247*bd882c8aSJames Wright } 248*bd882c8aSJames Wright } 249*bd882c8aSJames Wright 250*bd882c8aSJames Wright //------------------------------------------------------------------------------ 251*bd882c8aSJames Wright // 2D derivatives transpose 252*bd882c8aSJames Wright //------------------------------------------------------------------------------ 253*bd882c8aSJames Wright inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 254*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 255*bd882c8aSJames Wright local CeedScalar *restrict scratch) { 256*bd882c8aSJames Wright CeedScalar r_t[1]; 257*bd882c8aSJames Wright 258*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 259*bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 260*bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 261*bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 262*bd882c8aSJames Wright ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 263*bd882c8aSJames Wright } 264*bd882c8aSJames Wright } 265*bd882c8aSJames Wright 266*bd882c8aSJames Wright //------------------------------------------------------------------------------ 267*bd882c8aSJames Wright // 2D quadrature weights 268*bd882c8aSJames Wright //------------------------------------------------------------------------------ 269*bd882c8aSJames Wright inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 270*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 271*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 272*bd882c8aSJames Wright 273*bd882c8aSJames Wright *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 274*bd882c8aSJames Wright } 275*bd882c8aSJames Wright 276*bd882c8aSJames Wright //------------------------------------------------------------------------------ 277*bd882c8aSJames Wright // 3D 278*bd882c8aSJames Wright //------------------------------------------------------------------------------ 279*bd882c8aSJames Wright 280*bd882c8aSJames Wright //------------------------------------------------------------------------------ 281*bd882c8aSJames Wright // 3D tensor contract x 282*bd882c8aSJames Wright //------------------------------------------------------------------------------ 283*bd882c8aSJames Wright inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 284*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 285*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 286*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 287*bd882c8aSJames Wright 288*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 289*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 290*bd882c8aSJames Wright r_B[i] = B[i + item_id_x * P_1D]; 291*bd882c8aSJames Wright } 292*bd882c8aSJames Wright 293*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 294*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 295*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 296*bd882c8aSJames Wright 297*bd882c8aSJames Wright V[k] = 0.0; 298*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 299*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 300*bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 301*bd882c8aSJames Wright } 302*bd882c8aSJames Wright } 303*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 304*bd882c8aSJames Wright } 305*bd882c8aSJames Wright } 306*bd882c8aSJames Wright 307*bd882c8aSJames Wright //------------------------------------------------------------------------------ 308*bd882c8aSJames Wright // 3D tensor contract y 309*bd882c8aSJames Wright //------------------------------------------------------------------------------ 310*bd882c8aSJames Wright inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 311*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 312*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 313*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 314*bd882c8aSJames Wright 315*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 316*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 317*bd882c8aSJames Wright r_B[i] = B[i + item_id_y * P_1D]; 318*bd882c8aSJames Wright } 319*bd882c8aSJames Wright 320*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 321*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 322*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 323*bd882c8aSJames Wright 324*bd882c8aSJames Wright V[k] = 0.0; 325*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 326*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 327*bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 328*bd882c8aSJames Wright } 329*bd882c8aSJames Wright } 330*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 331*bd882c8aSJames Wright } 332*bd882c8aSJames Wright } 333*bd882c8aSJames Wright 334*bd882c8aSJames Wright //------------------------------------------------------------------------------ 335*bd882c8aSJames Wright // 3D tensor contract z 336*bd882c8aSJames Wright //------------------------------------------------------------------------------ 337*bd882c8aSJames Wright inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 338*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 339*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 340*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 341*bd882c8aSJames Wright 342*bd882c8aSJames Wright for (CeedInt k = 0; k < Q_1D; k++) { 343*bd882c8aSJames Wright V[k] = 0.0; 344*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 345*bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 346*bd882c8aSJames Wright V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 347*bd882c8aSJames Wright } 348*bd882c8aSJames Wright } 349*bd882c8aSJames Wright } 350*bd882c8aSJames Wright } 351*bd882c8aSJames Wright 352*bd882c8aSJames Wright //------------------------------------------------------------------------------ 353*bd882c8aSJames Wright // 3D transpose tensor contract z 354*bd882c8aSJames Wright //------------------------------------------------------------------------------ 355*bd882c8aSJames Wright inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 356*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 357*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 358*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 359*bd882c8aSJames Wright 360*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 361*bd882c8aSJames Wright V[k] = 0.0; 362*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 363*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 364*bd882c8aSJames Wright V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 365*bd882c8aSJames Wright } 366*bd882c8aSJames Wright } 367*bd882c8aSJames Wright } 368*bd882c8aSJames Wright } 369*bd882c8aSJames Wright 370*bd882c8aSJames Wright //------------------------------------------------------------------------------ 371*bd882c8aSJames Wright // 3D transpose tensor contract y 372*bd882c8aSJames Wright //------------------------------------------------------------------------------ 373*bd882c8aSJames Wright inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 374*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 375*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 376*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 377*bd882c8aSJames Wright 378*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 379*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 380*bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 381*bd882c8aSJames Wright } 382*bd882c8aSJames Wright 383*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 384*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 385*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 386*bd882c8aSJames Wright 387*bd882c8aSJames Wright V[k] = 0.0; 388*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 389*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 390*bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 391*bd882c8aSJames Wright } 392*bd882c8aSJames Wright } 393*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 394*bd882c8aSJames Wright } 395*bd882c8aSJames Wright } 396*bd882c8aSJames Wright 397*bd882c8aSJames Wright //------------------------------------------------------------------------------ 398*bd882c8aSJames Wright // 3D transpose tensor contract y 399*bd882c8aSJames Wright //------------------------------------------------------------------------------ 400*bd882c8aSJames Wright inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 401*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 402*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 403*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 404*bd882c8aSJames Wright 405*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 406*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 407*bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 408*bd882c8aSJames Wright } 409*bd882c8aSJames Wright 410*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 411*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 412*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 413*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 414*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 415*bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 416*bd882c8aSJames Wright } 417*bd882c8aSJames Wright } 418*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 419*bd882c8aSJames Wright } 420*bd882c8aSJames Wright } 421*bd882c8aSJames Wright 422*bd882c8aSJames Wright //------------------------------------------------------------------------------ 423*bd882c8aSJames Wright // 3D transpose tensor contract x 424*bd882c8aSJames Wright //------------------------------------------------------------------------------ 425*bd882c8aSJames Wright inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 426*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 427*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 428*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 429*bd882c8aSJames Wright 430*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 431*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 432*bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 433*bd882c8aSJames Wright } 434*bd882c8aSJames Wright 435*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 436*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 437*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 438*bd882c8aSJames Wright V[k] = 0.0; 439*bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 440*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 441*bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 442*bd882c8aSJames Wright } 443*bd882c8aSJames Wright } 444*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 445*bd882c8aSJames Wright } 446*bd882c8aSJames Wright } 447*bd882c8aSJames Wright 448*bd882c8aSJames Wright //------------------------------------------------------------------------------ 449*bd882c8aSJames Wright // 3D transpose tensor contract add x 450*bd882c8aSJames Wright //------------------------------------------------------------------------------ 451*bd882c8aSJames Wright inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 452*bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 453*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 454*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 455*bd882c8aSJames Wright 456*bd882c8aSJames Wright CeedScalar r_B[T_1D]; 457*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 458*bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 459*bd882c8aSJames Wright } 460*bd882c8aSJames Wright 461*bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 462*bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 463*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 464*bd882c8aSJames Wright 465*bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 466*bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 467*bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 468*bd882c8aSJames Wright } 469*bd882c8aSJames Wright } 470*bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 471*bd882c8aSJames Wright } 472*bd882c8aSJames Wright } 473*bd882c8aSJames Wright 474*bd882c8aSJames Wright //------------------------------------------------------------------------------ 475*bd882c8aSJames Wright // 3D interpolate to quadrature points 476*bd882c8aSJames Wright //------------------------------------------------------------------------------ 477*bd882c8aSJames Wright inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 478*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 479*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 480*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 481*bd882c8aSJames Wright 482*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 483*bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 484*bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 485*bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 486*bd882c8aSJames Wright } 487*bd882c8aSJames Wright } 488*bd882c8aSJames Wright 489*bd882c8aSJames Wright //------------------------------------------------------------------------------ 490*bd882c8aSJames Wright // 3D interpolate transpose 491*bd882c8aSJames Wright //------------------------------------------------------------------------------ 492*bd882c8aSJames Wright inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 493*bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 494*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 495*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 496*bd882c8aSJames Wright 497*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 498*bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 499*bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 500*bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 501*bd882c8aSJames Wright } 502*bd882c8aSJames Wright } 503*bd882c8aSJames Wright 504*bd882c8aSJames Wright //------------------------------------------------------------------------------ 505*bd882c8aSJames Wright // 3D derivatives at quadrature points 506*bd882c8aSJames Wright //------------------------------------------------------------------------------ 507*bd882c8aSJames Wright inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 508*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 509*bd882c8aSJames Wright local CeedScalar *restrict scratch) { 510*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 511*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 512*bd882c8aSJames Wright 513*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 514*bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 515*bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 516*bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 517*bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 518*bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 519*bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 520*bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 521*bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 522*bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 523*bd882c8aSJames Wright } 524*bd882c8aSJames Wright } 525*bd882c8aSJames Wright 526*bd882c8aSJames Wright //------------------------------------------------------------------------------ 527*bd882c8aSJames Wright // 3D derivatives transpose 528*bd882c8aSJames Wright //------------------------------------------------------------------------------ 529*bd882c8aSJames Wright inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 530*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 531*bd882c8aSJames Wright local CeedScalar *restrict scratch) { 532*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 533*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 534*bd882c8aSJames Wright 535*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 536*bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 537*bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 538*bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 539*bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 540*bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 541*bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 542*bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 543*bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 544*bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 545*bd882c8aSJames Wright } 546*bd882c8aSJames Wright } 547*bd882c8aSJames Wright 548*bd882c8aSJames Wright //------------------------------------------------------------------------------ 549*bd882c8aSJames Wright // 3D derivatives at quadrature points 550*bd882c8aSJames Wright //------------------------------------------------------------------------------ 551*bd882c8aSJames Wright inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 552*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 553*bd882c8aSJames Wright local CeedScalar *restrict scratch) { 554*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 555*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 556*bd882c8aSJames Wright 557*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 558*bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 559*bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 560*bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 561*bd882c8aSJames Wright ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 562*bd882c8aSJames Wright ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 563*bd882c8aSJames Wright ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 564*bd882c8aSJames Wright } 565*bd882c8aSJames Wright } 566*bd882c8aSJames Wright 567*bd882c8aSJames Wright //------------------------------------------------------------------------------ 568*bd882c8aSJames Wright // 3D derivatives transpose 569*bd882c8aSJames Wright //------------------------------------------------------------------------------ 570*bd882c8aSJames Wright inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 571*bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 572*bd882c8aSJames Wright private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 573*bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 574*bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 575*bd882c8aSJames Wright 576*bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 577*bd882c8aSJames Wright ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 578*bd882c8aSJames Wright ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 579*bd882c8aSJames Wright ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 580*bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 581*bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 582*bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 583*bd882c8aSJames Wright } 584*bd882c8aSJames Wright } 585*bd882c8aSJames Wright 586*bd882c8aSJames Wright //------------------------------------------------------------------------------ 587*bd882c8aSJames Wright // 3D quadrature weights 588*bd882c8aSJames Wright //------------------------------------------------------------------------------ 589*bd882c8aSJames Wright // template <int Q_1D> 590*bd882c8aSJames Wright inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 591*bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 592*bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 593*bd882c8aSJames Wright 594*bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 595*bd882c8aSJames Wright const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 596*bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 597*bd882c8aSJames Wright } else { 598*bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 599*bd882c8aSJames Wright } 600*bd882c8aSJames Wright } 601*bd882c8aSJames Wright 602*bd882c8aSJames Wright //------------------------------------------------------------------------------ 603*bd882c8aSJames Wright 604*bd882c8aSJames Wright #endif 605