xref: /libCEED/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h (revision bd882c8a454763a096666645dc9a6229d5263694)
1*bd882c8aSJames Wright // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*bd882c8aSJames Wright // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*bd882c8aSJames Wright //
4*bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause
5*bd882c8aSJames Wright //
6*bd882c8aSJames Wright // This file is part of CEED:  http://github.com/ceed
7*bd882c8aSJames Wright 
8*bd882c8aSJames Wright /// @file
9*bd882c8aSJames Wright /// Internal header for SYCL shared memory tensor product basis templates
10*bd882c8aSJames Wright #ifndef _ceed_sycl_shared_basis_tensor_templates_h
11*bd882c8aSJames Wright #define _ceed_sycl_shared_basis_tensor_templates_h
12*bd882c8aSJames Wright 
13*bd882c8aSJames Wright #include <ceed.h>
14*bd882c8aSJames Wright 
15*bd882c8aSJames Wright //------------------------------------------------------------------------------
16*bd882c8aSJames Wright // 1D
17*bd882c8aSJames Wright //------------------------------------------------------------------------------
18*bd882c8aSJames Wright 
19*bd882c8aSJames Wright //------------------------------------------------------------------------------
20*bd882c8aSJames Wright // 1D tensor contraction x
21*bd882c8aSJames Wright //------------------------------------------------------------------------------
22*bd882c8aSJames Wright inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
23*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
24*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
25*bd882c8aSJames Wright 
26*bd882c8aSJames Wright   scratch[item_id_x] = *U;
27*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
28*bd882c8aSJames Wright 
29*bd882c8aSJames Wright   *V = 0.0;
30*bd882c8aSJames Wright   if (item_id_x < Q_1D) {
31*bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
32*bd882c8aSJames Wright       *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
33*bd882c8aSJames Wright     }
34*bd882c8aSJames Wright   }
35*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
36*bd882c8aSJames Wright }
37*bd882c8aSJames Wright 
38*bd882c8aSJames Wright //------------------------------------------------------------------------------
39*bd882c8aSJames Wright // 1D transpose tensor contraction x
40*bd882c8aSJames Wright //------------------------------------------------------------------------------
41*bd882c8aSJames Wright inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
42*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
43*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
44*bd882c8aSJames Wright 
45*bd882c8aSJames Wright   scratch[item_id_x] = *U;
46*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
47*bd882c8aSJames Wright 
48*bd882c8aSJames Wright   *V = 0.0;
49*bd882c8aSJames Wright   if (item_id_x < P_1D) {
50*bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
51*bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
52*bd882c8aSJames Wright     }
53*bd882c8aSJames Wright   }
54*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
55*bd882c8aSJames Wright }
56*bd882c8aSJames Wright 
57*bd882c8aSJames Wright //------------------------------------------------------------------------------
58*bd882c8aSJames Wright // 1D interpolate to quadrature points
59*bd882c8aSJames Wright //------------------------------------------------------------------------------
60*bd882c8aSJames Wright inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
61*bd882c8aSJames Wright                      local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
62*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
63*bd882c8aSJames Wright     ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
64*bd882c8aSJames Wright   }
65*bd882c8aSJames Wright }
66*bd882c8aSJames Wright 
67*bd882c8aSJames Wright //------------------------------------------------------------------------------
68*bd882c8aSJames Wright // 1D interpolate transpose
69*bd882c8aSJames Wright //------------------------------------------------------------------------------
70*bd882c8aSJames Wright inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
71*bd882c8aSJames Wright                               local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
72*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
73*bd882c8aSJames Wright     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
74*bd882c8aSJames Wright   }
75*bd882c8aSJames Wright }
76*bd882c8aSJames Wright 
77*bd882c8aSJames Wright //------------------------------------------------------------------------------
78*bd882c8aSJames Wright // 1D derivatives at quadrature points
79*bd882c8aSJames Wright //------------------------------------------------------------------------------
80*bd882c8aSJames Wright inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
81*bd882c8aSJames Wright                    local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
82*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
83*bd882c8aSJames Wright     ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
84*bd882c8aSJames Wright   }
85*bd882c8aSJames Wright }
86*bd882c8aSJames Wright 
87*bd882c8aSJames Wright //------------------------------------------------------------------------------
88*bd882c8aSJames Wright // 1D derivatives transpose
89*bd882c8aSJames Wright //------------------------------------------------------------------------------
90*bd882c8aSJames Wright inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
91*bd882c8aSJames Wright                             local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
92*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
93*bd882c8aSJames Wright     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
94*bd882c8aSJames Wright   }
95*bd882c8aSJames Wright }
96*bd882c8aSJames Wright 
97*bd882c8aSJames Wright //------------------------------------------------------------------------------
98*bd882c8aSJames Wright // 1D quadrature weights
99*bd882c8aSJames Wright //------------------------------------------------------------------------------
100*bd882c8aSJames Wright inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
101*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
102*bd882c8aSJames Wright   *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
103*bd882c8aSJames Wright }
104*bd882c8aSJames Wright 
105*bd882c8aSJames Wright //------------------------------------------------------------------------------
106*bd882c8aSJames Wright // 2D
107*bd882c8aSJames Wright //------------------------------------------------------------------------------
108*bd882c8aSJames Wright 
109*bd882c8aSJames Wright //------------------------------------------------------------------------------
110*bd882c8aSJames Wright // 2D tensor contraction x
111*bd882c8aSJames Wright //------------------------------------------------------------------------------
112*bd882c8aSJames Wright inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
113*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
114*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
115*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
116*bd882c8aSJames Wright 
117*bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
118*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
119*bd882c8aSJames Wright 
120*bd882c8aSJames Wright   *V = 0.0;
121*bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < P_1D) {
122*bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
123*bd882c8aSJames Wright       *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
124*bd882c8aSJames Wright     }
125*bd882c8aSJames Wright   }
126*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
127*bd882c8aSJames Wright }
128*bd882c8aSJames Wright 
129*bd882c8aSJames Wright //------------------------------------------------------------------------------
130*bd882c8aSJames Wright // 2D tensor contract y
131*bd882c8aSJames Wright //------------------------------------------------------------------------------
132*bd882c8aSJames Wright inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
133*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
134*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
135*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
136*bd882c8aSJames Wright 
137*bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
138*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
139*bd882c8aSJames Wright 
140*bd882c8aSJames Wright   *V = 0.0;
141*bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < Q_1D) {
142*bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
143*bd882c8aSJames Wright       *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
144*bd882c8aSJames Wright     }
145*bd882c8aSJames Wright   }
146*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
147*bd882c8aSJames Wright }
148*bd882c8aSJames Wright 
149*bd882c8aSJames Wright //------------------------------------------------------------------------------
150*bd882c8aSJames Wright // 2D transpose tensor contract y
151*bd882c8aSJames Wright //------------------------------------------------------------------------------
152*bd882c8aSJames Wright inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
153*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
154*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
155*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
156*bd882c8aSJames Wright 
157*bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
158*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
159*bd882c8aSJames Wright 
160*bd882c8aSJames Wright   *V = 0.0;
161*bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < P_1D) {
162*bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
163*bd882c8aSJames Wright       *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
164*bd882c8aSJames Wright     }
165*bd882c8aSJames Wright   }
166*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
167*bd882c8aSJames Wright }
168*bd882c8aSJames Wright 
169*bd882c8aSJames Wright //------------------------------------------------------------------------------
170*bd882c8aSJames Wright // 2D transpose tensor contract x
171*bd882c8aSJames Wright //------------------------------------------------------------------------------
172*bd882c8aSJames Wright inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
173*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
174*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
175*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
176*bd882c8aSJames Wright 
177*bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
178*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
179*bd882c8aSJames Wright 
180*bd882c8aSJames Wright   *V = 0.0;
181*bd882c8aSJames Wright   if (item_id_x < P_1D && item_id_y < P_1D) {
182*bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
183*bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
184*bd882c8aSJames Wright     }
185*bd882c8aSJames Wright   }
186*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
187*bd882c8aSJames Wright }
188*bd882c8aSJames Wright 
189*bd882c8aSJames Wright //------------------------------------------------------------------------------
190*bd882c8aSJames Wright // 2D transpose tensor contract and add x
191*bd882c8aSJames Wright //------------------------------------------------------------------------------
192*bd882c8aSJames Wright inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
193*bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
194*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
195*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
196*bd882c8aSJames Wright 
197*bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
198*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
199*bd882c8aSJames Wright 
200*bd882c8aSJames Wright   if (item_id_x < P_1D && item_id_y < P_1D) {
201*bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
202*bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
203*bd882c8aSJames Wright     }
204*bd882c8aSJames Wright   }
205*bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
206*bd882c8aSJames Wright }
207*bd882c8aSJames Wright 
208*bd882c8aSJames Wright //------------------------------------------------------------------------------
209*bd882c8aSJames Wright // 2D interpolate to quadrature points
210*bd882c8aSJames Wright //------------------------------------------------------------------------------
211*bd882c8aSJames Wright inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
212*bd882c8aSJames Wright                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
213*bd882c8aSJames Wright   CeedScalar r_t[1];
214*bd882c8aSJames Wright 
215*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
216*bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
217*bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
218*bd882c8aSJames Wright   }
219*bd882c8aSJames Wright }
220*bd882c8aSJames Wright 
221*bd882c8aSJames Wright //------------------------------------------------------------------------------
222*bd882c8aSJames Wright // 2D interpolate transpose
223*bd882c8aSJames Wright //------------------------------------------------------------------------------
224*bd882c8aSJames Wright inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
225*bd882c8aSJames Wright                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
226*bd882c8aSJames Wright   CeedScalar r_t[1];
227*bd882c8aSJames Wright 
228*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
229*bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
230*bd882c8aSJames Wright     ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
231*bd882c8aSJames Wright   }
232*bd882c8aSJames Wright }
233*bd882c8aSJames Wright 
234*bd882c8aSJames Wright //------------------------------------------------------------------------------
235*bd882c8aSJames Wright // 2D derivatives at quadrature points
236*bd882c8aSJames Wright //------------------------------------------------------------------------------
237*bd882c8aSJames Wright inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
238*bd882c8aSJames Wright                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
239*bd882c8aSJames Wright                          local CeedScalar *restrict scratch) {
240*bd882c8aSJames Wright   CeedScalar r_t[1];
241*bd882c8aSJames Wright 
242*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
243*bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
244*bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
245*bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
246*bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
247*bd882c8aSJames Wright   }
248*bd882c8aSJames Wright }
249*bd882c8aSJames Wright 
250*bd882c8aSJames Wright //------------------------------------------------------------------------------
251*bd882c8aSJames Wright // 2D derivatives transpose
252*bd882c8aSJames Wright //------------------------------------------------------------------------------
253*bd882c8aSJames Wright inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
254*bd882c8aSJames Wright                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
255*bd882c8aSJames Wright                                   local CeedScalar *restrict scratch) {
256*bd882c8aSJames Wright   CeedScalar r_t[1];
257*bd882c8aSJames Wright 
258*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
259*bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
260*bd882c8aSJames Wright     ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
261*bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
262*bd882c8aSJames Wright     ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
263*bd882c8aSJames Wright   }
264*bd882c8aSJames Wright }
265*bd882c8aSJames Wright 
266*bd882c8aSJames Wright //------------------------------------------------------------------------------
267*bd882c8aSJames Wright // 2D quadrature weights
268*bd882c8aSJames Wright //------------------------------------------------------------------------------
269*bd882c8aSJames Wright inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
270*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
271*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
272*bd882c8aSJames Wright 
273*bd882c8aSJames Wright   *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
274*bd882c8aSJames Wright }
275*bd882c8aSJames Wright 
276*bd882c8aSJames Wright //------------------------------------------------------------------------------
277*bd882c8aSJames Wright // 3D
278*bd882c8aSJames Wright //------------------------------------------------------------------------------
279*bd882c8aSJames Wright 
280*bd882c8aSJames Wright //------------------------------------------------------------------------------
281*bd882c8aSJames Wright // 3D tensor contract x
282*bd882c8aSJames Wright //------------------------------------------------------------------------------
283*bd882c8aSJames Wright inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
284*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
285*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
286*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
287*bd882c8aSJames Wright 
288*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
289*bd882c8aSJames Wright   for (CeedInt i = 0; i < P_1D; i++) {
290*bd882c8aSJames Wright     r_B[i] = B[i + item_id_x * P_1D];
291*bd882c8aSJames Wright   }
292*bd882c8aSJames Wright 
293*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
294*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
295*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
296*bd882c8aSJames Wright 
297*bd882c8aSJames Wright     V[k] = 0.0;
298*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
299*bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
300*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
301*bd882c8aSJames Wright       }
302*bd882c8aSJames Wright     }
303*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
304*bd882c8aSJames Wright   }
305*bd882c8aSJames Wright }
306*bd882c8aSJames Wright 
307*bd882c8aSJames Wright //------------------------------------------------------------------------------
308*bd882c8aSJames Wright // 3D tensor contract y
309*bd882c8aSJames Wright //------------------------------------------------------------------------------
310*bd882c8aSJames Wright inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
311*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
312*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
313*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
314*bd882c8aSJames Wright 
315*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
316*bd882c8aSJames Wright   for (CeedInt i = 0; i < P_1D; i++) {
317*bd882c8aSJames Wright     r_B[i] = B[i + item_id_y * P_1D];
318*bd882c8aSJames Wright   }
319*bd882c8aSJames Wright 
320*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
321*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
322*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
323*bd882c8aSJames Wright 
324*bd882c8aSJames Wright     V[k] = 0.0;
325*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
326*bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
327*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
328*bd882c8aSJames Wright       }
329*bd882c8aSJames Wright     }
330*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
331*bd882c8aSJames Wright   }
332*bd882c8aSJames Wright }
333*bd882c8aSJames Wright 
334*bd882c8aSJames Wright //------------------------------------------------------------------------------
335*bd882c8aSJames Wright // 3D tensor contract z
336*bd882c8aSJames Wright //------------------------------------------------------------------------------
337*bd882c8aSJames Wright inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
338*bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
339*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
340*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
341*bd882c8aSJames Wright 
342*bd882c8aSJames Wright   for (CeedInt k = 0; k < Q_1D; k++) {
343*bd882c8aSJames Wright     V[k] = 0.0;
344*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
345*bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
346*bd882c8aSJames Wright         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
347*bd882c8aSJames Wright       }
348*bd882c8aSJames Wright     }
349*bd882c8aSJames Wright   }
350*bd882c8aSJames Wright }
351*bd882c8aSJames Wright 
352*bd882c8aSJames Wright //------------------------------------------------------------------------------
353*bd882c8aSJames Wright // 3D transpose tensor contract z
354*bd882c8aSJames Wright //------------------------------------------------------------------------------
355*bd882c8aSJames Wright inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
356*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
357*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
358*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
359*bd882c8aSJames Wright 
360*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
361*bd882c8aSJames Wright     V[k] = 0.0;
362*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
363*bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
364*bd882c8aSJames Wright         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
365*bd882c8aSJames Wright       }
366*bd882c8aSJames Wright     }
367*bd882c8aSJames Wright   }
368*bd882c8aSJames Wright }
369*bd882c8aSJames Wright 
370*bd882c8aSJames Wright //------------------------------------------------------------------------------
371*bd882c8aSJames Wright // 3D transpose tensor contract y
372*bd882c8aSJames Wright //------------------------------------------------------------------------------
373*bd882c8aSJames Wright inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
374*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
375*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
376*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
377*bd882c8aSJames Wright 
378*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
379*bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
380*bd882c8aSJames Wright     r_B[i] = B[item_id_y + i * P_1D];
381*bd882c8aSJames Wright   }
382*bd882c8aSJames Wright 
383*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
384*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
385*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
386*bd882c8aSJames Wright 
387*bd882c8aSJames Wright     V[k] = 0.0;
388*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
389*bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
390*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
391*bd882c8aSJames Wright       }
392*bd882c8aSJames Wright     }
393*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
394*bd882c8aSJames Wright   }
395*bd882c8aSJames Wright }
396*bd882c8aSJames Wright 
397*bd882c8aSJames Wright //------------------------------------------------------------------------------
398*bd882c8aSJames Wright // 3D transpose tensor contract y
399*bd882c8aSJames Wright //------------------------------------------------------------------------------
400*bd882c8aSJames Wright inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
401*bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
402*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
403*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
404*bd882c8aSJames Wright 
405*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
406*bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
407*bd882c8aSJames Wright     r_B[i] = B[item_id_y + i * P_1D];
408*bd882c8aSJames Wright   }
409*bd882c8aSJames Wright 
410*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
411*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
412*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
413*bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
414*bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
415*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
416*bd882c8aSJames Wright       }
417*bd882c8aSJames Wright     }
418*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
419*bd882c8aSJames Wright   }
420*bd882c8aSJames Wright }
421*bd882c8aSJames Wright 
422*bd882c8aSJames Wright //------------------------------------------------------------------------------
423*bd882c8aSJames Wright // 3D transpose tensor contract x
424*bd882c8aSJames Wright //------------------------------------------------------------------------------
425*bd882c8aSJames Wright inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
426*bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
427*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
428*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
429*bd882c8aSJames Wright 
430*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
431*bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
432*bd882c8aSJames Wright     r_B[i] = B[item_id_x + i * P_1D];
433*bd882c8aSJames Wright   }
434*bd882c8aSJames Wright 
435*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
436*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
437*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
438*bd882c8aSJames Wright     V[k] = 0.0;
439*bd882c8aSJames Wright     if (item_id_x < P_1D && item_id_y < P_1D) {
440*bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
441*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
442*bd882c8aSJames Wright       }
443*bd882c8aSJames Wright     }
444*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
445*bd882c8aSJames Wright   }
446*bd882c8aSJames Wright }
447*bd882c8aSJames Wright 
448*bd882c8aSJames Wright //------------------------------------------------------------------------------
449*bd882c8aSJames Wright // 3D transpose tensor contract add x
450*bd882c8aSJames Wright //------------------------------------------------------------------------------
451*bd882c8aSJames Wright inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
452*bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
453*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
454*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
455*bd882c8aSJames Wright 
456*bd882c8aSJames Wright   CeedScalar r_B[T_1D];
457*bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
458*bd882c8aSJames Wright     r_B[i] = B[item_id_x + i * P_1D];
459*bd882c8aSJames Wright   }
460*bd882c8aSJames Wright 
461*bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
462*bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
463*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
464*bd882c8aSJames Wright 
465*bd882c8aSJames Wright     if (item_id_x < P_1D && item_id_y < P_1D) {
466*bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
467*bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
468*bd882c8aSJames Wright       }
469*bd882c8aSJames Wright     }
470*bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
471*bd882c8aSJames Wright   }
472*bd882c8aSJames Wright }
473*bd882c8aSJames Wright 
474*bd882c8aSJames Wright //------------------------------------------------------------------------------
475*bd882c8aSJames Wright // 3D interpolate to quadrature points
476*bd882c8aSJames Wright //------------------------------------------------------------------------------
477*bd882c8aSJames Wright inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
478*bd882c8aSJames Wright                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
479*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
480*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
481*bd882c8aSJames Wright 
482*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
483*bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
484*bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
485*bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
486*bd882c8aSJames Wright   }
487*bd882c8aSJames Wright }
488*bd882c8aSJames Wright 
489*bd882c8aSJames Wright //------------------------------------------------------------------------------
490*bd882c8aSJames Wright // 3D interpolate transpose
491*bd882c8aSJames Wright //------------------------------------------------------------------------------
492*bd882c8aSJames Wright inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
493*bd882c8aSJames Wright                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
494*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
495*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
496*bd882c8aSJames Wright 
497*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
498*bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
499*bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
500*bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
501*bd882c8aSJames Wright   }
502*bd882c8aSJames Wright }
503*bd882c8aSJames Wright 
504*bd882c8aSJames Wright //------------------------------------------------------------------------------
505*bd882c8aSJames Wright // 3D derivatives at quadrature points
506*bd882c8aSJames Wright //------------------------------------------------------------------------------
507*bd882c8aSJames Wright inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
508*bd882c8aSJames Wright                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
509*bd882c8aSJames Wright                          local CeedScalar *restrict scratch) {
510*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
511*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
512*bd882c8aSJames Wright 
513*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
514*bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
515*bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
516*bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
517*bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
518*bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
519*bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
520*bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
521*bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
522*bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
523*bd882c8aSJames Wright   }
524*bd882c8aSJames Wright }
525*bd882c8aSJames Wright 
526*bd882c8aSJames Wright //------------------------------------------------------------------------------
527*bd882c8aSJames Wright // 3D derivatives transpose
528*bd882c8aSJames Wright //------------------------------------------------------------------------------
529*bd882c8aSJames Wright inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
530*bd882c8aSJames Wright                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
531*bd882c8aSJames Wright                                   local CeedScalar *restrict scratch) {
532*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
533*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
534*bd882c8aSJames Wright 
535*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
536*bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
537*bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
538*bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
539*bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
540*bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
541*bd882c8aSJames Wright     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
542*bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
543*bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
544*bd882c8aSJames Wright     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
545*bd882c8aSJames Wright   }
546*bd882c8aSJames Wright }
547*bd882c8aSJames Wright 
548*bd882c8aSJames Wright //------------------------------------------------------------------------------
549*bd882c8aSJames Wright // 3D derivatives at quadrature points
550*bd882c8aSJames Wright //------------------------------------------------------------------------------
551*bd882c8aSJames Wright inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
552*bd882c8aSJames Wright                                    local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
553*bd882c8aSJames Wright                                    local CeedScalar *restrict scratch) {
554*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
555*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
556*bd882c8aSJames Wright 
557*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
558*bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
559*bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
560*bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
561*bd882c8aSJames Wright     ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
562*bd882c8aSJames Wright     ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
563*bd882c8aSJames Wright     ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
564*bd882c8aSJames Wright   }
565*bd882c8aSJames Wright }
566*bd882c8aSJames Wright 
567*bd882c8aSJames Wright //------------------------------------------------------------------------------
568*bd882c8aSJames Wright // 3D derivatives transpose
569*bd882c8aSJames Wright //------------------------------------------------------------------------------
570*bd882c8aSJames Wright inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
571*bd882c8aSJames Wright                                             local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
572*bd882c8aSJames Wright                                             private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
573*bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
574*bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
575*bd882c8aSJames Wright 
576*bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
577*bd882c8aSJames Wright     ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
578*bd882c8aSJames Wright     ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
579*bd882c8aSJames Wright     ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
580*bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
581*bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
582*bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
583*bd882c8aSJames Wright   }
584*bd882c8aSJames Wright }
585*bd882c8aSJames Wright 
586*bd882c8aSJames Wright //------------------------------------------------------------------------------
587*bd882c8aSJames Wright // 3D quadrature weights
588*bd882c8aSJames Wright //------------------------------------------------------------------------------
589*bd882c8aSJames Wright // template <int Q_1D>
590*bd882c8aSJames Wright inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
591*bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
592*bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
593*bd882c8aSJames Wright 
594*bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < Q_1D) {
595*bd882c8aSJames Wright     const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
596*bd882c8aSJames Wright     for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
597*bd882c8aSJames Wright   } else {
598*bd882c8aSJames Wright     for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
599*bd882c8aSJames Wright   }
600*bd882c8aSJames Wright }
601*bd882c8aSJames Wright 
602*bd882c8aSJames Wright //------------------------------------------------------------------------------
603*bd882c8aSJames Wright 
604*bd882c8aSJames Wright #endif
605