xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 9e201c85545dd39529c090846df629a32c15659b)
1*9e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e201c85SYohann //
4*9e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
5*9e201c85SYohann //
6*9e201c85SYohann // This file is part of CEED:  http://github.com/ceed
7*9e201c85SYohann 
8*9e201c85SYohann /// @file
9*9e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates
10*9e201c85SYohann #ifndef _ceed_cuda_shared_basis_tensor_templates_h
11*9e201c85SYohann #define _ceed_cuda_shared_basis_tensor_templates_h
12*9e201c85SYohann 
13*9e201c85SYohann #include <ceed.h>
14*9e201c85SYohann 
15*9e201c85SYohann //------------------------------------------------------------------------------
16*9e201c85SYohann // 1D
17*9e201c85SYohann //------------------------------------------------------------------------------
18*9e201c85SYohann 
19*9e201c85SYohann //------------------------------------------------------------------------------
20*9e201c85SYohann // 1D tensor contraction x
21*9e201c85SYohann //------------------------------------------------------------------------------
22*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
23*9e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
24*9e201c85SYohann   data.slice[data.t_id_x] = *U;
25*9e201c85SYohann   __syncthreads();
26*9e201c85SYohann   *V = 0.0;
27*9e201c85SYohann   if (data.t_id_x < Q_1D) {
28*9e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
29*9e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction
30*9e201c85SYohann     }
31*9e201c85SYohann   }
32*9e201c85SYohann   __syncthreads();
33*9e201c85SYohann }
34*9e201c85SYohann 
35*9e201c85SYohann //------------------------------------------------------------------------------
36*9e201c85SYohann // 1D transpose tensor contraction x
37*9e201c85SYohann //------------------------------------------------------------------------------
38*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
39*9e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
40*9e201c85SYohann   data.slice[data.t_id_x] = *U;
41*9e201c85SYohann   __syncthreads();
42*9e201c85SYohann   *V = 0.0;
43*9e201c85SYohann   if (data.t_id_x < P_1D) {
44*9e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
45*9e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction
46*9e201c85SYohann     }
47*9e201c85SYohann   }
48*9e201c85SYohann   __syncthreads();
49*9e201c85SYohann }
50*9e201c85SYohann 
51*9e201c85SYohann //------------------------------------------------------------------------------
52*9e201c85SYohann // 1D interpolate to quadrature points
53*9e201c85SYohann //------------------------------------------------------------------------------
54*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
55*9e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
56*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
57*9e201c85SYohann     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
58*9e201c85SYohann   }
59*9e201c85SYohann }
60*9e201c85SYohann 
61*9e201c85SYohann //------------------------------------------------------------------------------
62*9e201c85SYohann // 1D interpolate transpose
63*9e201c85SYohann //------------------------------------------------------------------------------
64*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
65*9e201c85SYohann inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
66*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
67*9e201c85SYohann     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
68*9e201c85SYohann   }
69*9e201c85SYohann }
70*9e201c85SYohann 
71*9e201c85SYohann //------------------------------------------------------------------------------
72*9e201c85SYohann // 1D derivatives at quadrature points
73*9e201c85SYohann //------------------------------------------------------------------------------
74*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
75*9e201c85SYohann inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
76*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
77*9e201c85SYohann     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
78*9e201c85SYohann   }
79*9e201c85SYohann }
80*9e201c85SYohann 
81*9e201c85SYohann //------------------------------------------------------------------------------
82*9e201c85SYohann // 1D derivatives transpose
83*9e201c85SYohann //------------------------------------------------------------------------------
84*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
85*9e201c85SYohann inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
86*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87*9e201c85SYohann     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
88*9e201c85SYohann   }
89*9e201c85SYohann }
90*9e201c85SYohann 
91*9e201c85SYohann //------------------------------------------------------------------------------
92*9e201c85SYohann // 1D quadrature weights
93*9e201c85SYohann //------------------------------------------------------------------------------
94*9e201c85SYohann template <int Q_1D>
95*9e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96*9e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97*9e201c85SYohann }
98*9e201c85SYohann 
99*9e201c85SYohann //------------------------------------------------------------------------------
100*9e201c85SYohann // 2D
101*9e201c85SYohann //------------------------------------------------------------------------------
102*9e201c85SYohann 
103*9e201c85SYohann //------------------------------------------------------------------------------
104*9e201c85SYohann // 2D tensor contraction x
105*9e201c85SYohann //------------------------------------------------------------------------------
106*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
107*9e201c85SYohann inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108*9e201c85SYohann   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
109*9e201c85SYohann   __syncthreads();
110*9e201c85SYohann   *V = 0.0;
111*9e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
112*9e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
113*9e201c85SYohann       *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
114*9e201c85SYohann     }
115*9e201c85SYohann   }
116*9e201c85SYohann   __syncthreads();
117*9e201c85SYohann }
118*9e201c85SYohann 
119*9e201c85SYohann //------------------------------------------------------------------------------
120*9e201c85SYohann // 2D tensor contract y
121*9e201c85SYohann //------------------------------------------------------------------------------
122*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
123*9e201c85SYohann inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124*9e201c85SYohann   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
125*9e201c85SYohann   __syncthreads();
126*9e201c85SYohann   *V = 0.0;
127*9e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
128*9e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
129*9e201c85SYohann       *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
130*9e201c85SYohann     }
131*9e201c85SYohann   }
132*9e201c85SYohann   __syncthreads();
133*9e201c85SYohann }
134*9e201c85SYohann 
135*9e201c85SYohann //------------------------------------------------------------------------------
136*9e201c85SYohann // 2D transpose tensor contract y
137*9e201c85SYohann //------------------------------------------------------------------------------
138*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
139*9e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140*9e201c85SYohann   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
141*9e201c85SYohann   __syncthreads();
142*9e201c85SYohann   *V = 0.0;
143*9e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
144*9e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
145*9e201c85SYohann       *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
146*9e201c85SYohann     }
147*9e201c85SYohann   }
148*9e201c85SYohann   __syncthreads();
149*9e201c85SYohann }
150*9e201c85SYohann 
151*9e201c85SYohann //------------------------------------------------------------------------------
152*9e201c85SYohann // 2D transpose tensor contract x
153*9e201c85SYohann //------------------------------------------------------------------------------
154*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
155*9e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156*9e201c85SYohann   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
157*9e201c85SYohann   __syncthreads();
158*9e201c85SYohann   *V = 0.0;
159*9e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
160*9e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
161*9e201c85SYohann       *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
162*9e201c85SYohann     }
163*9e201c85SYohann   }
164*9e201c85SYohann   __syncthreads();
165*9e201c85SYohann }
166*9e201c85SYohann 
167*9e201c85SYohann //------------------------------------------------------------------------------
168*9e201c85SYohann // 2D transpose tensor contract and add x
169*9e201c85SYohann //------------------------------------------------------------------------------
170*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
171*9e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172*9e201c85SYohann   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
173*9e201c85SYohann   __syncthreads();
174*9e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
175*9e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
176*9e201c85SYohann       *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
177*9e201c85SYohann     }
178*9e201c85SYohann   }
179*9e201c85SYohann   __syncthreads();
180*9e201c85SYohann }
181*9e201c85SYohann 
182*9e201c85SYohann //------------------------------------------------------------------------------
183*9e201c85SYohann // 2D interpolate to quadrature points
184*9e201c85SYohann //------------------------------------------------------------------------------
185*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
186*9e201c85SYohann inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
187*9e201c85SYohann   CeedScalar r_t[1];
188*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
189*9e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
190*9e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
191*9e201c85SYohann   }
192*9e201c85SYohann }
193*9e201c85SYohann 
194*9e201c85SYohann //------------------------------------------------------------------------------
195*9e201c85SYohann // 2D interpolate transpose
196*9e201c85SYohann //------------------------------------------------------------------------------
197*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
198*9e201c85SYohann inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
199*9e201c85SYohann   CeedScalar r_t[1];
200*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
201*9e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
202*9e201c85SYohann     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
203*9e201c85SYohann   }
204*9e201c85SYohann }
205*9e201c85SYohann 
206*9e201c85SYohann //------------------------------------------------------------------------------
207*9e201c85SYohann // 2D derivatives at quadrature points
208*9e201c85SYohann //------------------------------------------------------------------------------
209*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
210*9e201c85SYohann inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
211*9e201c85SYohann   CeedScalar r_t[1];
212*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
213*9e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
214*9e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0*NUM_COMP);
215*9e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
216*9e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1*NUM_COMP);
217*9e201c85SYohann   }
218*9e201c85SYohann }
219*9e201c85SYohann 
220*9e201c85SYohann //------------------------------------------------------------------------------
221*9e201c85SYohann // 2D derivatives transpose
222*9e201c85SYohann //------------------------------------------------------------------------------
223*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
224*9e201c85SYohann inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
225*9e201c85SYohann   CeedScalar r_t[1];
226*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
227*9e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0*NUM_COMP, c_B, r_t);
228*9e201c85SYohann     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
229*9e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1*NUM_COMP, c_G, r_t);
230*9e201c85SYohann     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
231*9e201c85SYohann   }
232*9e201c85SYohann }
233*9e201c85SYohann 
234*9e201c85SYohann //------------------------------------------------------------------------------
235*9e201c85SYohann // 2D quadrature weights
236*9e201c85SYohann //------------------------------------------------------------------------------
237*9e201c85SYohann template <int Q_1D>
238*9e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
239*9e201c85SYohann   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ?
240*9e201c85SYohann         q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0;
241*9e201c85SYohann }
242*9e201c85SYohann 
243*9e201c85SYohann //------------------------------------------------------------------------------
244*9e201c85SYohann // 3D
245*9e201c85SYohann //------------------------------------------------------------------------------
246*9e201c85SYohann 
247*9e201c85SYohann //------------------------------------------------------------------------------
248*9e201c85SYohann // 3D tensor contract x
249*9e201c85SYohann //------------------------------------------------------------------------------
250*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
251*9e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
252*9e201c85SYohann   CeedScalar r_B[P_1D];
253*9e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
254*9e201c85SYohann     r_B[i] = B[i + data.t_id_x*P_1D];
255*9e201c85SYohann   }
256*9e201c85SYohann 
257*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
258*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
259*9e201c85SYohann     __syncthreads();
260*9e201c85SYohann     V[k] = 0.0;
261*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
262*9e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
263*9e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
264*9e201c85SYohann       }
265*9e201c85SYohann     }
266*9e201c85SYohann     __syncthreads();
267*9e201c85SYohann   }
268*9e201c85SYohann }
269*9e201c85SYohann 
270*9e201c85SYohann //------------------------------------------------------------------------------
271*9e201c85SYohann // 3D tensor contract y
272*9e201c85SYohann //------------------------------------------------------------------------------
273*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
274*9e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
275*9e201c85SYohann   CeedScalar r_B[P_1D];
276*9e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
277*9e201c85SYohann     r_B[i] = B[i + data.t_id_y*P_1D];
278*9e201c85SYohann   }
279*9e201c85SYohann 
280*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
281*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
282*9e201c85SYohann     __syncthreads();
283*9e201c85SYohann     V[k] = 0.0;
284*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
285*9e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
286*9e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
287*9e201c85SYohann       }
288*9e201c85SYohann     }
289*9e201c85SYohann     __syncthreads();
290*9e201c85SYohann   }
291*9e201c85SYohann }
292*9e201c85SYohann 
293*9e201c85SYohann //------------------------------------------------------------------------------
294*9e201c85SYohann // 3D tensor contract z
295*9e201c85SYohann //------------------------------------------------------------------------------
296*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
297*9e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
298*9e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
299*9e201c85SYohann     V[k] = 0.0;
300*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
301*9e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
302*9e201c85SYohann         V[k] += B[i + k*P_1D] * U[i]; // Contract z direction
303*9e201c85SYohann       }
304*9e201c85SYohann     }
305*9e201c85SYohann   }
306*9e201c85SYohann }
307*9e201c85SYohann 
308*9e201c85SYohann //------------------------------------------------------------------------------
309*9e201c85SYohann // 3D transpose tensor contract z
310*9e201c85SYohann //------------------------------------------------------------------------------
311*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
312*9e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
313*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
314*9e201c85SYohann     V[k] = 0.0;
315*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
316*9e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
317*9e201c85SYohann         V[k] += B[k + i*P_1D] * U[i]; // Contract z direction
318*9e201c85SYohann       }
319*9e201c85SYohann     }
320*9e201c85SYohann   }
321*9e201c85SYohann }
322*9e201c85SYohann 
323*9e201c85SYohann //------------------------------------------------------------------------------
324*9e201c85SYohann // 3D transpose tensor contract y
325*9e201c85SYohann //------------------------------------------------------------------------------
326*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
327*9e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
328*9e201c85SYohann   CeedScalar r_B[Q_1D];
329*9e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
330*9e201c85SYohann     r_B[i] = B[data.t_id_y + i*P_1D];
331*9e201c85SYohann   }
332*9e201c85SYohann 
333*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
334*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
335*9e201c85SYohann     __syncthreads();
336*9e201c85SYohann     V[k] = 0.0;
337*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
338*9e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
339*9e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
340*9e201c85SYohann       }
341*9e201c85SYohann     }
342*9e201c85SYohann     __syncthreads();
343*9e201c85SYohann   }
344*9e201c85SYohann }
345*9e201c85SYohann 
346*9e201c85SYohann //------------------------------------------------------------------------------
347*9e201c85SYohann // 3D transpose tensor contract y
348*9e201c85SYohann //------------------------------------------------------------------------------
349*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
350*9e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
351*9e201c85SYohann   CeedScalar r_B[Q_1D];
352*9e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
353*9e201c85SYohann     r_B[i] = B[data.t_id_y + i*P_1D];
354*9e201c85SYohann   }
355*9e201c85SYohann 
356*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
357*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
358*9e201c85SYohann     __syncthreads();
359*9e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
360*9e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
361*9e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
362*9e201c85SYohann       }
363*9e201c85SYohann     }
364*9e201c85SYohann     __syncthreads();
365*9e201c85SYohann   }
366*9e201c85SYohann }
367*9e201c85SYohann 
368*9e201c85SYohann //------------------------------------------------------------------------------
369*9e201c85SYohann // 3D transpose tensor contract x
370*9e201c85SYohann //------------------------------------------------------------------------------
371*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
372*9e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
373*9e201c85SYohann   CeedScalar r_B[Q_1D];
374*9e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
375*9e201c85SYohann     r_B[i] = B[data.t_id_x + i*P_1D];
376*9e201c85SYohann   }
377*9e201c85SYohann 
378*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
379*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
380*9e201c85SYohann     __syncthreads();
381*9e201c85SYohann     V[k] = 0.0;
382*9e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
383*9e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
384*9e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
385*9e201c85SYohann       }
386*9e201c85SYohann     }
387*9e201c85SYohann     __syncthreads();
388*9e201c85SYohann   }
389*9e201c85SYohann }
390*9e201c85SYohann 
391*9e201c85SYohann //------------------------------------------------------------------------------
392*9e201c85SYohann // 3D transpose tensor contract add x
393*9e201c85SYohann //------------------------------------------------------------------------------
394*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
395*9e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
396*9e201c85SYohann   CeedScalar r_B[Q_1D];
397*9e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
398*9e201c85SYohann     r_B[i] = B[data.t_id_x + i*P_1D];
399*9e201c85SYohann   }
400*9e201c85SYohann 
401*9e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
402*9e201c85SYohann     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
403*9e201c85SYohann     __syncthreads();
404*9e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
405*9e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
406*9e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
407*9e201c85SYohann       }
408*9e201c85SYohann     }
409*9e201c85SYohann     __syncthreads();
410*9e201c85SYohann   }
411*9e201c85SYohann }
412*9e201c85SYohann 
413*9e201c85SYohann //------------------------------------------------------------------------------
414*9e201c85SYohann // 3D interpolate to quadrature points
415*9e201c85SYohann //------------------------------------------------------------------------------
416*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
417*9e201c85SYohann inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
418*9e201c85SYohann   CeedScalar r_t1[T_1D];
419*9e201c85SYohann   CeedScalar r_t2[T_1D];
420*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
421*9e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
422*9e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
423*9e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D);
424*9e201c85SYohann   }
425*9e201c85SYohann }
426*9e201c85SYohann 
427*9e201c85SYohann //------------------------------------------------------------------------------
428*9e201c85SYohann // 3D interpolate transpose
429*9e201c85SYohann //------------------------------------------------------------------------------
430*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
431*9e201c85SYohann inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
432*9e201c85SYohann   CeedScalar r_t1[T_1D];
433*9e201c85SYohann   CeedScalar r_t2[T_1D];
434*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
435*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D, c_B, r_t1);
436*9e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
437*9e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
438*9e201c85SYohann   }
439*9e201c85SYohann }
440*9e201c85SYohann 
441*9e201c85SYohann //------------------------------------------------------------------------------
442*9e201c85SYohann // 3D derivatives at quadrature points
443*9e201c85SYohann //------------------------------------------------------------------------------
444*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
445*9e201c85SYohann inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
446*9e201c85SYohann   CeedScalar r_t1[T_1D];
447*9e201c85SYohann   CeedScalar r_t2[T_1D];
448*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
449*9e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_G, r_t1);
450*9e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
451*9e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D);
452*9e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
453*9e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
454*9e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D);
455*9e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
456*9e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
457*9e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D);
458*9e201c85SYohann   }
459*9e201c85SYohann }
460*9e201c85SYohann 
461*9e201c85SYohann //------------------------------------------------------------------------------
462*9e201c85SYohann // 3D derivatives transpose
463*9e201c85SYohann //------------------------------------------------------------------------------
464*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
465*9e201c85SYohann inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
466*9e201c85SYohann   CeedScalar r_t1[T_1D];
467*9e201c85SYohann   CeedScalar r_t2[T_1D];
468*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
469*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1);
470*9e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
471*9e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*P_1D);
472*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1);
473*9e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
474*9e201c85SYohann     ContractTransposeAddX3d<NUM_COMP,P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
475*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1);
476*9e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
477*9e201c85SYohann     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
478*9e201c85SYohann   }
479*9e201c85SYohann }
480*9e201c85SYohann 
481*9e201c85SYohann //------------------------------------------------------------------------------
482*9e201c85SYohann // 3D derivatives at quadrature points
483*9e201c85SYohann //------------------------------------------------------------------------------
484*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
485*9e201c85SYohann inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
486*9e201c85SYohann   CeedScalar r_t1[T_1D];
487*9e201c85SYohann   CeedScalar r_t2[T_1D];
488*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489*9e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
490*9e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
491*9e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
492*9e201c85SYohann     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D);
493*9e201c85SYohann     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D);
494*9e201c85SYohann     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D);
495*9e201c85SYohann   }
496*9e201c85SYohann }
497*9e201c85SYohann 
498*9e201c85SYohann //------------------------------------------------------------------------------
499*9e201c85SYohann // 3D derivatives transpose
500*9e201c85SYohann //------------------------------------------------------------------------------
501*9e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
502*9e201c85SYohann inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
503*9e201c85SYohann   CeedScalar r_t1[T_1D];
504*9e201c85SYohann   CeedScalar r_t2[T_1D];
505*9e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
506*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2);
507*9e201c85SYohann     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2);
508*9e201c85SYohann     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2);
509*9e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
510*9e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
511*9e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
512*9e201c85SYohann   }
513*9e201c85SYohann }
514*9e201c85SYohann 
515*9e201c85SYohann //------------------------------------------------------------------------------
516*9e201c85SYohann // 3D quadrature weights
517*9e201c85SYohann //------------------------------------------------------------------------------
518*9e201c85SYohann template <int Q_1D>
519*9e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
520*9e201c85SYohann   const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
521*9e201c85SYohann   const CeedScalar pw = quad ? q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0;
522*9e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
523*9e201c85SYohann     w[q] = quad ? pw*q_weight_1d[q] : 0.0;
524*9e201c85SYohann   }
525*9e201c85SYohann }
526*9e201c85SYohann 
527*9e201c85SYohann //------------------------------------------------------------------------------
528*9e201c85SYohann 
529*9e201c85SYohann #endif
530