xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision 9b91271bd9570896f9ae5225f4862ed0c149b456)
1*9b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9b91271bSJeremy L Thompson //
4*9b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*9b91271bSJeremy L Thompson //
6*9b91271bSJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*9b91271bSJeremy L Thompson 
8*9b91271bSJeremy L Thompson /// @file
9*9b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates
10*9b91271bSJeremy L Thompson #include <ceed/types.h>
11*9b91271bSJeremy L Thompson 
12*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
13*9b91271bSJeremy L Thompson // 2D
14*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
15*9b91271bSJeremy L Thompson 
16*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
17*9b91271bSJeremy L Thompson // 2D tensor contraction x
18*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
19*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20*9b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21*9b91271bSJeremy L Thompson                                             CeedScalar *V) {
22*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
23*9b91271bSJeremy L Thompson   __syncthreads();
24*9b91271bSJeremy L Thompson   *V = 0.0;
25*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
26*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
27*9b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
28*9b91271bSJeremy L Thompson     }
29*9b91271bSJeremy L Thompson   }
30*9b91271bSJeremy L Thompson   __syncthreads();
31*9b91271bSJeremy L Thompson }
32*9b91271bSJeremy L Thompson 
33*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
34*9b91271bSJeremy L Thompson // 2D tensor contract y
35*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
36*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37*9b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38*9b91271bSJeremy L Thompson                                             CeedScalar *V) {
39*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
40*9b91271bSJeremy L Thompson   __syncthreads();
41*9b91271bSJeremy L Thompson   *V = 0.0;
42*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
43*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
44*9b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
45*9b91271bSJeremy L Thompson     }
46*9b91271bSJeremy L Thompson   }
47*9b91271bSJeremy L Thompson   __syncthreads();
48*9b91271bSJeremy L Thompson }
49*9b91271bSJeremy L Thompson 
50*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
51*9b91271bSJeremy L Thompson // 2D transpose tensor contract y
52*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
53*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55*9b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
57*9b91271bSJeremy L Thompson   __syncthreads();
58*9b91271bSJeremy L Thompson   *V = 0.0;
59*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
60*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
61*9b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
62*9b91271bSJeremy L Thompson     }
63*9b91271bSJeremy L Thompson   }
64*9b91271bSJeremy L Thompson   __syncthreads();
65*9b91271bSJeremy L Thompson }
66*9b91271bSJeremy L Thompson 
67*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
68*9b91271bSJeremy L Thompson // 2D transpose tensor contract x
69*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
70*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72*9b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
74*9b91271bSJeremy L Thompson   __syncthreads();
75*9b91271bSJeremy L Thompson   *V = 0.0;
76*9b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
77*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
78*9b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
79*9b91271bSJeremy L Thompson     }
80*9b91271bSJeremy L Thompson   }
81*9b91271bSJeremy L Thompson   __syncthreads();
82*9b91271bSJeremy L Thompson }
83*9b91271bSJeremy L Thompson 
84*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
85*9b91271bSJeremy L Thompson // 2D transpose tensor contract and add x
86*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
87*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89*9b91271bSJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
91*9b91271bSJeremy L Thompson   __syncthreads();
92*9b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
93*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
94*9b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
95*9b91271bSJeremy L Thompson     }
96*9b91271bSJeremy L Thompson   }
97*9b91271bSJeremy L Thompson   __syncthreads();
98*9b91271bSJeremy L Thompson }
99*9b91271bSJeremy L Thompson 
100*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
101*9b91271bSJeremy L Thompson // 2D pack/unpack quadrature values
102*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
103*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104*9b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105*9b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106*9b91271bSJeremy L Thompson 
107*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108*9b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
109*9b91271bSJeremy L Thompson     __syncthreads();
110*9b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
111*9b91271bSJeremy L Thompson     __syncthreads();
112*9b91271bSJeremy L Thompson   }
113*9b91271bSJeremy L Thompson }
114*9b91271bSJeremy L Thompson 
115*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116*9b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117*9b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118*9b91271bSJeremy L Thompson 
119*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120*9b91271bSJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
121*9b91271bSJeremy L Thompson     __syncthreads();
122*9b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
123*9b91271bSJeremy L Thompson     __syncthreads();
124*9b91271bSJeremy L Thompson   }
125*9b91271bSJeremy L Thompson }
126*9b91271bSJeremy L Thompson 
127*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
128*9b91271bSJeremy L Thompson // 2D interpolate to quadrature points
129*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
130*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131*9b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132*9b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133*9b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134*9b91271bSJeremy L Thompson   CeedScalar r_t[1];
135*9b91271bSJeremy L Thompson 
136*9b91271bSJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138*9b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139*9b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140*9b91271bSJeremy L Thompson   }
141*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
142*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
143*9b91271bSJeremy L Thompson }
144*9b91271bSJeremy L Thompson 
145*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
146*9b91271bSJeremy L Thompson // 2D interpolate transpose
147*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
148*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
149*9b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
150*9b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
151*9b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
152*9b91271bSJeremy L Thompson   CeedScalar r_t[1];
153*9b91271bSJeremy L Thompson 
154*9b91271bSJeremy L Thompson   QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
155*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
156*9b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
157*9b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
158*9b91271bSJeremy L Thompson   }
159*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
160*9b91271bSJeremy L Thompson }
161*9b91271bSJeremy L Thompson 
162*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
163*9b91271bSJeremy L Thompson // 2D derivatives at quadrature points
164*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
165*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
166*9b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
167*9b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
168*9b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
169*9b91271bSJeremy L Thompson   CeedScalar r_t[1];
170*9b91271bSJeremy L Thompson 
171*9b91271bSJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
172*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173*9b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
174*9b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
175*9b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
176*9b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
177*9b91271bSJeremy L Thompson   }
178*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
179*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
180*9b91271bSJeremy L Thompson }
181*9b91271bSJeremy L Thompson 
182*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
183*9b91271bSJeremy L Thompson // 2D derivatives transpose
184*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
185*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187*9b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188*9b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
189*9b91271bSJeremy L Thompson   CeedScalar r_t[1];
190*9b91271bSJeremy L Thompson 
191*9b91271bSJeremy L Thompson   QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
192*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
193*9b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
194*9b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
195*9b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
196*9b91271bSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
197*9b91271bSJeremy L Thompson   }
198*9b91271bSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
199*9b91271bSJeremy L Thompson }
200*9b91271bSJeremy L Thompson 
201*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
202*9b91271bSJeremy L Thompson // 2D quadrature weights
203*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
204*9b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
205*9b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
206*9b91271bSJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
207*9b91271bSJeremy L Thompson 
208*9b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
209*9b91271bSJeremy L Thompson }
210*9b91271bSJeremy L Thompson 
211*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
212*9b91271bSJeremy L Thompson // 3D
213*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
214*9b91271bSJeremy L Thompson 
215*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
216*9b91271bSJeremy L Thompson // 3D tensor contract x
217*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
218*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219*9b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220*9b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
221*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
222*9b91271bSJeremy L Thompson   __syncthreads();
223*9b91271bSJeremy L Thompson   *V = 0.0;
224*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
225*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
226*9b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
227*9b91271bSJeremy L Thompson     }
228*9b91271bSJeremy L Thompson   }
229*9b91271bSJeremy L Thompson   __syncthreads();
230*9b91271bSJeremy L Thompson }
231*9b91271bSJeremy L Thompson 
232*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
233*9b91271bSJeremy L Thompson // 3D tensor contract y
234*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
235*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236*9b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237*9b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
238*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
239*9b91271bSJeremy L Thompson   __syncthreads();
240*9b91271bSJeremy L Thompson   *V = 0.0;
241*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
242*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
243*9b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
244*9b91271bSJeremy L Thompson     }
245*9b91271bSJeremy L Thompson   }
246*9b91271bSJeremy L Thompson   __syncthreads();
247*9b91271bSJeremy L Thompson }
248*9b91271bSJeremy L Thompson 
249*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
250*9b91271bSJeremy L Thompson // 3D tensor contract z
251*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
252*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253*9b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254*9b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
255*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
256*9b91271bSJeremy L Thompson   __syncthreads();
257*9b91271bSJeremy L Thompson   *V = 0.0;
258*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
259*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
260*9b91271bSJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
261*9b91271bSJeremy L Thompson     }
262*9b91271bSJeremy L Thompson   }
263*9b91271bSJeremy L Thompson   __syncthreads();
264*9b91271bSJeremy L Thompson }
265*9b91271bSJeremy L Thompson 
266*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
267*9b91271bSJeremy L Thompson // 3D tensor contract z
268*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
269*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271*9b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
272*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
273*9b91271bSJeremy L Thompson   __syncthreads();
274*9b91271bSJeremy L Thompson   *V = 0.0;
275*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
276*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
277*9b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
278*9b91271bSJeremy L Thompson     }
279*9b91271bSJeremy L Thompson   }
280*9b91271bSJeremy L Thompson   __syncthreads();
281*9b91271bSJeremy L Thompson }
282*9b91271bSJeremy L Thompson 
283*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
284*9b91271bSJeremy L Thompson // 3D transpose tensor contract z
285*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
286*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
288*9b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
290*9b91271bSJeremy L Thompson   __syncthreads();
291*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
292*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
293*9b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
294*9b91271bSJeremy L Thompson     }
295*9b91271bSJeremy L Thompson   }
296*9b91271bSJeremy L Thompson   __syncthreads();
297*9b91271bSJeremy L Thompson }
298*9b91271bSJeremy L Thompson 
299*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
300*9b91271bSJeremy L Thompson // 3D transpose tensor contract y
301*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
302*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304*9b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
305*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
306*9b91271bSJeremy L Thompson   __syncthreads();
307*9b91271bSJeremy L Thompson   *V = 0.0;
308*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
309*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
310*9b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
311*9b91271bSJeremy L Thompson     }
312*9b91271bSJeremy L Thompson   }
313*9b91271bSJeremy L Thompson   __syncthreads();
314*9b91271bSJeremy L Thompson }
315*9b91271bSJeremy L Thompson 
316*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
317*9b91271bSJeremy L Thompson // 3D transpose tensor contract y
318*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
319*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
321*9b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
323*9b91271bSJeremy L Thompson   __syncthreads();
324*9b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
325*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
326*9b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
327*9b91271bSJeremy L Thompson     }
328*9b91271bSJeremy L Thompson   }
329*9b91271bSJeremy L Thompson   __syncthreads();
330*9b91271bSJeremy L Thompson }
331*9b91271bSJeremy L Thompson 
332*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
333*9b91271bSJeremy L Thompson // 3D transpose tensor contract x
334*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
335*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337*9b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
338*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
339*9b91271bSJeremy L Thompson   __syncthreads();
340*9b91271bSJeremy L Thompson   *V = 0.0;
341*9b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
342*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
343*9b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
344*9b91271bSJeremy L Thompson     }
345*9b91271bSJeremy L Thompson   }
346*9b91271bSJeremy L Thompson   __syncthreads();
347*9b91271bSJeremy L Thompson }
348*9b91271bSJeremy L Thompson 
349*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
350*9b91271bSJeremy L Thompson // 3D transpose tensor contract add x
351*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
352*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353*9b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
354*9b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355*9b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
356*9b91271bSJeremy L Thompson   __syncthreads();
357*9b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
358*9b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
359*9b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
360*9b91271bSJeremy L Thompson     }
361*9b91271bSJeremy L Thompson   }
362*9b91271bSJeremy L Thompson   __syncthreads();
363*9b91271bSJeremy L Thompson }
364*9b91271bSJeremy L Thompson 
365*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
366*9b91271bSJeremy L Thompson // 3D pack/unpack quadrature values
367*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
368*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
369*9b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
370*9b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371*9b91271bSJeremy L Thompson 
372*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373*9b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
374*9b91271bSJeremy L Thompson     __syncthreads();
375*9b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
376*9b91271bSJeremy L Thompson     __syncthreads();
377*9b91271bSJeremy L Thompson   }
378*9b91271bSJeremy L Thompson }
379*9b91271bSJeremy L Thompson 
380*9b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
381*9b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
382*9b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383*9b91271bSJeremy L Thompson 
384*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385*9b91271bSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
386*9b91271bSJeremy L Thompson     __syncthreads();
387*9b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
388*9b91271bSJeremy L Thompson     __syncthreads();
389*9b91271bSJeremy L Thompson   }
390*9b91271bSJeremy L Thompson }
391*9b91271bSJeremy L Thompson 
392*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
393*9b91271bSJeremy L Thompson // 3D interpolate to quadrature points
394*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
395*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
396*9b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
397*9b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
398*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
399*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
400*9b91271bSJeremy L Thompson 
401*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
402*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
403*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
404*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
405*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
406*9b91271bSJeremy L Thompson   }
407*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
408*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
409*9b91271bSJeremy L Thompson }
410*9b91271bSJeremy L Thompson 
411*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
412*9b91271bSJeremy L Thompson // 3D interpolate transpose
413*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
414*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
415*9b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
416*9b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
417*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
418*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
419*9b91271bSJeremy L Thompson 
420*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
421*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
422*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
423*9b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
424*9b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
425*9b91271bSJeremy L Thompson   }
426*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
427*9b91271bSJeremy L Thompson }
428*9b91271bSJeremy L Thompson 
429*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
430*9b91271bSJeremy L Thompson // 3D derivatives at quadrature points
431*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
432*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433*9b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
434*9b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
435*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
436*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
437*9b91271bSJeremy L Thompson 
438*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
439*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
440*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
441*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
442*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
443*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
444*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
445*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
446*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
447*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
448*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
449*9b91271bSJeremy L Thompson   }
450*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
451*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
452*9b91271bSJeremy L Thompson }
453*9b91271bSJeremy L Thompson 
454*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
455*9b91271bSJeremy L Thompson // 3D derivatives transpose
456*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
457*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
458*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
459*9b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
460*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
461*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
462*9b91271bSJeremy L Thompson 
463*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
464*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
465*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
466*9b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
467*9b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
468*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
469*9b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
470*9b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
471*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
472*9b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
473*9b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
474*9b91271bSJeremy L Thompson   }
475*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
476*9b91271bSJeremy L Thompson }
477*9b91271bSJeremy L Thompson 
478*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
479*9b91271bSJeremy L Thompson // 3D derivatives at quadrature points
480*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
481*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
482*9b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
483*9b91271bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
484*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
485*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
486*9b91271bSJeremy L Thompson 
487*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
488*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
490*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
491*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
492*9b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
493*9b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
494*9b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
495*9b91271bSJeremy L Thompson   }
496*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
497*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
498*9b91271bSJeremy L Thompson }
499*9b91271bSJeremy L Thompson 
500*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
501*9b91271bSJeremy L Thompson // 3D derivatives transpose
502*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
503*9b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
504*9b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
505*9b91271bSJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
506*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
507*9b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
508*9b91271bSJeremy L Thompson 
509*9b91271bSJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
510*9b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
512*9b91271bSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
513*9b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
514*9b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
515*9b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
516*9b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
517*9b91271bSJeremy L Thompson   }
518*9b91271bSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
519*9b91271bSJeremy L Thompson }
520*9b91271bSJeremy L Thompson 
521*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
522*9b91271bSJeremy L Thompson // 3D quadrature weights
523*9b91271bSJeremy L Thompson //------------------------------------------------------------------------------
524*9b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
525*9b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
526*9b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
527*9b91271bSJeremy L Thompson 
528*9b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
529*9b91271bSJeremy L Thompson }
530