xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision d6c19ee8504c74d8f30ec67127f069a58291b3ac)
1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c8e372f0SJeremy L Thompson //
4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c8e372f0SJeremy L Thompson //
6c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c8e372f0SJeremy L Thompson 
8c8e372f0SJeremy L Thompson /// @file
9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10c8e372f0SJeremy L Thompson #include <ceed/types.h>
11c8e372f0SJeremy L Thompson 
12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13c8e372f0SJeremy L Thompson // 2D
14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15c8e372f0SJeremy L Thompson 
16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17c8e372f0SJeremy L Thompson // 2D tensor contraction x
18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22*d6c19ee8SJeremy L Thompson   __syncthreads();
23c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
24c8e372f0SJeremy L Thompson   __syncthreads();
25c8e372f0SJeremy L Thompson   *V = 0.0;
26c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
27c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
28c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29c8e372f0SJeremy L Thompson     }
30c8e372f0SJeremy L Thompson   }
31c8e372f0SJeremy L Thompson }
32c8e372f0SJeremy L Thompson 
33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34c8e372f0SJeremy L Thompson // 2D tensor contract y
35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39*d6c19ee8SJeremy L Thompson   __syncthreads();
40c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
41c8e372f0SJeremy L Thompson   __syncthreads();
42c8e372f0SJeremy L Thompson   *V = 0.0;
43c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
45c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46c8e372f0SJeremy L Thompson     }
47c8e372f0SJeremy L Thompson   }
48c8e372f0SJeremy L Thompson }
49c8e372f0SJeremy L Thompson 
50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56*d6c19ee8SJeremy L Thompson   __syncthreads();
57c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
58c8e372f0SJeremy L Thompson   __syncthreads();
59c8e372f0SJeremy L Thompson   *V = 0.0;
60c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
61c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
62c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63c8e372f0SJeremy L Thompson     }
64c8e372f0SJeremy L Thompson   }
65c8e372f0SJeremy L Thompson }
66c8e372f0SJeremy L Thompson 
67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73*d6c19ee8SJeremy L Thompson   __syncthreads();
74c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
75c8e372f0SJeremy L Thompson   __syncthreads();
76c8e372f0SJeremy L Thompson   *V = 0.0;
77c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
78c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
79c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80c8e372f0SJeremy L Thompson     }
81c8e372f0SJeremy L Thompson   }
82c8e372f0SJeremy L Thompson }
83c8e372f0SJeremy L Thompson 
84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90*d6c19ee8SJeremy L Thompson   __syncthreads();
91c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
92c8e372f0SJeremy L Thompson   __syncthreads();
93c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
94c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
95c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96c8e372f0SJeremy L Thompson     }
97c8e372f0SJeremy L Thompson   }
98c8e372f0SJeremy L Thompson }
99c8e372f0SJeremy L Thompson 
100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106c8e372f0SJeremy L Thompson 
107c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108*d6c19ee8SJeremy L Thompson     __syncthreads();
109c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110c8e372f0SJeremy L Thompson     __syncthreads();
111c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112c8e372f0SJeremy L Thompson   }
113c8e372f0SJeremy L Thompson }
114c8e372f0SJeremy L Thompson 
115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118c8e372f0SJeremy L Thompson 
119c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120*d6c19ee8SJeremy L Thompson     __syncthreads();
121c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122c8e372f0SJeremy L Thompson     __syncthreads();
123c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124c8e372f0SJeremy L Thompson   }
125c8e372f0SJeremy L Thompson }
126c8e372f0SJeremy L Thompson 
127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135c8e372f0SJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140c8e372f0SJeremy L Thompson   }
141ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
142ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
143c8e372f0SJeremy L Thompson }
144c8e372f0SJeremy L Thompson 
145c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
146c8e372f0SJeremy L Thompson // 2D interpolate transpose
147c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
148c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
149c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
150c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
151c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
152c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
153c8e372f0SJeremy L Thompson 
154ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
155c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
156c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
157c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
158c8e372f0SJeremy L Thompson   }
159ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
160c8e372f0SJeremy L Thompson }
161c8e372f0SJeremy L Thompson 
162c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
163c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
165c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
166c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
167c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
168c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
169c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
170c8e372f0SJeremy L Thompson 
171ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
172c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
174c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
175c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
176c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
177c8e372f0SJeremy L Thompson   }
178ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
179ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
180c8e372f0SJeremy L Thompson }
181c8e372f0SJeremy L Thompson 
182c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
183c8e372f0SJeremy L Thompson // 2D derivatives transpose
184c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
185c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
189c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
190c8e372f0SJeremy L Thompson 
191ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
192c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
193c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
194c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
195c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
196c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
197c8e372f0SJeremy L Thompson   }
198ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
199c8e372f0SJeremy L Thompson }
200c8e372f0SJeremy L Thompson 
201c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
202c8e372f0SJeremy L Thompson // 2D quadrature weights
203c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
204c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
205c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
206c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
207c8e372f0SJeremy L Thompson 
208c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
209c8e372f0SJeremy L Thompson }
210c8e372f0SJeremy L Thompson 
211c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
212c8e372f0SJeremy L Thompson // 3D
213c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
214c8e372f0SJeremy L Thompson 
215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
216c8e372f0SJeremy L Thompson // 3D tensor contract x
217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
218c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
221*d6c19ee8SJeremy L Thompson   __syncthreads();
222c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
223c8e372f0SJeremy L Thompson   __syncthreads();
224c8e372f0SJeremy L Thompson   *V = 0.0;
225c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
226c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
227c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
228c8e372f0SJeremy L Thompson     }
229c8e372f0SJeremy L Thompson   }
230c8e372f0SJeremy L Thompson }
231c8e372f0SJeremy L Thompson 
232c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
233c8e372f0SJeremy L Thompson // 3D tensor contract y
234c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
235c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
238*d6c19ee8SJeremy L Thompson   __syncthreads();
239c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
240c8e372f0SJeremy L Thompson   __syncthreads();
241c8e372f0SJeremy L Thompson   *V = 0.0;
242c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
243c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
244c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
245c8e372f0SJeremy L Thompson     }
246c8e372f0SJeremy L Thompson   }
247c8e372f0SJeremy L Thompson }
248c8e372f0SJeremy L Thompson 
249c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
250c8e372f0SJeremy L Thompson // 3D tensor contract z
251c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
252c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
255*d6c19ee8SJeremy L Thompson   __syncthreads();
256c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
257c8e372f0SJeremy L Thompson   __syncthreads();
258c8e372f0SJeremy L Thompson   *V = 0.0;
259c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
260c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
261c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
262c8e372f0SJeremy L Thompson     }
263c8e372f0SJeremy L Thompson   }
264c8e372f0SJeremy L Thompson }
265c8e372f0SJeremy L Thompson 
266c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
267c8e372f0SJeremy L Thompson // 3D tensor contract z
268c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
269c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
272*d6c19ee8SJeremy L Thompson   __syncthreads();
273c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
274c8e372f0SJeremy L Thompson   __syncthreads();
275c8e372f0SJeremy L Thompson   *V = 0.0;
276c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
277c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
278c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
279c8e372f0SJeremy L Thompson     }
280c8e372f0SJeremy L Thompson   }
281c8e372f0SJeremy L Thompson }
282c8e372f0SJeremy L Thompson 
283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
284c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
286c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
288c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289*d6c19ee8SJeremy L Thompson   __syncthreads();
290c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
291c8e372f0SJeremy L Thompson   __syncthreads();
292c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
293c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
294c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
295c8e372f0SJeremy L Thompson     }
296c8e372f0SJeremy L Thompson   }
297c8e372f0SJeremy L Thompson }
298c8e372f0SJeremy L Thompson 
299c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
300c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
301c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
302c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
305*d6c19ee8SJeremy L Thompson   __syncthreads();
306c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
307c8e372f0SJeremy L Thompson   __syncthreads();
308c8e372f0SJeremy L Thompson   *V = 0.0;
309c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
310c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
311c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
312c8e372f0SJeremy L Thompson     }
313c8e372f0SJeremy L Thompson   }
314c8e372f0SJeremy L Thompson }
315c8e372f0SJeremy L Thompson 
316c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
317c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
318c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
319c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
321c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322*d6c19ee8SJeremy L Thompson   __syncthreads();
323c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
324c8e372f0SJeremy L Thompson   __syncthreads();
325c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
326c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
327c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
328c8e372f0SJeremy L Thompson     }
329c8e372f0SJeremy L Thompson   }
330c8e372f0SJeremy L Thompson }
331c8e372f0SJeremy L Thompson 
332c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
333c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
334c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
335c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
338*d6c19ee8SJeremy L Thompson   __syncthreads();
339c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
340c8e372f0SJeremy L Thompson   __syncthreads();
341c8e372f0SJeremy L Thompson   *V = 0.0;
342c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
343c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
344c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
345c8e372f0SJeremy L Thompson     }
346c8e372f0SJeremy L Thompson   }
347c8e372f0SJeremy L Thompson }
348c8e372f0SJeremy L Thompson 
349c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
350c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
351c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
352c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
354c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355*d6c19ee8SJeremy L Thompson   __syncthreads();
356c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
357c8e372f0SJeremy L Thompson   __syncthreads();
358c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
359c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
360c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
361c8e372f0SJeremy L Thompson     }
362c8e372f0SJeremy L Thompson   }
363c8e372f0SJeremy L Thompson }
364c8e372f0SJeremy L Thompson 
365c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
366c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
367c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
368c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
369c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
370259057edSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371c8e372f0SJeremy L Thompson 
372c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373*d6c19ee8SJeremy L Thompson     __syncthreads();
374c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
375c8e372f0SJeremy L Thompson     __syncthreads();
376259057edSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
377c8e372f0SJeremy L Thompson   }
378c8e372f0SJeremy L Thompson }
379c8e372f0SJeremy L Thompson 
380c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
381c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
382259057edSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383c8e372f0SJeremy L Thompson 
384c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385*d6c19ee8SJeremy L Thompson     __syncthreads();
386259057edSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
387c8e372f0SJeremy L Thompson     __syncthreads();
388c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
389c8e372f0SJeremy L Thompson   }
390c8e372f0SJeremy L Thompson }
391c8e372f0SJeremy L Thompson 
392c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
393c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
394c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
395c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
396c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
397c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
398259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
399c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
400c8e372f0SJeremy L Thompson 
401ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
402c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
403c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
404c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
405c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
406c8e372f0SJeremy L Thompson   }
407ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
408ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
409c8e372f0SJeremy L Thompson }
410c8e372f0SJeremy L Thompson 
411c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
412c8e372f0SJeremy L Thompson // 3D interpolate transpose
413c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
414c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
415c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
416c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
417259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
418c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
419c8e372f0SJeremy L Thompson 
420ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
421c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
422c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
423c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
424c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
425c8e372f0SJeremy L Thompson   }
426ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
427c8e372f0SJeremy L Thompson }
428c8e372f0SJeremy L Thompson 
429c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
430c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
431c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
432c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
434c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
435259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
436c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
437c8e372f0SJeremy L Thompson 
438ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
439c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
440c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
441c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
442c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
443c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
444c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
445c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
446c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
447c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
448c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
449c8e372f0SJeremy L Thompson   }
450ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
451ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
452c8e372f0SJeremy L Thompson }
453c8e372f0SJeremy L Thompson 
454c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
455c8e372f0SJeremy L Thompson // 3D derivatives transpose
456c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
457c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
458c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
459c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
460259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
461c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
462c8e372f0SJeremy L Thompson 
463ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
464c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
465c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
466c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
467c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
468c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
469c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
470c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
471c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
472c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
473c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
474c8e372f0SJeremy L Thompson   }
475ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
476c8e372f0SJeremy L Thompson }
477c8e372f0SJeremy L Thompson 
478c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
479c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
480c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
481c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
482c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
483c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
484259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
485c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
486c8e372f0SJeremy L Thompson 
487ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
488c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
490c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
491c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
492c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
493c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
494c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
495c8e372f0SJeremy L Thompson   }
496ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
497ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
498c8e372f0SJeremy L Thompson }
499c8e372f0SJeremy L Thompson 
500c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
501c8e372f0SJeremy L Thompson // 3D derivatives transpose
502c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
503c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
504c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
505c8e372f0SJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
506259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
507c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
508c8e372f0SJeremy L Thompson 
509ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
510c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
512c8e372f0SJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
513c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
514c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
515c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
516c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
517c8e372f0SJeremy L Thompson   }
518ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
519c8e372f0SJeremy L Thompson }
520c8e372f0SJeremy L Thompson 
521c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
522c8e372f0SJeremy L Thompson // 3D quadrature weights
523c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
524c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
525c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
526259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
527c8e372f0SJeremy L Thompson 
528c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
529c8e372f0SJeremy L Thompson }
530