xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision 259057ed0da46db025ab7b301a4a4b3f356037e0)
1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c8e372f0SJeremy L Thompson //
4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c8e372f0SJeremy L Thompson //
6c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c8e372f0SJeremy L Thompson 
8c8e372f0SJeremy L Thompson /// @file
9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10c8e372f0SJeremy L Thompson #include <ceed/types.h>
11c8e372f0SJeremy L Thompson 
12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13c8e372f0SJeremy L Thompson // 2D
14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15c8e372f0SJeremy L Thompson 
16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17c8e372f0SJeremy L Thompson // 2D tensor contraction x
18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
23c8e372f0SJeremy L Thompson   __syncthreads();
24c8e372f0SJeremy L Thompson   *V = 0.0;
25c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
26c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
27c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
28c8e372f0SJeremy L Thompson     }
29c8e372f0SJeremy L Thompson   }
30c8e372f0SJeremy L Thompson   __syncthreads();
31c8e372f0SJeremy L Thompson }
32c8e372f0SJeremy L Thompson 
33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34c8e372f0SJeremy L Thompson // 2D tensor contract y
35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
40c8e372f0SJeremy L Thompson   __syncthreads();
41c8e372f0SJeremy L Thompson   *V = 0.0;
42c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
43c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
44c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
45c8e372f0SJeremy L Thompson     }
46c8e372f0SJeremy L Thompson   }
47c8e372f0SJeremy L Thompson   __syncthreads();
48c8e372f0SJeremy L Thompson }
49c8e372f0SJeremy L Thompson 
50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
57c8e372f0SJeremy L Thompson   __syncthreads();
58c8e372f0SJeremy L Thompson   *V = 0.0;
59c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
60c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
61c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
62c8e372f0SJeremy L Thompson     }
63c8e372f0SJeremy L Thompson   }
64c8e372f0SJeremy L Thompson   __syncthreads();
65c8e372f0SJeremy L Thompson }
66c8e372f0SJeremy L Thompson 
67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
74c8e372f0SJeremy L Thompson   __syncthreads();
75c8e372f0SJeremy L Thompson   *V = 0.0;
76c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
77c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
78c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
79c8e372f0SJeremy L Thompson     }
80c8e372f0SJeremy L Thompson   }
81c8e372f0SJeremy L Thompson   __syncthreads();
82c8e372f0SJeremy L Thompson }
83c8e372f0SJeremy L Thompson 
84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
91c8e372f0SJeremy L Thompson   __syncthreads();
92c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
93c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
94c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
95c8e372f0SJeremy L Thompson     }
96c8e372f0SJeremy L Thompson   }
97c8e372f0SJeremy L Thompson   __syncthreads();
98c8e372f0SJeremy L Thompson }
99c8e372f0SJeremy L Thompson 
100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106c8e372f0SJeremy L Thompson 
107c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
109c8e372f0SJeremy L Thompson     __syncthreads();
110c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
111c8e372f0SJeremy L Thompson     __syncthreads();
112c8e372f0SJeremy L Thompson   }
113c8e372f0SJeremy L Thompson }
114c8e372f0SJeremy L Thompson 
115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118c8e372f0SJeremy L Thompson 
119c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
121c8e372f0SJeremy L Thompson     __syncthreads();
122c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
123c8e372f0SJeremy L Thompson     __syncthreads();
124c8e372f0SJeremy L Thompson   }
125c8e372f0SJeremy L Thompson }
126c8e372f0SJeremy L Thompson 
127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135c8e372f0SJeremy L Thompson 
136c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140c8e372f0SJeremy L Thompson   }
141*259057edSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
142c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
143c8e372f0SJeremy L Thompson }
144c8e372f0SJeremy L Thompson 
145c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
146c8e372f0SJeremy L Thompson // 2D interpolate transpose
147c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
148c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
149c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
150c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
151c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
152c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
153c8e372f0SJeremy L Thompson 
154c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
155c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
156c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
157c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
158c8e372f0SJeremy L Thompson   }
159c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
160c8e372f0SJeremy L Thompson }
161c8e372f0SJeremy L Thompson 
162c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
163c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
165c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
166c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
167c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
168c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
169c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
170c8e372f0SJeremy L Thompson 
171c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
172c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
174c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
175c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
176c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
177c8e372f0SJeremy L Thompson   }
178*259057edSJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
179c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
180c8e372f0SJeremy L Thompson }
181c8e372f0SJeremy L Thompson 
182c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
183c8e372f0SJeremy L Thompson // 2D derivatives transpose
184c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
185c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
189c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
190c8e372f0SJeremy L Thompson 
191c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
192c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
193c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
194c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
195c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
196c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
197c8e372f0SJeremy L Thompson   }
198c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
199c8e372f0SJeremy L Thompson }
200c8e372f0SJeremy L Thompson 
201c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
202c8e372f0SJeremy L Thompson // 2D quadrature weights
203c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
204c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
205c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
206c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
207c8e372f0SJeremy L Thompson 
208c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
209c8e372f0SJeremy L Thompson }
210c8e372f0SJeremy L Thompson 
211c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
212c8e372f0SJeremy L Thompson // 3D
213c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
214c8e372f0SJeremy L Thompson 
215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
216c8e372f0SJeremy L Thompson // 3D tensor contract x
217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
218c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
221c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
222c8e372f0SJeremy L Thompson   __syncthreads();
223c8e372f0SJeremy L Thompson   *V = 0.0;
224c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
225c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
226c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
227c8e372f0SJeremy L Thompson     }
228c8e372f0SJeremy L Thompson   }
229c8e372f0SJeremy L Thompson   __syncthreads();
230c8e372f0SJeremy L Thompson }
231c8e372f0SJeremy L Thompson 
232c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
233c8e372f0SJeremy L Thompson // 3D tensor contract y
234c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
235c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
238c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
239c8e372f0SJeremy L Thompson   __syncthreads();
240c8e372f0SJeremy L Thompson   *V = 0.0;
241c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
242c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
243c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
244c8e372f0SJeremy L Thompson     }
245c8e372f0SJeremy L Thompson   }
246c8e372f0SJeremy L Thompson   __syncthreads();
247c8e372f0SJeremy L Thompson }
248c8e372f0SJeremy L Thompson 
249c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
250c8e372f0SJeremy L Thompson // 3D tensor contract z
251c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
252c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
255c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
256c8e372f0SJeremy L Thompson   __syncthreads();
257c8e372f0SJeremy L Thompson   *V = 0.0;
258c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
259c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
260c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
261c8e372f0SJeremy L Thompson     }
262c8e372f0SJeremy L Thompson   }
263c8e372f0SJeremy L Thompson   __syncthreads();
264c8e372f0SJeremy L Thompson }
265c8e372f0SJeremy L Thompson 
266c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
267c8e372f0SJeremy L Thompson // 3D tensor contract z
268c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
269c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
272c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
273c8e372f0SJeremy L Thompson   __syncthreads();
274c8e372f0SJeremy L Thompson   *V = 0.0;
275c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
276c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
277c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
278c8e372f0SJeremy L Thompson     }
279c8e372f0SJeremy L Thompson   }
280c8e372f0SJeremy L Thompson   __syncthreads();
281c8e372f0SJeremy L Thompson }
282c8e372f0SJeremy L Thompson 
283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
284c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
286c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
288c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
290c8e372f0SJeremy L Thompson   __syncthreads();
291c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
292c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
293c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
294c8e372f0SJeremy L Thompson     }
295c8e372f0SJeremy L Thompson   }
296c8e372f0SJeremy L Thompson   __syncthreads();
297c8e372f0SJeremy L Thompson }
298c8e372f0SJeremy L Thompson 
299c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
300c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
301c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
302c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
305c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
306c8e372f0SJeremy L Thompson   __syncthreads();
307c8e372f0SJeremy L Thompson   *V = 0.0;
308c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
309c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
310c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
311c8e372f0SJeremy L Thompson     }
312c8e372f0SJeremy L Thompson   }
313c8e372f0SJeremy L Thompson   __syncthreads();
314c8e372f0SJeremy L Thompson }
315c8e372f0SJeremy L Thompson 
316c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
317c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
318c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
319c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
321c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
323c8e372f0SJeremy L Thompson   __syncthreads();
324c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
325c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
326c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
327c8e372f0SJeremy L Thompson     }
328c8e372f0SJeremy L Thompson   }
329c8e372f0SJeremy L Thompson   __syncthreads();
330c8e372f0SJeremy L Thompson }
331c8e372f0SJeremy L Thompson 
332c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
333c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
334c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
335c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
338c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
339c8e372f0SJeremy L Thompson   __syncthreads();
340c8e372f0SJeremy L Thompson   *V = 0.0;
341c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
342c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
343c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
344c8e372f0SJeremy L Thompson     }
345c8e372f0SJeremy L Thompson   }
346c8e372f0SJeremy L Thompson   __syncthreads();
347c8e372f0SJeremy L Thompson }
348c8e372f0SJeremy L Thompson 
349c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
350c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
351c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
352c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
354c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
356c8e372f0SJeremy L Thompson   __syncthreads();
357c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
358c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
359c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
360c8e372f0SJeremy L Thompson     }
361c8e372f0SJeremy L Thompson   }
362c8e372f0SJeremy L Thompson   __syncthreads();
363c8e372f0SJeremy L Thompson }
364c8e372f0SJeremy L Thompson 
365c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
366c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
367c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
368c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
369c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
370*259057edSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371c8e372f0SJeremy L Thompson 
372c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
374c8e372f0SJeremy L Thompson     __syncthreads();
375*259057edSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
376c8e372f0SJeremy L Thompson     __syncthreads();
377c8e372f0SJeremy L Thompson   }
378c8e372f0SJeremy L Thompson }
379c8e372f0SJeremy L Thompson 
380c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
381c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
382*259057edSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383c8e372f0SJeremy L Thompson 
384c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385*259057edSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
386c8e372f0SJeremy L Thompson     __syncthreads();
387c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
388c8e372f0SJeremy L Thompson     __syncthreads();
389c8e372f0SJeremy L Thompson   }
390c8e372f0SJeremy L Thompson }
391c8e372f0SJeremy L Thompson 
392c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
393c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
394c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
395c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
396c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
397c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
398*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
399c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
400c8e372f0SJeremy L Thompson 
401c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
402c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
403c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
404c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
405c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
406c8e372f0SJeremy L Thompson   }
407*259057edSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
408c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
409c8e372f0SJeremy L Thompson }
410c8e372f0SJeremy L Thompson 
411c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
412c8e372f0SJeremy L Thompson // 3D interpolate transpose
413c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
414c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
415c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
416c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
417*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
418c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
419c8e372f0SJeremy L Thompson 
420c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
421c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
422c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
423c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
424c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
425c8e372f0SJeremy L Thompson   }
426c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
427c8e372f0SJeremy L Thompson }
428c8e372f0SJeremy L Thompson 
429c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
430c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
431c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
432c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
434c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
435*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
436c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
437c8e372f0SJeremy L Thompson 
438c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
439c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
440c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
441c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
442c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
443c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
444c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
445c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
446c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
447c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
448c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
449c8e372f0SJeremy L Thompson   }
450*259057edSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
451c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
452c8e372f0SJeremy L Thompson }
453c8e372f0SJeremy L Thompson 
454c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
455c8e372f0SJeremy L Thompson // 3D derivatives transpose
456c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
457c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
458c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
459c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
460*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
461c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
462c8e372f0SJeremy L Thompson 
463c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
464c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
465c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
466c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
467c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
468c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
469c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
470c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
471c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
472c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
473c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
474c8e372f0SJeremy L Thompson   }
475c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
476c8e372f0SJeremy L Thompson }
477c8e372f0SJeremy L Thompson 
478c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
479c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
480c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
481c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
482c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
483c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
484*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
485c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
486c8e372f0SJeremy L Thompson 
487c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
488c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
490c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
491c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
492c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
493c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
494c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
495c8e372f0SJeremy L Thompson   }
496*259057edSJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
497c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
498c8e372f0SJeremy L Thompson }
499c8e372f0SJeremy L Thompson 
500c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
501c8e372f0SJeremy L Thompson // 3D derivatives transpose
502c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
503c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
504c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
505c8e372f0SJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
506*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
507c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
508c8e372f0SJeremy L Thompson 
509c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
510c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
512c8e372f0SJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
513c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
514c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
515c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
516c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
517c8e372f0SJeremy L Thompson   }
518c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
519c8e372f0SJeremy L Thompson }
520c8e372f0SJeremy L Thompson 
521c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
522c8e372f0SJeremy L Thompson // 3D quadrature weights
523c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
524c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
525c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
526*259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
527c8e372f0SJeremy L Thompson 
528c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
529c8e372f0SJeremy L Thompson }
530