xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision d6c19ee8504c74d8f30ec67127f069a58291b3ac)
19b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39b91271bSJeremy L Thompson //
49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59b91271bSJeremy L Thompson //
69b91271bSJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79b91271bSJeremy L Thompson 
89b91271bSJeremy L Thompson /// @file
99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates
109b91271bSJeremy L Thompson #include <ceed/types.h>
119b91271bSJeremy L Thompson 
129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
139b91271bSJeremy L Thompson // 2D
149b91271bSJeremy L Thompson //------------------------------------------------------------------------------
159b91271bSJeremy L Thompson 
169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
179b91271bSJeremy L Thompson // 2D tensor contraction x
189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
219b91271bSJeremy L Thompson                                             CeedScalar *V) {
22*d6c19ee8SJeremy L Thompson   __syncthreads();
239b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
249b91271bSJeremy L Thompson   __syncthreads();
259b91271bSJeremy L Thompson   *V = 0.0;
269b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
279b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
289b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
299b91271bSJeremy L Thompson     }
309b91271bSJeremy L Thompson   }
319b91271bSJeremy L Thompson }
329b91271bSJeremy L Thompson 
339b91271bSJeremy L Thompson //------------------------------------------------------------------------------
349b91271bSJeremy L Thompson // 2D tensor contract y
359b91271bSJeremy L Thompson //------------------------------------------------------------------------------
369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
389b91271bSJeremy L Thompson                                             CeedScalar *V) {
39*d6c19ee8SJeremy L Thompson   __syncthreads();
409b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
419b91271bSJeremy L Thompson   __syncthreads();
429b91271bSJeremy L Thompson   *V = 0.0;
439b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
449b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
459b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
469b91271bSJeremy L Thompson     }
479b91271bSJeremy L Thompson   }
489b91271bSJeremy L Thompson }
499b91271bSJeremy L Thompson 
509b91271bSJeremy L Thompson //------------------------------------------------------------------------------
519b91271bSJeremy L Thompson // 2D transpose tensor contract y
529b91271bSJeremy L Thompson //------------------------------------------------------------------------------
539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
559b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56*d6c19ee8SJeremy L Thompson   __syncthreads();
579b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
589b91271bSJeremy L Thompson   __syncthreads();
599b91271bSJeremy L Thompson   *V = 0.0;
609b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
619b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
629b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
639b91271bSJeremy L Thompson     }
649b91271bSJeremy L Thompson   }
659b91271bSJeremy L Thompson }
669b91271bSJeremy L Thompson 
679b91271bSJeremy L Thompson //------------------------------------------------------------------------------
689b91271bSJeremy L Thompson // 2D transpose tensor contract x
699b91271bSJeremy L Thompson //------------------------------------------------------------------------------
709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
729b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73*d6c19ee8SJeremy L Thompson   __syncthreads();
749b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
759b91271bSJeremy L Thompson   __syncthreads();
769b91271bSJeremy L Thompson   *V = 0.0;
779b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
789b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
799b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
809b91271bSJeremy L Thompson     }
819b91271bSJeremy L Thompson   }
829b91271bSJeremy L Thompson }
839b91271bSJeremy L Thompson 
849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x
869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
899b91271bSJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90*d6c19ee8SJeremy L Thompson   __syncthreads();
919b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
929b91271bSJeremy L Thompson   __syncthreads();
939b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
949b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
959b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
969b91271bSJeremy L Thompson     }
979b91271bSJeremy L Thompson   }
989b91271bSJeremy L Thompson }
999b91271bSJeremy L Thompson 
1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values
1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1059b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
1069b91271bSJeremy L Thompson 
1079b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108*d6c19ee8SJeremy L Thompson     __syncthreads();
1099b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
1109b91271bSJeremy L Thompson     __syncthreads();
1119b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
1129b91271bSJeremy L Thompson   }
1139b91271bSJeremy L Thompson }
1149b91271bSJeremy L Thompson 
1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1179b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
1189b91271bSJeremy L Thompson 
1199b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120*d6c19ee8SJeremy L Thompson     __syncthreads();
1219b91271bSJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
1229b91271bSJeremy L Thompson     __syncthreads();
1239b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
1249b91271bSJeremy L Thompson   }
1259b91271bSJeremy L Thompson }
1269b91271bSJeremy L Thompson 
1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points
1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1329b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
1339b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1349b91271bSJeremy L Thompson   CeedScalar r_t[1];
1359b91271bSJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1379b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1389b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1399b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1409b91271bSJeremy L Thompson   }
141ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
142ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1439b91271bSJeremy L Thompson }
1449b91271bSJeremy L Thompson 
1459b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1469b91271bSJeremy L Thompson // 2D interpolate transpose
1479b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1489b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1499b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1509b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
1519b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1529b91271bSJeremy L Thompson   CeedScalar r_t[1];
1539b91271bSJeremy L Thompson 
154ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1559b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1569b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1579b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1589b91271bSJeremy L Thompson   }
159ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1609b91271bSJeremy L Thompson }
1619b91271bSJeremy L Thompson 
1629b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1639b91271bSJeremy L Thompson // 2D derivatives at quadrature points
1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1659b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1669b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
1679b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
1689b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1699b91271bSJeremy L Thompson   CeedScalar r_t[1];
1709b91271bSJeremy L Thompson 
171ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1729b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1739b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
1749b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
1759b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1769b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
1779b91271bSJeremy L Thompson   }
178ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
179ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1809b91271bSJeremy L Thompson }
1819b91271bSJeremy L Thompson 
1829b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1839b91271bSJeremy L Thompson // 2D derivatives transpose
1849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1859b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1869b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1879b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1889b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1899b91271bSJeremy L Thompson   CeedScalar r_t[1];
1909b91271bSJeremy L Thompson 
191ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1929b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1939b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
1949b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
1959b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
1969b91271bSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1979b91271bSJeremy L Thompson   }
198ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1999b91271bSJeremy L Thompson }
2009b91271bSJeremy L Thompson 
2019b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2029b91271bSJeremy L Thompson // 2D quadrature weights
2039b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2049b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
2059b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2069b91271bSJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
2079b91271bSJeremy L Thompson 
2089b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
2099b91271bSJeremy L Thompson }
2109b91271bSJeremy L Thompson 
2119b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2129b91271bSJeremy L Thompson // 3D
2139b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2149b91271bSJeremy L Thompson 
2159b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2169b91271bSJeremy L Thompson // 3D tensor contract x
2179b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2189b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2199b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2209b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
221*d6c19ee8SJeremy L Thompson   __syncthreads();
2229b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2239b91271bSJeremy L Thompson   __syncthreads();
2249b91271bSJeremy L Thompson   *V = 0.0;
2259b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
2269b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2279b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
2289b91271bSJeremy L Thompson     }
2299b91271bSJeremy L Thompson   }
2309b91271bSJeremy L Thompson }
2319b91271bSJeremy L Thompson 
2329b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2339b91271bSJeremy L Thompson // 3D tensor contract y
2349b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2359b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2369b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2379b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
238*d6c19ee8SJeremy L Thompson   __syncthreads();
2399b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2409b91271bSJeremy L Thompson   __syncthreads();
2419b91271bSJeremy L Thompson   *V = 0.0;
2429b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2439b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2449b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
2459b91271bSJeremy L Thompson     }
2469b91271bSJeremy L Thompson   }
2479b91271bSJeremy L Thompson }
2489b91271bSJeremy L Thompson 
2499b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2509b91271bSJeremy L Thompson // 3D tensor contract z
2519b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2529b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2539b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2549b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
255*d6c19ee8SJeremy L Thompson   __syncthreads();
2569b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2579b91271bSJeremy L Thompson   __syncthreads();
2589b91271bSJeremy L Thompson   *V = 0.0;
2599b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
2609b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2619b91271bSJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2629b91271bSJeremy L Thompson     }
2639b91271bSJeremy L Thompson   }
2649b91271bSJeremy L Thompson }
2659b91271bSJeremy L Thompson 
2669b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2679b91271bSJeremy L Thompson // 3D tensor contract z
2689b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2699b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2709b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2719b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
272*d6c19ee8SJeremy L Thompson   __syncthreads();
2739b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2749b91271bSJeremy L Thompson   __syncthreads();
2759b91271bSJeremy L Thompson   *V = 0.0;
2769b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2779b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
2789b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2799b91271bSJeremy L Thompson     }
2809b91271bSJeremy L Thompson   }
2819b91271bSJeremy L Thompson }
2829b91271bSJeremy L Thompson 
2839b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2849b91271bSJeremy L Thompson // 3D transpose tensor contract z
2859b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2869b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2879b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
2889b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289*d6c19ee8SJeremy L Thompson   __syncthreads();
2909b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2919b91271bSJeremy L Thompson   __syncthreads();
2929b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2939b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
2949b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2959b91271bSJeremy L Thompson     }
2969b91271bSJeremy L Thompson   }
2979b91271bSJeremy L Thompson }
2989b91271bSJeremy L Thompson 
2999b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3009b91271bSJeremy L Thompson // 3D transpose tensor contract y
3019b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3029b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3039b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3049b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
305*d6c19ee8SJeremy L Thompson   __syncthreads();
3069b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3079b91271bSJeremy L Thompson   __syncthreads();
3089b91271bSJeremy L Thompson   *V = 0.0;
3099b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3109b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3119b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3129b91271bSJeremy L Thompson     }
3139b91271bSJeremy L Thompson   }
3149b91271bSJeremy L Thompson }
3159b91271bSJeremy L Thompson 
3169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3179b91271bSJeremy L Thompson // 3D transpose tensor contract y
3189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3209b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3219b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322*d6c19ee8SJeremy L Thompson   __syncthreads();
3239b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3249b91271bSJeremy L Thompson   __syncthreads();
3259b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3269b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3279b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3289b91271bSJeremy L Thompson     }
3299b91271bSJeremy L Thompson   }
3309b91271bSJeremy L Thompson }
3319b91271bSJeremy L Thompson 
3329b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3339b91271bSJeremy L Thompson // 3D transpose tensor contract x
3349b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3359b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3369b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3379b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
338*d6c19ee8SJeremy L Thompson   __syncthreads();
3399b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3409b91271bSJeremy L Thompson   __syncthreads();
3419b91271bSJeremy L Thompson   *V = 0.0;
3429b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
3439b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3449b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
3459b91271bSJeremy L Thompson     }
3469b91271bSJeremy L Thompson   }
3479b91271bSJeremy L Thompson }
3489b91271bSJeremy L Thompson 
3499b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3509b91271bSJeremy L Thompson // 3D transpose tensor contract add x
3519b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3529b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3539b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3549b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355*d6c19ee8SJeremy L Thompson   __syncthreads();
3569b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3579b91271bSJeremy L Thompson   __syncthreads();
3589b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
3599b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3609b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
3619b91271bSJeremy L Thompson     }
3629b91271bSJeremy L Thompson   }
3639b91271bSJeremy L Thompson }
3649b91271bSJeremy L Thompson 
3659b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3669b91271bSJeremy L Thompson // 3D pack/unpack quadrature values
3679b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3689b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
3699b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
3709b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
3719b91271bSJeremy L Thompson 
3729b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373*d6c19ee8SJeremy L Thompson     __syncthreads();
3749b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
3759b91271bSJeremy L Thompson     __syncthreads();
3769b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
3779b91271bSJeremy L Thompson   }
3789b91271bSJeremy L Thompson }
3799b91271bSJeremy L Thompson 
3809b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
3819b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
3829b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
3839b91271bSJeremy L Thompson 
3849b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385*d6c19ee8SJeremy L Thompson     __syncthreads();
3869b91271bSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
3879b91271bSJeremy L Thompson     __syncthreads();
3889b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
3899b91271bSJeremy L Thompson   }
3909b91271bSJeremy L Thompson }
3919b91271bSJeremy L Thompson 
3929b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3939b91271bSJeremy L Thompson // 3D interpolate to quadrature points
3949b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3959b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3969b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
3979b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
3989b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
3999b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4009b91271bSJeremy L Thompson 
401ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4029b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4039b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4049b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4059b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4069b91271bSJeremy L Thompson   }
407ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
408ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4099b91271bSJeremy L Thompson }
4109b91271bSJeremy L Thompson 
4119b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4129b91271bSJeremy L Thompson // 3D interpolate transpose
4139b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4149b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4159b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4169b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
4179b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4189b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4199b91271bSJeremy L Thompson 
420ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4219b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4229b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4239b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4249b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4259b91271bSJeremy L Thompson   }
426ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4279b91271bSJeremy L Thompson }
4289b91271bSJeremy L Thompson 
4299b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4309b91271bSJeremy L Thompson // 3D derivatives at quadrature points
4319b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4329b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4339b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4349b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
4359b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4369b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4379b91271bSJeremy L Thompson 
438ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4399b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4409b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
4419b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4429b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
4439b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4449b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
4459b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
4469b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4479b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4489b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
4499b91271bSJeremy L Thompson   }
450ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
451ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4529b91271bSJeremy L Thompson }
4539b91271bSJeremy L Thompson 
4549b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4559b91271bSJeremy L Thompson // 3D derivatives transpose
4569b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4579b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4589b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4599b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
4609b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4619b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4629b91271bSJeremy L Thompson 
463ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4649b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4659b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
4669b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
4679b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
4689b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
4699b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
4709b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
4719b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
4729b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
4739b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
4749b91271bSJeremy L Thompson   }
475ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4769b91271bSJeremy L Thompson }
4779b91271bSJeremy L Thompson 
4789b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4799b91271bSJeremy L Thompson // 3D derivatives at quadrature points
4809b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4819b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4829b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4839b91271bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
4849b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4859b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4869b91271bSJeremy L Thompson 
487ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4889b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4899b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4909b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4919b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
4929b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
4939b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
4949b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
4959b91271bSJeremy L Thompson   }
496ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
497ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4989b91271bSJeremy L Thompson }
4999b91271bSJeremy L Thompson 
5009b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5019b91271bSJeremy L Thompson // 3D derivatives transpose
5029b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5039b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5049b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5059b91271bSJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5069b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5079b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
5089b91271bSJeremy L Thompson 
509ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5109b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5119b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
5129b91271bSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
5139b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
5149b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
5159b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5169b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
5179b91271bSJeremy L Thompson   }
518ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5199b91271bSJeremy L Thompson }
5209b91271bSJeremy L Thompson 
5219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5229b91271bSJeremy L Thompson // 3D quadrature weights
5239b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5249b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
5259b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
5269b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
5279b91271bSJeremy L Thompson 
5289b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
5299b91271bSJeremy L Thompson }
530