1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3c8e372f0SJeremy L Thompson // 4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5c8e372f0SJeremy L Thompson // 6c8e372f0SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7c8e372f0SJeremy L Thompson 8c8e372f0SJeremy L Thompson /// @file 9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates 10c8e372f0SJeremy L Thompson #include <ceed/types.h> 11c8e372f0SJeremy L Thompson 12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 13c8e372f0SJeremy L Thompson // 2D 14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 15c8e372f0SJeremy L Thompson 16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 17c8e372f0SJeremy L Thompson // 2D tensor contraction x 18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21c8e372f0SJeremy L Thompson CeedScalar *V) { 22*d6c19ee8SJeremy L Thompson __syncthreads(); 23c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 24c8e372f0SJeremy L Thompson __syncthreads(); 25c8e372f0SJeremy L Thompson *V = 0.0; 26c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 27c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 28c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 29c8e372f0SJeremy L Thompson } 30c8e372f0SJeremy L Thompson } 31c8e372f0SJeremy L Thompson } 32c8e372f0SJeremy L Thompson 33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 34c8e372f0SJeremy L Thompson // 2D tensor contract y 35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38c8e372f0SJeremy L Thompson CeedScalar *V) { 39*d6c19ee8SJeremy L Thompson __syncthreads(); 40c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 41c8e372f0SJeremy L Thompson __syncthreads(); 42c8e372f0SJeremy L Thompson *V = 0.0; 43c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 44c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 45c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 46c8e372f0SJeremy L Thompson } 47c8e372f0SJeremy L Thompson } 48c8e372f0SJeremy L Thompson } 49c8e372f0SJeremy L Thompson 50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y 52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56*d6c19ee8SJeremy L Thompson __syncthreads(); 57c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 58c8e372f0SJeremy L Thompson __syncthreads(); 59c8e372f0SJeremy L Thompson *V = 0.0; 60c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 61c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 62c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 63c8e372f0SJeremy L Thompson } 64c8e372f0SJeremy L Thompson } 65c8e372f0SJeremy L Thompson } 66c8e372f0SJeremy L Thompson 67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x 69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73*d6c19ee8SJeremy L Thompson __syncthreads(); 74c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 75c8e372f0SJeremy L Thompson __syncthreads(); 76c8e372f0SJeremy L Thompson *V = 0.0; 77c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 78c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 79c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 80c8e372f0SJeremy L Thompson } 81c8e372f0SJeremy L Thompson } 82c8e372f0SJeremy L Thompson } 83c8e372f0SJeremy L Thompson 84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x 86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90*d6c19ee8SJeremy L Thompson __syncthreads(); 91c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 92c8e372f0SJeremy L Thompson __syncthreads(); 93c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 94c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 95c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 96c8e372f0SJeremy L Thompson } 97c8e372f0SJeremy L Thompson } 98c8e372f0SJeremy L Thompson } 99c8e372f0SJeremy L Thompson 100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values 102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106c8e372f0SJeremy L Thompson 107c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108*d6c19ee8SJeremy L Thompson __syncthreads(); 109c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 110c8e372f0SJeremy L Thompson __syncthreads(); 111c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 112c8e372f0SJeremy L Thompson } 113c8e372f0SJeremy L Thompson } 114c8e372f0SJeremy L Thompson 115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118c8e372f0SJeremy L Thompson 119c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120*d6c19ee8SJeremy L Thompson __syncthreads(); 121c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 122c8e372f0SJeremy L Thompson __syncthreads(); 123c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 124c8e372f0SJeremy L Thompson } 125c8e372f0SJeremy L Thompson } 126c8e372f0SJeremy L Thompson 127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points 129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 133c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 135c8e372f0SJeremy L Thompson 136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140c8e372f0SJeremy L Thompson } 141ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 142ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 143c8e372f0SJeremy L Thompson } 144c8e372f0SJeremy L Thompson 145c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 146c8e372f0SJeremy L Thompson // 2D interpolate transpose 147c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 148c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 149c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 150c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 151c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 152c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 153c8e372f0SJeremy L Thompson 154ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 155c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 156c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 157c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 158c8e372f0SJeremy L Thompson } 159ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 160c8e372f0SJeremy L Thompson } 161c8e372f0SJeremy L Thompson 162c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 163c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points 164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 165c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 166c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 167c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 168c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 169c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 170c8e372f0SJeremy L Thompson 171ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 172c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 173c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 174c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 175c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 176c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 177c8e372f0SJeremy L Thompson } 178ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 179ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 180c8e372f0SJeremy L Thompson } 181c8e372f0SJeremy L Thompson 182c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 183c8e372f0SJeremy L Thompson // 2D derivatives transpose 184c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 185c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 187c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 188c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 189c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 190c8e372f0SJeremy L Thompson 191ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 192c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 193c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 194c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 195c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 196c8e372f0SJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 197c8e372f0SJeremy L Thompson } 198ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 199c8e372f0SJeremy L Thompson } 200c8e372f0SJeremy L Thompson 201c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 202c8e372f0SJeremy L Thompson // 2D quadrature weights 203c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 204c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 205c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 206c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 207c8e372f0SJeremy L Thompson 208c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 209c8e372f0SJeremy L Thompson } 210c8e372f0SJeremy L Thompson 211c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 212c8e372f0SJeremy L Thompson // 3D 213c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 214c8e372f0SJeremy L Thompson 215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 216c8e372f0SJeremy L Thompson // 3D tensor contract x 217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 218c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 219c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 220c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 221*d6c19ee8SJeremy L Thompson __syncthreads(); 222c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 223c8e372f0SJeremy L Thompson __syncthreads(); 224c8e372f0SJeremy L Thompson *V = 0.0; 225c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 226c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 227c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 228c8e372f0SJeremy L Thompson } 229c8e372f0SJeremy L Thompson } 230c8e372f0SJeremy L Thompson } 231c8e372f0SJeremy L Thompson 232c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 233c8e372f0SJeremy L Thompson // 3D tensor contract y 234c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 235c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 236c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 237c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 238*d6c19ee8SJeremy L Thompson __syncthreads(); 239c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 240c8e372f0SJeremy L Thompson __syncthreads(); 241c8e372f0SJeremy L Thompson *V = 0.0; 242c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 243c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 244c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 245c8e372f0SJeremy L Thompson } 246c8e372f0SJeremy L Thompson } 247c8e372f0SJeremy L Thompson } 248c8e372f0SJeremy L Thompson 249c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 250c8e372f0SJeremy L Thompson // 3D tensor contract z 251c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 252c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 253c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 254c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 255*d6c19ee8SJeremy L Thompson __syncthreads(); 256c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 257c8e372f0SJeremy L Thompson __syncthreads(); 258c8e372f0SJeremy L Thompson *V = 0.0; 259c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 260c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 261c8e372f0SJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 262c8e372f0SJeremy L Thompson } 263c8e372f0SJeremy L Thompson } 264c8e372f0SJeremy L Thompson } 265c8e372f0SJeremy L Thompson 266c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 267c8e372f0SJeremy L Thompson // 3D tensor contract z 268c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 269c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 270c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 271c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 272*d6c19ee8SJeremy L Thompson __syncthreads(); 273c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 274c8e372f0SJeremy L Thompson __syncthreads(); 275c8e372f0SJeremy L Thompson *V = 0.0; 276c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 277c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 278c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 279c8e372f0SJeremy L Thompson } 280c8e372f0SJeremy L Thompson } 281c8e372f0SJeremy L Thompson } 282c8e372f0SJeremy L Thompson 283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 284c8e372f0SJeremy L Thompson // 3D transpose tensor contract z 285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 286c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 287c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 288c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 289*d6c19ee8SJeremy L Thompson __syncthreads(); 290c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 291c8e372f0SJeremy L Thompson __syncthreads(); 292c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 293c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 294c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 295c8e372f0SJeremy L Thompson } 296c8e372f0SJeremy L Thompson } 297c8e372f0SJeremy L Thompson } 298c8e372f0SJeremy L Thompson 299c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 300c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 301c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 302c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 303c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 304c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 305*d6c19ee8SJeremy L Thompson __syncthreads(); 306c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 307c8e372f0SJeremy L Thompson __syncthreads(); 308c8e372f0SJeremy L Thompson *V = 0.0; 309c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 310c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 311c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 312c8e372f0SJeremy L Thompson } 313c8e372f0SJeremy L Thompson } 314c8e372f0SJeremy L Thompson } 315c8e372f0SJeremy L Thompson 316c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 317c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 318c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 319c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 320c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 321c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 322*d6c19ee8SJeremy L Thompson __syncthreads(); 323c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 324c8e372f0SJeremy L Thompson __syncthreads(); 325c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 326c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 327c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 328c8e372f0SJeremy L Thompson } 329c8e372f0SJeremy L Thompson } 330c8e372f0SJeremy L Thompson } 331c8e372f0SJeremy L Thompson 332c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 333c8e372f0SJeremy L Thompson // 3D transpose tensor contract x 334c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 335c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 336c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 337c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 338*d6c19ee8SJeremy L Thompson __syncthreads(); 339c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 340c8e372f0SJeremy L Thompson __syncthreads(); 341c8e372f0SJeremy L Thompson *V = 0.0; 342c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 343c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 344c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 345c8e372f0SJeremy L Thompson } 346c8e372f0SJeremy L Thompson } 347c8e372f0SJeremy L Thompson } 348c8e372f0SJeremy L Thompson 349c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 350c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x 351c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 352c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 353c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 354c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 355*d6c19ee8SJeremy L Thompson __syncthreads(); 356c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 357c8e372f0SJeremy L Thompson __syncthreads(); 358c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 359c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 360c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 361c8e372f0SJeremy L Thompson } 362c8e372f0SJeremy L Thompson } 363c8e372f0SJeremy L Thompson } 364c8e372f0SJeremy L Thompson 365c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 366c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values 367c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 368c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 369c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 370259057edSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 371c8e372f0SJeremy L Thompson 372c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 373*d6c19ee8SJeremy L Thompson __syncthreads(); 374c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 375c8e372f0SJeremy L Thompson __syncthreads(); 376259057edSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 377c8e372f0SJeremy L Thompson } 378c8e372f0SJeremy L Thompson } 379c8e372f0SJeremy L Thompson 380c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 381c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 382259057edSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 383c8e372f0SJeremy L Thompson 384c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 385*d6c19ee8SJeremy L Thompson __syncthreads(); 386259057edSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 387c8e372f0SJeremy L Thompson __syncthreads(); 388c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 389c8e372f0SJeremy L Thompson } 390c8e372f0SJeremy L Thompson } 391c8e372f0SJeremy L Thompson 392c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 393c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points 394c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 395c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 396c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 397c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 398259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 399c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 400c8e372f0SJeremy L Thompson 401ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 402c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 403c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 404c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 405c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 406c8e372f0SJeremy L Thompson } 407ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 408ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 409c8e372f0SJeremy L Thompson } 410c8e372f0SJeremy L Thompson 411c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 412c8e372f0SJeremy L Thompson // 3D interpolate transpose 413c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 414c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 415c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 416c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 417259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 418c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 419c8e372f0SJeremy L Thompson 420ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 421c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 422c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 423c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 424c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 425c8e372f0SJeremy L Thompson } 426ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 427c8e372f0SJeremy L Thompson } 428c8e372f0SJeremy L Thompson 429c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 430c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 431c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 432c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 433c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 434c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 435259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 436c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 437c8e372f0SJeremy L Thompson 438ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 439c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 440c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 441c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 442c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 443c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 444c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 445c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 446c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 447c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 448c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 449c8e372f0SJeremy L Thompson } 450ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 451ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 452c8e372f0SJeremy L Thompson } 453c8e372f0SJeremy L Thompson 454c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 455c8e372f0SJeremy L Thompson // 3D derivatives transpose 456c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 457c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 458c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 459c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 460259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 461c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 462c8e372f0SJeremy L Thompson 463ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 464c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 465c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 466c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 467c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 468c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 469c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 470c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 471c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 472c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 473c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 474c8e372f0SJeremy L Thompson } 475ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 476c8e372f0SJeremy L Thompson } 477c8e372f0SJeremy L Thompson 478c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 479c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 480c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 481c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 482c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 483c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 484259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 485c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 486c8e372f0SJeremy L Thompson 487ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 488c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 489c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 490c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 491c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 492c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 493c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 494c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 495c8e372f0SJeremy L Thompson } 496ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 497ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 498c8e372f0SJeremy L Thompson } 499c8e372f0SJeremy L Thompson 500c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 501c8e372f0SJeremy L Thompson // 3D derivatives transpose 502c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 503c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 504c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 505c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 506259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 507c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 508c8e372f0SJeremy L Thompson 509ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 510c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 511c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 512c8e372f0SJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 513c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 514c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 515c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 516c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 517c8e372f0SJeremy L Thompson } 518ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 519c8e372f0SJeremy L Thompson } 520c8e372f0SJeremy L Thompson 521c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 522c8e372f0SJeremy L Thompson // 3D quadrature weights 523c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 524c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 525c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 526259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 527c8e372f0SJeremy L Thompson 528c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 529c8e372f0SJeremy L Thompson } 530