1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3c8e372f0SJeremy L Thompson // 4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5c8e372f0SJeremy L Thompson // 6c8e372f0SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7c8e372f0SJeremy L Thompson 8c8e372f0SJeremy L Thompson /// @file 9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates 10c8e372f0SJeremy L Thompson #include <ceed/types.h> 11c8e372f0SJeremy L Thompson 12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 13c8e372f0SJeremy L Thompson // 2D 14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 15c8e372f0SJeremy L Thompson 16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 17c8e372f0SJeremy L Thompson // 2D tensor contraction x 18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21c8e372f0SJeremy L Thompson CeedScalar *V) { 22c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 23c8e372f0SJeremy L Thompson __syncthreads(); 24c8e372f0SJeremy L Thompson *V = 0.0; 25c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 26c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 27c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 28c8e372f0SJeremy L Thompson } 29c8e372f0SJeremy L Thompson } 30c8e372f0SJeremy L Thompson __syncthreads(); 31c8e372f0SJeremy L Thompson } 32c8e372f0SJeremy L Thompson 33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 34c8e372f0SJeremy L Thompson // 2D tensor contract y 35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38c8e372f0SJeremy L Thompson CeedScalar *V) { 39c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 40c8e372f0SJeremy L Thompson __syncthreads(); 41c8e372f0SJeremy L Thompson *V = 0.0; 42c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 43c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 44c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 45c8e372f0SJeremy L Thompson } 46c8e372f0SJeremy L Thompson } 47c8e372f0SJeremy L Thompson __syncthreads(); 48c8e372f0SJeremy L Thompson } 49c8e372f0SJeremy L Thompson 50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y 52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 57c8e372f0SJeremy L Thompson __syncthreads(); 58c8e372f0SJeremy L Thompson *V = 0.0; 59c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 60c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 61c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 62c8e372f0SJeremy L Thompson } 63c8e372f0SJeremy L Thompson } 64c8e372f0SJeremy L Thompson __syncthreads(); 65c8e372f0SJeremy L Thompson } 66c8e372f0SJeremy L Thompson 67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x 69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 74c8e372f0SJeremy L Thompson __syncthreads(); 75c8e372f0SJeremy L Thompson *V = 0.0; 76c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 77c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 78c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 79c8e372f0SJeremy L Thompson } 80c8e372f0SJeremy L Thompson } 81c8e372f0SJeremy L Thompson __syncthreads(); 82c8e372f0SJeremy L Thompson } 83c8e372f0SJeremy L Thompson 84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x 86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 91c8e372f0SJeremy L Thompson __syncthreads(); 92c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 93c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 94c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 95c8e372f0SJeremy L Thompson } 96c8e372f0SJeremy L Thompson } 97c8e372f0SJeremy L Thompson __syncthreads(); 98c8e372f0SJeremy L Thompson } 99c8e372f0SJeremy L Thompson 100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values 102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106c8e372f0SJeremy L Thompson 107c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 109c8e372f0SJeremy L Thompson __syncthreads(); 110c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 111c8e372f0SJeremy L Thompson __syncthreads(); 112c8e372f0SJeremy L Thompson } 113c8e372f0SJeremy L Thompson } 114c8e372f0SJeremy L Thompson 115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118c8e372f0SJeremy L Thompson 119c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 121c8e372f0SJeremy L Thompson __syncthreads(); 122c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 123c8e372f0SJeremy L Thompson __syncthreads(); 124c8e372f0SJeremy L Thompson } 125c8e372f0SJeremy L Thompson } 126c8e372f0SJeremy L Thompson 127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points 129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 133c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 135c8e372f0SJeremy L Thompson 136c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140c8e372f0SJeremy L Thompson } 141*259057edSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 142c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 143c8e372f0SJeremy L Thompson } 144c8e372f0SJeremy L Thompson 145c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 146c8e372f0SJeremy L Thompson // 2D interpolate transpose 147c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 148c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 149c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 150c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 151c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 152c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 153c8e372f0SJeremy L Thompson 154c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 155c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 156c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 157c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 158c8e372f0SJeremy L Thompson } 159c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 160c8e372f0SJeremy L Thompson } 161c8e372f0SJeremy L Thompson 162c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 163c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points 164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 165c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 166c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 167c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 168c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 169c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 170c8e372f0SJeremy L Thompson 171c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 172c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 173c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 174c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 175c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 176c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 177c8e372f0SJeremy L Thompson } 178*259057edSJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 179c8e372f0SJeremy L Thompson QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 180c8e372f0SJeremy L Thompson } 181c8e372f0SJeremy L Thompson 182c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 183c8e372f0SJeremy L Thompson // 2D derivatives transpose 184c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 185c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 187c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 188c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 189c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 190c8e372f0SJeremy L Thompson 191c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 192c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 193c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 194c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 195c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 196c8e372f0SJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 197c8e372f0SJeremy L Thompson } 198c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 199c8e372f0SJeremy L Thompson } 200c8e372f0SJeremy L Thompson 201c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 202c8e372f0SJeremy L Thompson // 2D quadrature weights 203c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 204c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 205c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 206c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 207c8e372f0SJeremy L Thompson 208c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 209c8e372f0SJeremy L Thompson } 210c8e372f0SJeremy L Thompson 211c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 212c8e372f0SJeremy L Thompson // 3D 213c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 214c8e372f0SJeremy L Thompson 215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 216c8e372f0SJeremy L Thompson // 3D tensor contract x 217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 218c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 219c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 220c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 221c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 222c8e372f0SJeremy L Thompson __syncthreads(); 223c8e372f0SJeremy L Thompson *V = 0.0; 224c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 225c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 226c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 227c8e372f0SJeremy L Thompson } 228c8e372f0SJeremy L Thompson } 229c8e372f0SJeremy L Thompson __syncthreads(); 230c8e372f0SJeremy L Thompson } 231c8e372f0SJeremy L Thompson 232c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 233c8e372f0SJeremy L Thompson // 3D tensor contract y 234c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 235c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 236c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 237c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 238c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 239c8e372f0SJeremy L Thompson __syncthreads(); 240c8e372f0SJeremy L Thompson *V = 0.0; 241c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 242c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 243c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 244c8e372f0SJeremy L Thompson } 245c8e372f0SJeremy L Thompson } 246c8e372f0SJeremy L Thompson __syncthreads(); 247c8e372f0SJeremy L Thompson } 248c8e372f0SJeremy L Thompson 249c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 250c8e372f0SJeremy L Thompson // 3D tensor contract z 251c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 252c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 253c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 254c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 255c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 256c8e372f0SJeremy L Thompson __syncthreads(); 257c8e372f0SJeremy L Thompson *V = 0.0; 258c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 259c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 260c8e372f0SJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 261c8e372f0SJeremy L Thompson } 262c8e372f0SJeremy L Thompson } 263c8e372f0SJeremy L Thompson __syncthreads(); 264c8e372f0SJeremy L Thompson } 265c8e372f0SJeremy L Thompson 266c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 267c8e372f0SJeremy L Thompson // 3D tensor contract z 268c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 269c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 270c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 271c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 272c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 273c8e372f0SJeremy L Thompson __syncthreads(); 274c8e372f0SJeremy L Thompson *V = 0.0; 275c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 276c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 277c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 278c8e372f0SJeremy L Thompson } 279c8e372f0SJeremy L Thompson } 280c8e372f0SJeremy L Thompson __syncthreads(); 281c8e372f0SJeremy L Thompson } 282c8e372f0SJeremy L Thompson 283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 284c8e372f0SJeremy L Thompson // 3D transpose tensor contract z 285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 286c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 287c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 288c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 289c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 290c8e372f0SJeremy L Thompson __syncthreads(); 291c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 292c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 293c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 294c8e372f0SJeremy L Thompson } 295c8e372f0SJeremy L Thompson } 296c8e372f0SJeremy L Thompson __syncthreads(); 297c8e372f0SJeremy L Thompson } 298c8e372f0SJeremy L Thompson 299c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 300c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 301c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 302c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 303c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 304c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 305c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 306c8e372f0SJeremy L Thompson __syncthreads(); 307c8e372f0SJeremy L Thompson *V = 0.0; 308c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 309c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 310c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 311c8e372f0SJeremy L Thompson } 312c8e372f0SJeremy L Thompson } 313c8e372f0SJeremy L Thompson __syncthreads(); 314c8e372f0SJeremy L Thompson } 315c8e372f0SJeremy L Thompson 316c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 317c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 318c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 319c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 320c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 321c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 322c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 323c8e372f0SJeremy L Thompson __syncthreads(); 324c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 325c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 326c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 327c8e372f0SJeremy L Thompson } 328c8e372f0SJeremy L Thompson } 329c8e372f0SJeremy L Thompson __syncthreads(); 330c8e372f0SJeremy L Thompson } 331c8e372f0SJeremy L Thompson 332c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 333c8e372f0SJeremy L Thompson // 3D transpose tensor contract x 334c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 335c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 336c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 337c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 338c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 339c8e372f0SJeremy L Thompson __syncthreads(); 340c8e372f0SJeremy L Thompson *V = 0.0; 341c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 342c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 343c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 344c8e372f0SJeremy L Thompson } 345c8e372f0SJeremy L Thompson } 346c8e372f0SJeremy L Thompson __syncthreads(); 347c8e372f0SJeremy L Thompson } 348c8e372f0SJeremy L Thompson 349c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 350c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x 351c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 352c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 353c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 354c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 355c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 356c8e372f0SJeremy L Thompson __syncthreads(); 357c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 358c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 359c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 360c8e372f0SJeremy L Thompson } 361c8e372f0SJeremy L Thompson } 362c8e372f0SJeremy L Thompson __syncthreads(); 363c8e372f0SJeremy L Thompson } 364c8e372f0SJeremy L Thompson 365c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 366c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values 367c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 368c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 369c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 370*259057edSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 371c8e372f0SJeremy L Thompson 372c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 373c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 374c8e372f0SJeremy L Thompson __syncthreads(); 375*259057edSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 376c8e372f0SJeremy L Thompson __syncthreads(); 377c8e372f0SJeremy L Thompson } 378c8e372f0SJeremy L Thompson } 379c8e372f0SJeremy L Thompson 380c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 381c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 382*259057edSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 383c8e372f0SJeremy L Thompson 384c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 385*259057edSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 386c8e372f0SJeremy L Thompson __syncthreads(); 387c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 388c8e372f0SJeremy L Thompson __syncthreads(); 389c8e372f0SJeremy L Thompson } 390c8e372f0SJeremy L Thompson } 391c8e372f0SJeremy L Thompson 392c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 393c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points 394c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 395c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 396c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 397c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 398*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 399c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 400c8e372f0SJeremy L Thompson 401c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 402c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 403c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 404c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 405c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 406c8e372f0SJeremy L Thompson } 407*259057edSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 408c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 409c8e372f0SJeremy L Thompson } 410c8e372f0SJeremy L Thompson 411c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 412c8e372f0SJeremy L Thompson // 3D interpolate transpose 413c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 414c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 415c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 416c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 417*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 418c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 419c8e372f0SJeremy L Thompson 420c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 421c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 422c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 423c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 424c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 425c8e372f0SJeremy L Thompson } 426c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 427c8e372f0SJeremy L Thompson } 428c8e372f0SJeremy L Thompson 429c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 430c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 431c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 432c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 433c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 434c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 435*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 436c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 437c8e372f0SJeremy L Thompson 438c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 439c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 440c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 441c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 442c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 443c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 444c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 445c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 446c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 447c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 448c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 449c8e372f0SJeremy L Thompson } 450*259057edSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 451c8e372f0SJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 452c8e372f0SJeremy L Thompson } 453c8e372f0SJeremy L Thompson 454c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 455c8e372f0SJeremy L Thompson // 3D derivatives transpose 456c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 457c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 458c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 459c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 460*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 461c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 462c8e372f0SJeremy L Thompson 463c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 464c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 465c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 466c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 467c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 468c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 469c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 470c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 471c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 472c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 473c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 474c8e372f0SJeremy L Thompson } 475c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 476c8e372f0SJeremy L Thompson } 477c8e372f0SJeremy L Thompson 478c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 479c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 480c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 481c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 482c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 483c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 484*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 485c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 486c8e372f0SJeremy L Thompson 487c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 488c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 489c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 490c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 491c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 492c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 493c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 494c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 495c8e372f0SJeremy L Thompson } 496*259057edSJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 497c8e372f0SJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 498c8e372f0SJeremy L Thompson } 499c8e372f0SJeremy L Thompson 500c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 501c8e372f0SJeremy L Thompson // 3D derivatives transpose 502c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 503c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 504c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 505c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 506*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 507c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 508c8e372f0SJeremy L Thompson 509c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 510c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 511c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 512c8e372f0SJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 513c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 514c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 515c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 516c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 517c8e372f0SJeremy L Thompson } 518c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 519c8e372f0SJeremy L Thompson } 520c8e372f0SJeremy L Thompson 521c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 522c8e372f0SJeremy L Thompson // 3D quadrature weights 523c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 524c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 525c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 526*259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 527c8e372f0SJeremy L Thompson 528c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 529c8e372f0SJeremy L Thompson } 530