1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2bd882c8aSJames Wright // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3bd882c8aSJames Wright // 4bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause 5bd882c8aSJames Wright // 6bd882c8aSJames Wright // This file is part of CEED: http://github.com/ceed 7bd882c8aSJames Wright 8bd882c8aSJames Wright /// @file 9bd882c8aSJames Wright /// Internal header for SYCL shared memory tensor product basis templates 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 11bd882c8aSJames Wright 12bd882c8aSJames Wright //------------------------------------------------------------------------------ 13bd882c8aSJames Wright // 1D 14bd882c8aSJames Wright //------------------------------------------------------------------------------ 15bd882c8aSJames Wright 16bd882c8aSJames Wright //------------------------------------------------------------------------------ 17bd882c8aSJames Wright // 1D tensor contraction x 18bd882c8aSJames Wright //------------------------------------------------------------------------------ 19bd882c8aSJames Wright inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 20bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 21bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 22bd882c8aSJames Wright 23bd882c8aSJames Wright scratch[item_id_x] = *U; 24bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 25bd882c8aSJames Wright 26bd882c8aSJames Wright *V = 0.0; 27bd882c8aSJames Wright if (item_id_x < Q_1D) { 28bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 29bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 30bd882c8aSJames Wright } 31bd882c8aSJames Wright } 32bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 33bd882c8aSJames Wright } 34bd882c8aSJames Wright 35bd882c8aSJames Wright //------------------------------------------------------------------------------ 36bd882c8aSJames Wright // 1D transpose tensor contraction x 37bd882c8aSJames Wright //------------------------------------------------------------------------------ 38bd882c8aSJames Wright inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 39bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 40bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 41bd882c8aSJames Wright 42bd882c8aSJames Wright scratch[item_id_x] = *U; 43bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 44bd882c8aSJames Wright 45bd882c8aSJames Wright *V = 0.0; 46bd882c8aSJames Wright if (item_id_x < P_1D) { 47bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 48bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 49bd882c8aSJames Wright } 50bd882c8aSJames Wright } 51bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 52bd882c8aSJames Wright } 53bd882c8aSJames Wright 54bd882c8aSJames Wright //------------------------------------------------------------------------------ 55bd882c8aSJames Wright // 1D interpolate to quadrature points 56bd882c8aSJames Wright //------------------------------------------------------------------------------ 57bd882c8aSJames Wright inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 58bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 59bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 60bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 61bd882c8aSJames Wright } 62bd882c8aSJames Wright } 63bd882c8aSJames Wright 64bd882c8aSJames Wright //------------------------------------------------------------------------------ 65bd882c8aSJames Wright // 1D interpolate transpose 66bd882c8aSJames Wright //------------------------------------------------------------------------------ 67bd882c8aSJames Wright inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 68bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 69bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 70bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 71bd882c8aSJames Wright } 72bd882c8aSJames Wright } 73bd882c8aSJames Wright 74bd882c8aSJames Wright //------------------------------------------------------------------------------ 75bd882c8aSJames Wright // 1D derivatives at quadrature points 76bd882c8aSJames Wright //------------------------------------------------------------------------------ 77bd882c8aSJames Wright inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 78bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 79bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 80bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 81bd882c8aSJames Wright } 82bd882c8aSJames Wright } 83bd882c8aSJames Wright 84bd882c8aSJames Wright //------------------------------------------------------------------------------ 85bd882c8aSJames Wright // 1D derivatives transpose 86bd882c8aSJames Wright //------------------------------------------------------------------------------ 87bd882c8aSJames Wright inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 88bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 89bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 90bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 91bd882c8aSJames Wright } 92bd882c8aSJames Wright } 93bd882c8aSJames Wright 94bd882c8aSJames Wright //------------------------------------------------------------------------------ 95bd882c8aSJames Wright // 1D quadrature weights 96bd882c8aSJames Wright //------------------------------------------------------------------------------ 97bd882c8aSJames Wright inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 98bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 99bd882c8aSJames Wright *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 100bd882c8aSJames Wright } 101bd882c8aSJames Wright 102bd882c8aSJames Wright //------------------------------------------------------------------------------ 103bd882c8aSJames Wright // 2D 104bd882c8aSJames Wright //------------------------------------------------------------------------------ 105bd882c8aSJames Wright 106bd882c8aSJames Wright //------------------------------------------------------------------------------ 107bd882c8aSJames Wright // 2D tensor contraction x 108bd882c8aSJames Wright //------------------------------------------------------------------------------ 109bd882c8aSJames Wright inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 110bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 111bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 112bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 113bd882c8aSJames Wright 114bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 115bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 116bd882c8aSJames Wright 117bd882c8aSJames Wright *V = 0.0; 118bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 119bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 120bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 121bd882c8aSJames Wright } 122bd882c8aSJames Wright } 123bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 124bd882c8aSJames Wright } 125bd882c8aSJames Wright 126bd882c8aSJames Wright //------------------------------------------------------------------------------ 127bd882c8aSJames Wright // 2D tensor contract y 128bd882c8aSJames Wright //------------------------------------------------------------------------------ 129bd882c8aSJames Wright inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 130bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 131bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 132bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 133bd882c8aSJames Wright 134bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 135bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 136bd882c8aSJames Wright 137bd882c8aSJames Wright *V = 0.0; 138bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 139bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 140bd882c8aSJames Wright *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 141bd882c8aSJames Wright } 142bd882c8aSJames Wright } 143bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 144bd882c8aSJames Wright } 145bd882c8aSJames Wright 146bd882c8aSJames Wright //------------------------------------------------------------------------------ 147bd882c8aSJames Wright // 2D transpose tensor contract y 148bd882c8aSJames Wright //------------------------------------------------------------------------------ 149bd882c8aSJames Wright inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 150bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 151bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 152bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 153bd882c8aSJames Wright 154bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 155bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 156bd882c8aSJames Wright 157bd882c8aSJames Wright *V = 0.0; 158bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 159bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 160bd882c8aSJames Wright *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 161bd882c8aSJames Wright } 162bd882c8aSJames Wright } 163bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 164bd882c8aSJames Wright } 165bd882c8aSJames Wright 166bd882c8aSJames Wright //------------------------------------------------------------------------------ 167bd882c8aSJames Wright // 2D transpose tensor contract x 168bd882c8aSJames Wright //------------------------------------------------------------------------------ 169bd882c8aSJames Wright inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 170bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 171bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 172bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 173bd882c8aSJames Wright 174bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 175bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 176bd882c8aSJames Wright 177bd882c8aSJames Wright *V = 0.0; 178bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 179bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 180bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 181bd882c8aSJames Wright } 182bd882c8aSJames Wright } 183bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 184bd882c8aSJames Wright } 185bd882c8aSJames Wright 186bd882c8aSJames Wright //------------------------------------------------------------------------------ 187bd882c8aSJames Wright // 2D transpose tensor contract and add x 188bd882c8aSJames Wright //------------------------------------------------------------------------------ 189bd882c8aSJames Wright inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 190bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 191bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 192bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 193bd882c8aSJames Wright 194bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 195bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 196bd882c8aSJames Wright 197bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 198bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 199bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 200bd882c8aSJames Wright } 201bd882c8aSJames Wright } 202bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 203bd882c8aSJames Wright } 204bd882c8aSJames Wright 205bd882c8aSJames Wright //------------------------------------------------------------------------------ 206bd882c8aSJames Wright // 2D interpolate to quadrature points 207bd882c8aSJames Wright //------------------------------------------------------------------------------ 208bd882c8aSJames Wright inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 209bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 210bd882c8aSJames Wright CeedScalar r_t[1]; 211bd882c8aSJames Wright 212bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 213bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 214bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 215bd882c8aSJames Wright } 216bd882c8aSJames Wright } 217bd882c8aSJames Wright 218bd882c8aSJames Wright //------------------------------------------------------------------------------ 219bd882c8aSJames Wright // 2D interpolate transpose 220bd882c8aSJames Wright //------------------------------------------------------------------------------ 221bd882c8aSJames Wright inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 222bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 223bd882c8aSJames Wright CeedScalar r_t[1]; 224bd882c8aSJames Wright 225bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 226bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 227bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 228bd882c8aSJames Wright } 229bd882c8aSJames Wright } 230bd882c8aSJames Wright 231bd882c8aSJames Wright //------------------------------------------------------------------------------ 232bd882c8aSJames Wright // 2D derivatives at quadrature points 233bd882c8aSJames Wright //------------------------------------------------------------------------------ 234bd882c8aSJames Wright inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 235bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 236bd882c8aSJames Wright local CeedScalar *restrict scratch) { 237bd882c8aSJames Wright CeedScalar r_t[1]; 238bd882c8aSJames Wright 239bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 240bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 241bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 242bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 243bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 244bd882c8aSJames Wright } 245bd882c8aSJames Wright } 246bd882c8aSJames Wright 247bd882c8aSJames Wright //------------------------------------------------------------------------------ 248bd882c8aSJames Wright // 2D derivatives transpose 249bd882c8aSJames Wright //------------------------------------------------------------------------------ 250bd882c8aSJames Wright inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 251bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 252bd882c8aSJames Wright local CeedScalar *restrict scratch) { 253bd882c8aSJames Wright CeedScalar r_t[1]; 254bd882c8aSJames Wright 255bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 256bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 257bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 258bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 259bd882c8aSJames Wright ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 260bd882c8aSJames Wright } 261bd882c8aSJames Wright } 262bd882c8aSJames Wright 263bd882c8aSJames Wright //------------------------------------------------------------------------------ 264bd882c8aSJames Wright // 2D quadrature weights 265bd882c8aSJames Wright //------------------------------------------------------------------------------ 266bd882c8aSJames Wright inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 267bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 268bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 269bd882c8aSJames Wright 270bd882c8aSJames Wright *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 271bd882c8aSJames Wright } 272bd882c8aSJames Wright 273bd882c8aSJames Wright //------------------------------------------------------------------------------ 274bd882c8aSJames Wright // 3D 275bd882c8aSJames Wright //------------------------------------------------------------------------------ 276bd882c8aSJames Wright 277bd882c8aSJames Wright //------------------------------------------------------------------------------ 278bd882c8aSJames Wright // 3D tensor contract x 279bd882c8aSJames Wright //------------------------------------------------------------------------------ 280bd882c8aSJames Wright inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 281bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 282bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 283bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 284bd882c8aSJames Wright 285bd882c8aSJames Wright CeedScalar r_B[T_1D]; 286bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 287bd882c8aSJames Wright r_B[i] = B[i + item_id_x * P_1D]; 288bd882c8aSJames Wright } 289bd882c8aSJames Wright 290bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 291bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 292bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 293bd882c8aSJames Wright 294bd882c8aSJames Wright V[k] = 0.0; 295bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 296bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 297bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 298bd882c8aSJames Wright } 299bd882c8aSJames Wright } 300bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 301bd882c8aSJames Wright } 302bd882c8aSJames Wright } 303bd882c8aSJames Wright 304bd882c8aSJames Wright //------------------------------------------------------------------------------ 305bd882c8aSJames Wright // 3D tensor contract y 306bd882c8aSJames Wright //------------------------------------------------------------------------------ 307bd882c8aSJames Wright inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 308bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 309bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 310bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 311bd882c8aSJames Wright 312bd882c8aSJames Wright CeedScalar r_B[T_1D]; 313bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 314bd882c8aSJames Wright r_B[i] = B[i + item_id_y * P_1D]; 315bd882c8aSJames Wright } 316bd882c8aSJames Wright 317bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 318bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 319bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 320bd882c8aSJames Wright 321bd882c8aSJames Wright V[k] = 0.0; 322bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 323bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 324bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 325bd882c8aSJames Wright } 326bd882c8aSJames Wright } 327bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 328bd882c8aSJames Wright } 329bd882c8aSJames Wright } 330bd882c8aSJames Wright 331bd882c8aSJames Wright //------------------------------------------------------------------------------ 332bd882c8aSJames Wright // 3D tensor contract z 333bd882c8aSJames Wright //------------------------------------------------------------------------------ 334bd882c8aSJames Wright inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 335bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 336bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 337bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 338bd882c8aSJames Wright 339bd882c8aSJames Wright for (CeedInt k = 0; k < Q_1D; k++) { 340bd882c8aSJames Wright V[k] = 0.0; 341bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 342bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 343bd882c8aSJames Wright V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 344bd882c8aSJames Wright } 345bd882c8aSJames Wright } 346bd882c8aSJames Wright } 347bd882c8aSJames Wright } 348bd882c8aSJames Wright 349bd882c8aSJames Wright //------------------------------------------------------------------------------ 350bd882c8aSJames Wright // 3D transpose tensor contract z 351bd882c8aSJames Wright //------------------------------------------------------------------------------ 352bd882c8aSJames Wright inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 353bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 354bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 355bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 356bd882c8aSJames Wright 357bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 358bd882c8aSJames Wright V[k] = 0.0; 359bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 360bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 361bd882c8aSJames Wright V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 362bd882c8aSJames Wright } 363bd882c8aSJames Wright } 364bd882c8aSJames Wright } 365bd882c8aSJames Wright } 366bd882c8aSJames Wright 367bd882c8aSJames Wright //------------------------------------------------------------------------------ 368bd882c8aSJames Wright // 3D transpose tensor contract y 369bd882c8aSJames Wright //------------------------------------------------------------------------------ 370bd882c8aSJames Wright inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 371bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 372bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 373bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 374bd882c8aSJames Wright 375bd882c8aSJames Wright CeedScalar r_B[T_1D]; 376bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 377bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 378bd882c8aSJames Wright } 379bd882c8aSJames Wright 380bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 381bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 382bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 383bd882c8aSJames Wright 384bd882c8aSJames Wright V[k] = 0.0; 385bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 386bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 387bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 388bd882c8aSJames Wright } 389bd882c8aSJames Wright } 390bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 391bd882c8aSJames Wright } 392bd882c8aSJames Wright } 393bd882c8aSJames Wright 394bd882c8aSJames Wright //------------------------------------------------------------------------------ 395bd882c8aSJames Wright // 3D transpose tensor contract y 396bd882c8aSJames Wright //------------------------------------------------------------------------------ 397bd882c8aSJames Wright inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 398bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 399bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 400bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 401bd882c8aSJames Wright 402bd882c8aSJames Wright CeedScalar r_B[T_1D]; 403bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 404bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 405bd882c8aSJames Wright } 406bd882c8aSJames Wright 407bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 408bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 409bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 410bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 411bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 412bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 413bd882c8aSJames Wright } 414bd882c8aSJames Wright } 415bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 416bd882c8aSJames Wright } 417bd882c8aSJames Wright } 418bd882c8aSJames Wright 419bd882c8aSJames Wright //------------------------------------------------------------------------------ 420bd882c8aSJames Wright // 3D transpose tensor contract x 421bd882c8aSJames Wright //------------------------------------------------------------------------------ 422bd882c8aSJames Wright inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 423bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 424bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 425bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 426bd882c8aSJames Wright 427bd882c8aSJames Wright CeedScalar r_B[T_1D]; 428bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 429bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 430bd882c8aSJames Wright } 431bd882c8aSJames Wright 432bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 433bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 434bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 435bd882c8aSJames Wright V[k] = 0.0; 436bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 437bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 438bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 439bd882c8aSJames Wright } 440bd882c8aSJames Wright } 441bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 442bd882c8aSJames Wright } 443bd882c8aSJames Wright } 444bd882c8aSJames Wright 445bd882c8aSJames Wright //------------------------------------------------------------------------------ 446bd882c8aSJames Wright // 3D transpose tensor contract add x 447bd882c8aSJames Wright //------------------------------------------------------------------------------ 448bd882c8aSJames Wright inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 449bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 450bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 451bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 452bd882c8aSJames Wright 453bd882c8aSJames Wright CeedScalar r_B[T_1D]; 454bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 455bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 456bd882c8aSJames Wright } 457bd882c8aSJames Wright 458bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 459bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 460bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 461bd882c8aSJames Wright 462bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 463bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 464bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 465bd882c8aSJames Wright } 466bd882c8aSJames Wright } 467bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 468bd882c8aSJames Wright } 469bd882c8aSJames Wright } 470bd882c8aSJames Wright 471bd882c8aSJames Wright //------------------------------------------------------------------------------ 472bd882c8aSJames Wright // 3D interpolate to quadrature points 473bd882c8aSJames Wright //------------------------------------------------------------------------------ 474bd882c8aSJames Wright inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 475bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 476bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 477bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 478bd882c8aSJames Wright 479bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 480bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 481bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 482bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 483bd882c8aSJames Wright } 484bd882c8aSJames Wright } 485bd882c8aSJames Wright 486bd882c8aSJames Wright //------------------------------------------------------------------------------ 487bd882c8aSJames Wright // 3D interpolate transpose 488bd882c8aSJames Wright //------------------------------------------------------------------------------ 489bd882c8aSJames Wright inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 490bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 491bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 492bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 493bd882c8aSJames Wright 494bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 495bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 496bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 497bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 498bd882c8aSJames Wright } 499bd882c8aSJames Wright } 500bd882c8aSJames Wright 501bd882c8aSJames Wright //------------------------------------------------------------------------------ 502bd882c8aSJames Wright // 3D derivatives at quadrature points 503bd882c8aSJames Wright //------------------------------------------------------------------------------ 504bd882c8aSJames Wright inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 505bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 506bd882c8aSJames Wright local CeedScalar *restrict scratch) { 507bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 508bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 509bd882c8aSJames Wright 510bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 511bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 512bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 513bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 514bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 515bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 516bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 517bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 518bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 519bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 520bd882c8aSJames Wright } 521bd882c8aSJames Wright } 522bd882c8aSJames Wright 523bd882c8aSJames Wright //------------------------------------------------------------------------------ 524bd882c8aSJames Wright // 3D derivatives transpose 525bd882c8aSJames Wright //------------------------------------------------------------------------------ 526bd882c8aSJames Wright inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 527bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 528bd882c8aSJames Wright local CeedScalar *restrict scratch) { 529bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 530bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 531bd882c8aSJames Wright 532bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 533bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 534bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 535bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 536bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 537bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 538bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 539bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 540bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 541bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 542bd882c8aSJames Wright } 543bd882c8aSJames Wright } 544bd882c8aSJames Wright 545bd882c8aSJames Wright //------------------------------------------------------------------------------ 546bd882c8aSJames Wright // 3D derivatives at quadrature points 547bd882c8aSJames Wright //------------------------------------------------------------------------------ 548bd882c8aSJames Wright inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 549bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 550bd882c8aSJames Wright local CeedScalar *restrict scratch) { 551bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 552bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 553bd882c8aSJames Wright 554bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 555bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 556bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 557bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 558bd882c8aSJames Wright ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 559bd882c8aSJames Wright ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 560bd882c8aSJames Wright ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 561bd882c8aSJames Wright } 562bd882c8aSJames Wright } 563bd882c8aSJames Wright 564bd882c8aSJames Wright //------------------------------------------------------------------------------ 565bd882c8aSJames Wright // 3D derivatives transpose 566bd882c8aSJames Wright //------------------------------------------------------------------------------ 567bd882c8aSJames Wright inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 568bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 569bd882c8aSJames Wright private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 570bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 571bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 572bd882c8aSJames Wright 573bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 574bd882c8aSJames Wright ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 575bd882c8aSJames Wright ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 576bd882c8aSJames Wright ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 577bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 578bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 579bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 580bd882c8aSJames Wright } 581bd882c8aSJames Wright } 582bd882c8aSJames Wright 583bd882c8aSJames Wright //------------------------------------------------------------------------------ 584bd882c8aSJames Wright // 3D quadrature weights 585bd882c8aSJames Wright //------------------------------------------------------------------------------ 586bd882c8aSJames Wright // template <int Q_1D> 587bd882c8aSJames Wright inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 588bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 589bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 590bd882c8aSJames Wright 591bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 592bd882c8aSJames Wright const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 593bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 594bd882c8aSJames Wright } else { 595bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 596bd882c8aSJames Wright } 597bd882c8aSJames Wright } 598