15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2bd882c8aSJames Wright // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3bd882c8aSJames Wright // 4bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause 5bd882c8aSJames Wright // 6bd882c8aSJames Wright // This file is part of CEED: http://github.com/ceed 7bd882c8aSJames Wright 8bd882c8aSJames Wright /// @file 9bd882c8aSJames Wright /// Internal header for SYCL shared memory tensor product basis templates 10bd882c8aSJames Wright 11*c0b5abf0SJeremy L Thompson #include <ceed/types.h> 12bd882c8aSJames Wright 13bd882c8aSJames Wright //------------------------------------------------------------------------------ 14bd882c8aSJames Wright // 1D 15bd882c8aSJames Wright //------------------------------------------------------------------------------ 16bd882c8aSJames Wright 17bd882c8aSJames Wright //------------------------------------------------------------------------------ 18bd882c8aSJames Wright // 1D tensor contraction x 19bd882c8aSJames Wright //------------------------------------------------------------------------------ 20bd882c8aSJames Wright inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 21bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 22bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 23bd882c8aSJames Wright 24bd882c8aSJames Wright scratch[item_id_x] = *U; 25bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 26bd882c8aSJames Wright 27bd882c8aSJames Wright *V = 0.0; 28bd882c8aSJames Wright if (item_id_x < Q_1D) { 29bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 30bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 31bd882c8aSJames Wright } 32bd882c8aSJames Wright } 33bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 34bd882c8aSJames Wright } 35bd882c8aSJames Wright 36bd882c8aSJames Wright //------------------------------------------------------------------------------ 37bd882c8aSJames Wright // 1D transpose tensor contraction x 38bd882c8aSJames Wright //------------------------------------------------------------------------------ 39bd882c8aSJames Wright inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 40bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 41bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 42bd882c8aSJames Wright 43bd882c8aSJames Wright scratch[item_id_x] = *U; 44bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 45bd882c8aSJames Wright 46bd882c8aSJames Wright *V = 0.0; 47bd882c8aSJames Wright if (item_id_x < P_1D) { 48bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 49bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 50bd882c8aSJames Wright } 51bd882c8aSJames Wright } 52bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 53bd882c8aSJames Wright } 54bd882c8aSJames Wright 55bd882c8aSJames Wright //------------------------------------------------------------------------------ 56bd882c8aSJames Wright // 1D interpolate to quadrature points 57bd882c8aSJames Wright //------------------------------------------------------------------------------ 58bd882c8aSJames Wright inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 59bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 60bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 61bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 62bd882c8aSJames Wright } 63bd882c8aSJames Wright } 64bd882c8aSJames Wright 65bd882c8aSJames Wright //------------------------------------------------------------------------------ 66bd882c8aSJames Wright // 1D interpolate transpose 67bd882c8aSJames Wright //------------------------------------------------------------------------------ 68bd882c8aSJames Wright inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 69bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 70bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 71bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 72bd882c8aSJames Wright } 73bd882c8aSJames Wright } 74bd882c8aSJames Wright 75bd882c8aSJames Wright //------------------------------------------------------------------------------ 76bd882c8aSJames Wright // 1D derivatives at quadrature points 77bd882c8aSJames Wright //------------------------------------------------------------------------------ 78bd882c8aSJames Wright inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 79bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 80bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 81bd882c8aSJames Wright ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 82bd882c8aSJames Wright } 83bd882c8aSJames Wright } 84bd882c8aSJames Wright 85bd882c8aSJames Wright //------------------------------------------------------------------------------ 86bd882c8aSJames Wright // 1D derivatives transpose 87bd882c8aSJames Wright //------------------------------------------------------------------------------ 88bd882c8aSJames Wright inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 89bd882c8aSJames Wright local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 90bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 91bd882c8aSJames Wright ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 92bd882c8aSJames Wright } 93bd882c8aSJames Wright } 94bd882c8aSJames Wright 95bd882c8aSJames Wright //------------------------------------------------------------------------------ 96bd882c8aSJames Wright // 1D quadrature weights 97bd882c8aSJames Wright //------------------------------------------------------------------------------ 98bd882c8aSJames Wright inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 99bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 100bd882c8aSJames Wright *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 101bd882c8aSJames Wright } 102bd882c8aSJames Wright 103bd882c8aSJames Wright //------------------------------------------------------------------------------ 104bd882c8aSJames Wright // 2D 105bd882c8aSJames Wright //------------------------------------------------------------------------------ 106bd882c8aSJames Wright 107bd882c8aSJames Wright //------------------------------------------------------------------------------ 108bd882c8aSJames Wright // 2D tensor contraction x 109bd882c8aSJames Wright //------------------------------------------------------------------------------ 110bd882c8aSJames Wright inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 111bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 112bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 113bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 114bd882c8aSJames Wright 115bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 116bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 117bd882c8aSJames Wright 118bd882c8aSJames Wright *V = 0.0; 119bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 120bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 121bd882c8aSJames Wright *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 122bd882c8aSJames Wright } 123bd882c8aSJames Wright } 124bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 125bd882c8aSJames Wright } 126bd882c8aSJames Wright 127bd882c8aSJames Wright //------------------------------------------------------------------------------ 128bd882c8aSJames Wright // 2D tensor contract y 129bd882c8aSJames Wright //------------------------------------------------------------------------------ 130bd882c8aSJames Wright inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 131bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 132bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 133bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 134bd882c8aSJames Wright 135bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 136bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 137bd882c8aSJames Wright 138bd882c8aSJames Wright *V = 0.0; 139bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 140bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 141bd882c8aSJames Wright *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 142bd882c8aSJames Wright } 143bd882c8aSJames Wright } 144bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 145bd882c8aSJames Wright } 146bd882c8aSJames Wright 147bd882c8aSJames Wright //------------------------------------------------------------------------------ 148bd882c8aSJames Wright // 2D transpose tensor contract y 149bd882c8aSJames Wright //------------------------------------------------------------------------------ 150bd882c8aSJames Wright inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 151bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 152bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 153bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 154bd882c8aSJames Wright 155bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 156bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 157bd882c8aSJames Wright 158bd882c8aSJames Wright *V = 0.0; 159bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 160bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 161bd882c8aSJames Wright *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 162bd882c8aSJames Wright } 163bd882c8aSJames Wright } 164bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 165bd882c8aSJames Wright } 166bd882c8aSJames Wright 167bd882c8aSJames Wright //------------------------------------------------------------------------------ 168bd882c8aSJames Wright // 2D transpose tensor contract x 169bd882c8aSJames Wright //------------------------------------------------------------------------------ 170bd882c8aSJames Wright inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 171bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 172bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 173bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 174bd882c8aSJames Wright 175bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 176bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 177bd882c8aSJames Wright 178bd882c8aSJames Wright *V = 0.0; 179bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 180bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 181bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 182bd882c8aSJames Wright } 183bd882c8aSJames Wright } 184bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 185bd882c8aSJames Wright } 186bd882c8aSJames Wright 187bd882c8aSJames Wright //------------------------------------------------------------------------------ 188bd882c8aSJames Wright // 2D transpose tensor contract and add x 189bd882c8aSJames Wright //------------------------------------------------------------------------------ 190bd882c8aSJames Wright inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 191bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 192bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 193bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 194bd882c8aSJames Wright 195bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = *U; 196bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 197bd882c8aSJames Wright 198bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 199bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 200bd882c8aSJames Wright *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 201bd882c8aSJames Wright } 202bd882c8aSJames Wright } 203bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 204bd882c8aSJames Wright } 205bd882c8aSJames Wright 206bd882c8aSJames Wright //------------------------------------------------------------------------------ 207bd882c8aSJames Wright // 2D interpolate to quadrature points 208bd882c8aSJames Wright //------------------------------------------------------------------------------ 209bd882c8aSJames Wright inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 210bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 211bd882c8aSJames Wright CeedScalar r_t[1]; 212bd882c8aSJames Wright 213bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 214bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 215bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 216bd882c8aSJames Wright } 217bd882c8aSJames Wright } 218bd882c8aSJames Wright 219bd882c8aSJames Wright //------------------------------------------------------------------------------ 220bd882c8aSJames Wright // 2D interpolate transpose 221bd882c8aSJames Wright //------------------------------------------------------------------------------ 222bd882c8aSJames Wright inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 223bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 224bd882c8aSJames Wright CeedScalar r_t[1]; 225bd882c8aSJames Wright 226bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 227bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 228bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 229bd882c8aSJames Wright } 230bd882c8aSJames Wright } 231bd882c8aSJames Wright 232bd882c8aSJames Wright //------------------------------------------------------------------------------ 233bd882c8aSJames Wright // 2D derivatives at quadrature points 234bd882c8aSJames Wright //------------------------------------------------------------------------------ 235bd882c8aSJames Wright inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 236bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 237bd882c8aSJames Wright local CeedScalar *restrict scratch) { 238bd882c8aSJames Wright CeedScalar r_t[1]; 239bd882c8aSJames Wright 240bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 241bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 242bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 243bd882c8aSJames Wright ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 244bd882c8aSJames Wright ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 245bd882c8aSJames Wright } 246bd882c8aSJames Wright } 247bd882c8aSJames Wright 248bd882c8aSJames Wright //------------------------------------------------------------------------------ 249bd882c8aSJames Wright // 2D derivatives transpose 250bd882c8aSJames Wright //------------------------------------------------------------------------------ 251bd882c8aSJames Wright inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 252bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 253bd882c8aSJames Wright local CeedScalar *restrict scratch) { 254bd882c8aSJames Wright CeedScalar r_t[1]; 255bd882c8aSJames Wright 256bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 257bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 258bd882c8aSJames Wright ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 259bd882c8aSJames Wright ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 260bd882c8aSJames Wright ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 261bd882c8aSJames Wright } 262bd882c8aSJames Wright } 263bd882c8aSJames Wright 264bd882c8aSJames Wright //------------------------------------------------------------------------------ 265bd882c8aSJames Wright // 2D quadrature weights 266bd882c8aSJames Wright //------------------------------------------------------------------------------ 267bd882c8aSJames Wright inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 268bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 269bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 270bd882c8aSJames Wright 271bd882c8aSJames Wright *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 272bd882c8aSJames Wright } 273bd882c8aSJames Wright 274bd882c8aSJames Wright //------------------------------------------------------------------------------ 275bd882c8aSJames Wright // 3D 276bd882c8aSJames Wright //------------------------------------------------------------------------------ 277bd882c8aSJames Wright 278bd882c8aSJames Wright //------------------------------------------------------------------------------ 279bd882c8aSJames Wright // 3D tensor contract x 280bd882c8aSJames Wright //------------------------------------------------------------------------------ 281bd882c8aSJames Wright inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 282bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 283bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 284bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 285bd882c8aSJames Wright 286bd882c8aSJames Wright CeedScalar r_B[T_1D]; 287bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 288bd882c8aSJames Wright r_B[i] = B[i + item_id_x * P_1D]; 289bd882c8aSJames Wright } 290bd882c8aSJames Wright 291bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 292bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 293bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 294bd882c8aSJames Wright 295bd882c8aSJames Wright V[k] = 0.0; 296bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 297bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 298bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 299bd882c8aSJames Wright } 300bd882c8aSJames Wright } 301bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 302bd882c8aSJames Wright } 303bd882c8aSJames Wright } 304bd882c8aSJames Wright 305bd882c8aSJames Wright //------------------------------------------------------------------------------ 306bd882c8aSJames Wright // 3D tensor contract y 307bd882c8aSJames Wright //------------------------------------------------------------------------------ 308bd882c8aSJames Wright inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 309bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 310bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 311bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 312bd882c8aSJames Wright 313bd882c8aSJames Wright CeedScalar r_B[T_1D]; 314bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 315bd882c8aSJames Wright r_B[i] = B[i + item_id_y * P_1D]; 316bd882c8aSJames Wright } 317bd882c8aSJames Wright 318bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 319bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 320bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 321bd882c8aSJames Wright 322bd882c8aSJames Wright V[k] = 0.0; 323bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 324bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 325bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 326bd882c8aSJames Wright } 327bd882c8aSJames Wright } 328bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 329bd882c8aSJames Wright } 330bd882c8aSJames Wright } 331bd882c8aSJames Wright 332bd882c8aSJames Wright //------------------------------------------------------------------------------ 333bd882c8aSJames Wright // 3D tensor contract z 334bd882c8aSJames Wright //------------------------------------------------------------------------------ 335bd882c8aSJames Wright inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 336bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 337bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 338bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 339bd882c8aSJames Wright 340bd882c8aSJames Wright for (CeedInt k = 0; k < Q_1D; k++) { 341bd882c8aSJames Wright V[k] = 0.0; 342bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 343bd882c8aSJames Wright for (CeedInt i = 0; i < P_1D; i++) { 344bd882c8aSJames Wright V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 345bd882c8aSJames Wright } 346bd882c8aSJames Wright } 347bd882c8aSJames Wright } 348bd882c8aSJames Wright } 349bd882c8aSJames Wright 350bd882c8aSJames Wright //------------------------------------------------------------------------------ 351bd882c8aSJames Wright // 3D transpose tensor contract z 352bd882c8aSJames Wright //------------------------------------------------------------------------------ 353bd882c8aSJames Wright inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 354bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 355bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 356bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 357bd882c8aSJames Wright 358bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 359bd882c8aSJames Wright V[k] = 0.0; 360bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 361bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 362bd882c8aSJames Wright V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 363bd882c8aSJames Wright } 364bd882c8aSJames Wright } 365bd882c8aSJames Wright } 366bd882c8aSJames Wright } 367bd882c8aSJames Wright 368bd882c8aSJames Wright //------------------------------------------------------------------------------ 369bd882c8aSJames Wright // 3D transpose tensor contract y 370bd882c8aSJames Wright //------------------------------------------------------------------------------ 371bd882c8aSJames Wright inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 372bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 373bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 374bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 375bd882c8aSJames Wright 376bd882c8aSJames Wright CeedScalar r_B[T_1D]; 377bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 378bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 379bd882c8aSJames Wright } 380bd882c8aSJames Wright 381bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 382bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 383bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 384bd882c8aSJames Wright 385bd882c8aSJames Wright V[k] = 0.0; 386bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 387bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 388bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 389bd882c8aSJames Wright } 390bd882c8aSJames Wright } 391bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 392bd882c8aSJames Wright } 393bd882c8aSJames Wright } 394bd882c8aSJames Wright 395bd882c8aSJames Wright //------------------------------------------------------------------------------ 396bd882c8aSJames Wright // 3D transpose tensor contract y 397bd882c8aSJames Wright //------------------------------------------------------------------------------ 398bd882c8aSJames Wright inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 399bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 400bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 401bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 402bd882c8aSJames Wright 403bd882c8aSJames Wright CeedScalar r_B[T_1D]; 404bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 405bd882c8aSJames Wright r_B[i] = B[item_id_y + i * P_1D]; 406bd882c8aSJames Wright } 407bd882c8aSJames Wright 408bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 409bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 410bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 411bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < P_1D) { 412bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 413bd882c8aSJames Wright V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 414bd882c8aSJames Wright } 415bd882c8aSJames Wright } 416bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 417bd882c8aSJames Wright } 418bd882c8aSJames Wright } 419bd882c8aSJames Wright 420bd882c8aSJames Wright //------------------------------------------------------------------------------ 421bd882c8aSJames Wright // 3D transpose tensor contract x 422bd882c8aSJames Wright //------------------------------------------------------------------------------ 423bd882c8aSJames Wright inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 424bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 425bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 426bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 427bd882c8aSJames Wright 428bd882c8aSJames Wright CeedScalar r_B[T_1D]; 429bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 430bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 431bd882c8aSJames Wright } 432bd882c8aSJames Wright 433bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 434bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 435bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 436bd882c8aSJames Wright V[k] = 0.0; 437bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 438bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 439bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 440bd882c8aSJames Wright } 441bd882c8aSJames Wright } 442bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 443bd882c8aSJames Wright } 444bd882c8aSJames Wright } 445bd882c8aSJames Wright 446bd882c8aSJames Wright //------------------------------------------------------------------------------ 447bd882c8aSJames Wright // 3D transpose tensor contract add x 448bd882c8aSJames Wright //------------------------------------------------------------------------------ 449bd882c8aSJames Wright inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 450bd882c8aSJames Wright private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 451bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 452bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 453bd882c8aSJames Wright 454bd882c8aSJames Wright CeedScalar r_B[T_1D]; 455bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 456bd882c8aSJames Wright r_B[i] = B[item_id_x + i * P_1D]; 457bd882c8aSJames Wright } 458bd882c8aSJames Wright 459bd882c8aSJames Wright for (CeedInt k = 0; k < P_1D; k++) { 460bd882c8aSJames Wright scratch[item_id_x + item_id_y * T_1D] = U[k]; 461bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 462bd882c8aSJames Wright 463bd882c8aSJames Wright if (item_id_x < P_1D && item_id_y < P_1D) { 464bd882c8aSJames Wright for (CeedInt i = 0; i < Q_1D; i++) { 465bd882c8aSJames Wright V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 466bd882c8aSJames Wright } 467bd882c8aSJames Wright } 468bd882c8aSJames Wright work_group_barrier(CLK_LOCAL_MEM_FENCE); 469bd882c8aSJames Wright } 470bd882c8aSJames Wright } 471bd882c8aSJames Wright 472bd882c8aSJames Wright //------------------------------------------------------------------------------ 473bd882c8aSJames Wright // 3D interpolate to quadrature points 474bd882c8aSJames Wright //------------------------------------------------------------------------------ 475bd882c8aSJames Wright inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 476bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 477bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 478bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 479bd882c8aSJames Wright 480bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 481bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 482bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 483bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 484bd882c8aSJames Wright } 485bd882c8aSJames Wright } 486bd882c8aSJames Wright 487bd882c8aSJames Wright //------------------------------------------------------------------------------ 488bd882c8aSJames Wright // 3D interpolate transpose 489bd882c8aSJames Wright //------------------------------------------------------------------------------ 490bd882c8aSJames Wright inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 491bd882c8aSJames Wright local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 492bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 493bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 494bd882c8aSJames Wright 495bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 496bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 497bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 498bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 499bd882c8aSJames Wright } 500bd882c8aSJames Wright } 501bd882c8aSJames Wright 502bd882c8aSJames Wright //------------------------------------------------------------------------------ 503bd882c8aSJames Wright // 3D derivatives at quadrature points 504bd882c8aSJames Wright //------------------------------------------------------------------------------ 505bd882c8aSJames Wright inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 506bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 507bd882c8aSJames Wright local CeedScalar *restrict scratch) { 508bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 509bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 510bd882c8aSJames Wright 511bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 512bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 513bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 514bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 515bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 516bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 517bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 518bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 519bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 520bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 521bd882c8aSJames Wright } 522bd882c8aSJames Wright } 523bd882c8aSJames Wright 524bd882c8aSJames Wright //------------------------------------------------------------------------------ 525bd882c8aSJames Wright // 3D derivatives transpose 526bd882c8aSJames Wright //------------------------------------------------------------------------------ 527bd882c8aSJames Wright inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 528bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 529bd882c8aSJames Wright local CeedScalar *restrict scratch) { 530bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 531bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 532bd882c8aSJames Wright 533bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 534bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 535bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 536bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 537bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 538bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 539bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 540bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 541bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 542bd882c8aSJames Wright ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 543bd882c8aSJames Wright } 544bd882c8aSJames Wright } 545bd882c8aSJames Wright 546bd882c8aSJames Wright //------------------------------------------------------------------------------ 547bd882c8aSJames Wright // 3D derivatives at quadrature points 548bd882c8aSJames Wright //------------------------------------------------------------------------------ 549bd882c8aSJames Wright inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 550bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 551bd882c8aSJames Wright local CeedScalar *restrict scratch) { 552bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 553bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 554bd882c8aSJames Wright 555bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 556bd882c8aSJames Wright ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 557bd882c8aSJames Wright ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 558bd882c8aSJames Wright ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 559bd882c8aSJames Wright ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 560bd882c8aSJames Wright ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 561bd882c8aSJames Wright ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 562bd882c8aSJames Wright } 563bd882c8aSJames Wright } 564bd882c8aSJames Wright 565bd882c8aSJames Wright //------------------------------------------------------------------------------ 566bd882c8aSJames Wright // 3D derivatives transpose 567bd882c8aSJames Wright //------------------------------------------------------------------------------ 568bd882c8aSJames Wright inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 569bd882c8aSJames Wright local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 570bd882c8aSJames Wright private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 571bd882c8aSJames Wright CeedScalar r_t1[T_1D]; 572bd882c8aSJames Wright CeedScalar r_t2[T_1D]; 573bd882c8aSJames Wright 574bd882c8aSJames Wright for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 575bd882c8aSJames Wright ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 576bd882c8aSJames Wright ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 577bd882c8aSJames Wright ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 578bd882c8aSJames Wright ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 579bd882c8aSJames Wright ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 580bd882c8aSJames Wright ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 581bd882c8aSJames Wright } 582bd882c8aSJames Wright } 583bd882c8aSJames Wright 584bd882c8aSJames Wright //------------------------------------------------------------------------------ 585bd882c8aSJames Wright // 3D quadrature weights 586bd882c8aSJames Wright //------------------------------------------------------------------------------ 587bd882c8aSJames Wright // template <int Q_1D> 588bd882c8aSJames Wright inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 589bd882c8aSJames Wright const CeedInt item_id_x = get_local_id(0); 590bd882c8aSJames Wright const CeedInt item_id_y = get_local_id(1); 591bd882c8aSJames Wright 592bd882c8aSJames Wright if (item_id_x < Q_1D && item_id_y < Q_1D) { 593bd882c8aSJames Wright const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 594bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 595bd882c8aSJames Wright } else { 596bd882c8aSJames Wright for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 597bd882c8aSJames Wright } 598bd882c8aSJames Wright } 599