// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

// macros to abstract access of shared memory and reg. file
#define sT(i, j) sT[(j)*P_ + (i)]

//////////////////////////////////////////////////////////////////////////////////////////
// interp basis action (1D)
template <typename T, int DIM_, int NCOMP_, int P_, int Q_>
static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NCOMP_], T *sV[NCOMP_], const int tx) {
  // Assumptions
  // 1. 1D threads of size max(P_,Q_)
  // 2. sU[i] is 1xP_: in shared memory
  // 3. sV[i] is 1xQ_: in shared memory
  // 4. Product per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_)
  // 5. Each thread computes one entry in sV[i]
  // 6. Must sync before and after call
  // 7. Note that the layout for U and V is different from the 2D/3D problems

  T rv;
  if (tx < Q_) {
    for (int icomp = 0; icomp < NCOMP_; icomp++) {
      rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0;
      for (int i = 0; i < P_; i++) {
        rv += sU[icomp][i] * sT(i, tx);  // sT[tx * P_ + i];
      }
      sV[icomp][tx] = rv;
    }
  }
}
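// Worked instance of the product above (illustrative sizes, not tied to this
// file): with P_ = 2 and Q_ = 3, threads tx = 0, 1, 2 each compute
//   sV[icomp][tx] = sU[icomp][0] * sT(0, tx) + sU[icomp][1] * sT(1, tx)
// i.e., the 1x2 row sU[icomp] times the 2x3 matrix T yields one entry of the
// 1x3 result row. When transT == MagmaTrans, rv starts from the existing
// sV[icomp][tx], so the routine accumulates into V instead of overwriting it.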
//////////////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__
void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                             const int cstrdV, const int nelem) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)

  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int elem_id = (blockIdx.x * blockDim.y) + ty;
  magma_trans_t transT = MagmaNoTrans;

  if (elem_id >= nelem) return;

  CeedScalar *sU[NCOMP];
  CeedScalar *sV[NCOMP];

  // shift global memory pointers by elem stride
  dU += elem_id * estrdU;
  dV += elem_id * estrdV;

  // assign shared memory pointers
  CeedScalar *sT = (CeedScalar *)(shared_data);
  CeedScalar *sW = sT + P * Q;
  sU[0] = sW + ty * NCOMP * (P + Q);
  sV[0] = sU[0] + (NCOMP * 1 * P);
  for (int icomp = 1; icomp < NCOMP; icomp++) {
    sU[icomp] = sU[icomp - 1] + (1 * P);
    sV[icomp] = sV[icomp - 1] + (1 * Q);
  }

  // read T
  if (ty == 0) {
    dread_T_gm2sm<P, Q>(tx, transT, dT, sT);
  }

  // read U
  read_1d<CeedScalar, P, NCOMP>(dU, cstrdU, sU, tx);

  __syncthreads();
  magma_interp_1d_device<CeedScalar, DIM, NCOMP, P, Q>(sT, transT, sU, sV, tx);
  __syncthreads();

  // write V
  write_1d<CeedScalar, Q, NCOMP>(sV, dV, cstrdV, tx);
}
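// The transpose kernel below applies T^T (mapping Q values back to P), so the
// roles of P and Q are swapped throughout: the shared-memory strides, the
// read/write widths, and the template arguments passed to
// magma_interp_1d_device. It also preloads dV into sV because, with
// transT == MagmaTrans, the device routine accumulates into V rather than
// overwriting it.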
//////////////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__
void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                             const int cstrdV, const int nelem) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)

  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int elem_id = (blockIdx.x * blockDim.y) + ty;
  magma_trans_t transT = MagmaTrans;

  if (elem_id >= nelem) return;

  CeedScalar *sU[NCOMP];
  CeedScalar *sV[NCOMP];

  // shift global memory pointers by elem stride
  dU += elem_id * estrdU;
  dV += elem_id * estrdV;

  // assign shared memory pointers
  CeedScalar *sT = (CeedScalar *)(shared_data);
  CeedScalar *sW = sT + Q * P;
  sU[0] = sW + ty * NCOMP * (Q + P);
  sV[0] = sU[0] + (NCOMP * 1 * Q);
  for (int icomp = 1; icomp < NCOMP; icomp++) {
    sU[icomp] = sU[icomp - 1] + (1 * Q);
    sV[icomp] = sV[icomp - 1] + (1 * P);
  }

  // read T
  if (ty == 0) {
    dread_T_gm2sm<Q, P>(tx, transT, dT, sT);
  }

  // read U
  read_1d<CeedScalar, Q, NCOMP>(dU, cstrdU, sU, tx);

  // read V
  read_1d<CeedScalar, P, NCOMP>(dV, cstrdV, sV, tx);

  __syncthreads();
  magma_interp_1d_device<CeedScalar, DIM, NCOMP, Q, P>(sT, transT, sU, sV, tx);
  __syncthreads();

  // write V
  write_1d<CeedScalar, P, NCOMP>(sV, dV, cstrdV, tx);
}
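//////////////////////////////////////////////////////////////////////////////////////////
// Minimal host-side launch sketch for the kernels above (comments only; `ntcol`
// and `stream` are illustrative names and are assumptions, not part of this
// file). Per assumption 1 of magma_interp_1d_device, blockDim.x = max(P, Q) =
// MAXPQ; each blockDim.y column handles one element, with ntcol chosen so that
// MAXPQ * ntcol stays within MAGMA_MAXTHREADS_1D. Shared memory must hold T
// plus ntcol copies of the per-element U and V work space:
//
//   dim3 threads(MAXPQ, ntcol, 1);
//   dim3 grid((nelem + ntcol - 1) / ntcol, 1, 1);
//   size_t shmem = sizeof(CeedScalar) * (P * Q + ntcol * NCOMP * (P + Q));
//   magma_interpn_1d_kernel<<<grid, threads, shmem, stream>>>(dT, dU, estrdU, cstrdU, dV, estrdV, cstrdV, nelem);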