1*9d15e85bSSebastian Grimberg // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9d15e85bSSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9d15e85bSSebastian Grimberg // 4*9d15e85bSSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause 5*9d15e85bSSebastian Grimberg // 6*9d15e85bSSebastian Grimberg // This file is part of CEED: http://github.com/ceed 7*9d15e85bSSebastian Grimberg 8*9d15e85bSSebastian Grimberg /// @file 9*9d15e85bSSebastian Grimberg /// Internal header for MAGMA non-tensor basis interpolation 10*9d15e85bSSebastian Grimberg #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 11*9d15e85bSSebastian Grimberg #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 12*9d15e85bSSebastian Grimberg 13*9d15e85bSSebastian Grimberg #include "magma-common-nontensor.h" 14*9d15e85bSSebastian Grimberg 15*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 16*9d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB> 17*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 18*9d15e85bSSebastian Grimberg CeedScalar *shared_data) { 19*9d15e85bSSebastian Grimberg const int tx = threadIdx.x; 20*9d15e85bSSebastian Grimberg const int ty = threadIdx.y; 21*9d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 22*9d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 23*9d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 24*9d15e85bSSebastian Grimberg 25*9d15e85bSSebastian Grimberg dB += id * P * NB; 26*9d15e85bSSebastian Grimberg dC += id * Q * NB; 27*9d15e85bSSebastian Grimberg 28*9d15e85bSSebastian Grimberg // A is P x Q 29*9d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * P * NB; 30*9d15e85bSSebastian Grimberg CeedScalar *sA = shared_data + blockDim.y * P * NB; 31*9d15e85bSSebastian Grimberg 32*9d15e85bSSebastian Grimberg // read B once for all C's 33*9d15e85bSSebastian Grimberg if (id < nblocks) { 34*9d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB); 35*9d15e85bSSebastian Grimberg } 36*9d15e85bSSebastian Grimberg 37*9d15e85bSSebastian Grimberg // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) 38*9d15e85bSSebastian Grimberg for (int d = 0; d < Q_COMP; d++) { 39*9d15e85bSSebastian Grimberg // init rA, rC 40*9d15e85bSSebastian Grimberg CeedScalar rA[P], rC[NB]; 41*9d15e85bSSebastian Grimberg 42*9d15e85bSSebastian Grimberg // read A using all threads 43*9d15e85bSSebastian Grimberg read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA); 44*9d15e85bSSebastian Grimberg 45*9d15e85bSSebastian Grimberg if (id < nblocks) { 46*9d15e85bSSebastian Grimberg mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC); 47*9d15e85bSSebastian Grimberg 48*9d15e85bSSebastian Grimberg // write C 49*9d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC); 50*9d15e85bSSebastian Grimberg } 51*9d15e85bSSebastian Grimberg 52*9d15e85bSSebastian Grimberg dA += Q * P; 53*9d15e85bSSebastian Grimberg dC += Q * n; 54*9d15e85bSSebastian Grimberg 55*9d15e85bSSebastian Grimberg __syncthreads(); 56*9d15e85bSSebastian Grimberg } 57*9d15e85bSSebastian Grimberg } 58*9d15e85bSSebastian Grimberg 59*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 60*9d15e85bSSebastian Grimberg template <typename T, int P, int Q, int NB> 61*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 62*9d15e85bSSebastian Grimberg CeedScalar *shared_data) { 63*9d15e85bSSebastian Grimberg const int tx = threadIdx.x; 64*9d15e85bSSebastian Grimberg const int ty = threadIdx.y; 65*9d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 66*9d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 67*9d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 68*9d15e85bSSebastian Grimberg 69*9d15e85bSSebastian Grimberg dB += id * P * NB; 70*9d15e85bSSebastian Grimberg dC += id * Q * NB; 71*9d15e85bSSebastian Grimberg 72*9d15e85bSSebastian Grimberg // A is P x Q 73*9d15e85bSSebastian Grimberg CeedScalar *sA = shared_data; 74*9d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * P * NB; 75*9d15e85bSSebastian Grimberg 76*9d15e85bSSebastian Grimberg // init rA, rC 77*9d15e85bSSebastian Grimberg CeedScalar rA[P], rC[NB]; 78*9d15e85bSSebastian Grimberg 79*9d15e85bSSebastian Grimberg // read A using all threads 80*9d15e85bSSebastian Grimberg read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA); 81*9d15e85bSSebastian Grimberg __syncthreads(); 82*9d15e85bSSebastian Grimberg 83*9d15e85bSSebastian Grimberg // terminate threads with no work 84*9d15e85bSSebastian Grimberg if (id >= nblocks) return; 85*9d15e85bSSebastian Grimberg 86*9d15e85bSSebastian Grimberg // read B once for all C's 87*9d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB); 88*9d15e85bSSebastian Grimberg __syncthreads(); 89*9d15e85bSSebastian Grimberg 90*9d15e85bSSebastian Grimberg mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC); 91*9d15e85bSSebastian Grimberg 92*9d15e85bSSebastian Grimberg // write C 93*9d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC); 94*9d15e85bSSebastian Grimberg } 95*9d15e85bSSebastian Grimberg 96*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 97*9d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB> 98*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 99*9d15e85bSSebastian Grimberg CeedScalar *shared_data) { 100*9d15e85bSSebastian Grimberg const int tx = threadIdx.x; 101*9d15e85bSSebastian Grimberg const int ty = threadIdx.y; 102*9d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 103*9d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 104*9d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 105*9d15e85bSSebastian Grimberg 106*9d15e85bSSebastian Grimberg // terminate threads with no work 107*9d15e85bSSebastian Grimberg if (id >= nblocks) return; 108*9d15e85bSSebastian Grimberg 109*9d15e85bSSebastian Grimberg dB += id * Q * NB; 110*9d15e85bSSebastian Grimberg dC += id * P * NB; 111*9d15e85bSSebastian Grimberg 112*9d15e85bSSebastian Grimberg // A is P x Q 113*9d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * Q * NB; 114*9d15e85bSSebastian Grimberg 115*9d15e85bSSebastian Grimberg // init rC 116*9d15e85bSSebastian Grimberg CeedScalar rC[NB] = {0.0}; 117*9d15e85bSSebastian Grimberg 118*9d15e85bSSebastian Grimberg // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) 119*9d15e85bSSebastian Grimberg for (int d = 0; d < Q_COMP; d++) { 120*9d15e85bSSebastian Grimberg // init rA 121*9d15e85bSSebastian Grimberg CeedScalar rA[Q]; 122*9d15e85bSSebastian Grimberg 123*9d15e85bSSebastian Grimberg // read A 124*9d15e85bSSebastian Grimberg read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA); 125*9d15e85bSSebastian Grimberg 126*9d15e85bSSebastian Grimberg // read B 127*9d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB); 128*9d15e85bSSebastian Grimberg __syncthreads(); 129*9d15e85bSSebastian Grimberg 130*9d15e85bSSebastian Grimberg addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC); 131*9d15e85bSSebastian Grimberg 132*9d15e85bSSebastian Grimberg dA += P * Q; 133*9d15e85bSSebastian Grimberg dB += Q * n; 134*9d15e85bSSebastian Grimberg 135*9d15e85bSSebastian Grimberg __syncthreads(); 136*9d15e85bSSebastian Grimberg } 137*9d15e85bSSebastian Grimberg 138*9d15e85bSSebastian Grimberg // write C 139*9d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC); 140*9d15e85bSSebastian Grimberg } 141*9d15e85bSSebastian Grimberg 142*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 143*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ 144*9d15e85bSSebastian Grimberg void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 145*9d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 146*9d15e85bSSebastian Grimberg 147*9d15e85bSSebastian Grimberg if (BASIS_Q_COMP_INTERP == 1) { 148*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 149*9d15e85bSSebastian Grimberg } else { 150*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 151*9d15e85bSSebastian Grimberg } 152*9d15e85bSSebastian Grimberg } 153*9d15e85bSSebastian Grimberg 154*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 155*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ 156*9d15e85bSSebastian Grimberg void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 157*9d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 158*9d15e85bSSebastian Grimberg 159*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data); 160*9d15e85bSSebastian Grimberg } 161*9d15e85bSSebastian Grimberg 162*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 163*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ 164*9d15e85bSSebastian Grimberg void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 165*9d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 166*9d15e85bSSebastian Grimberg 167*9d15e85bSSebastian Grimberg if (BASIS_Q_COMP_DERIV == 1) { 168*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 169*9d15e85bSSebastian Grimberg } else { 170*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 171*9d15e85bSSebastian Grimberg } 172*9d15e85bSSebastian Grimberg } 173*9d15e85bSSebastian Grimberg 174*9d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 175*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ 176*9d15e85bSSebastian Grimberg void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 177*9d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 178*9d15e85bSSebastian Grimberg 179*9d15e85bSSebastian Grimberg magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data); 180*9d15e85bSSebastian Grimberg } 181*9d15e85bSSebastian Grimberg 182*9d15e85bSSebastian Grimberg #endif // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 183