19d15e85bSSebastian Grimberg // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 29d15e85bSSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39d15e85bSSebastian Grimberg // 49d15e85bSSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause 59d15e85bSSebastian Grimberg // 69d15e85bSSebastian Grimberg // This file is part of CEED: http://github.com/ceed 79d15e85bSSebastian Grimberg 89d15e85bSSebastian Grimberg /// @file 99d15e85bSSebastian Grimberg /// Internal header for MAGMA non-tensor basis interpolation 109d15e85bSSebastian Grimberg #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 119d15e85bSSebastian Grimberg #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 129d15e85bSSebastian Grimberg 139d15e85bSSebastian Grimberg #include "magma-common-nontensor.h" 149d15e85bSSebastian Grimberg 159d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 169d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB> 179d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 189d15e85bSSebastian Grimberg CeedScalar *shared_data) { 199d15e85bSSebastian Grimberg const int tx = threadIdx.x; 209d15e85bSSebastian Grimberg const int ty = threadIdx.y; 219d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 229d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 239d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 249d15e85bSSebastian Grimberg 259d15e85bSSebastian Grimberg dB += id * P * NB; 269d15e85bSSebastian Grimberg dC += id * Q * NB; 279d15e85bSSebastian Grimberg 289d15e85bSSebastian Grimberg // A is P x Q 299d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * P * NB; 309d15e85bSSebastian Grimberg CeedScalar *sA = shared_data + blockDim.y * P * NB; 319d15e85bSSebastian Grimberg 329d15e85bSSebastian Grimberg // read B once for all C's 339d15e85bSSebastian Grimberg if (id < nblocks) { 349d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB); 359d15e85bSSebastian Grimberg } 369d15e85bSSebastian Grimberg 379d15e85bSSebastian Grimberg // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) 389d15e85bSSebastian Grimberg for (int d = 0; d < Q_COMP; d++) { 399d15e85bSSebastian Grimberg // init rA, rC 409d15e85bSSebastian Grimberg CeedScalar rA[P], rC[NB]; 419d15e85bSSebastian Grimberg 429d15e85bSSebastian Grimberg // read A using all threads 439d15e85bSSebastian Grimberg read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA); 449d15e85bSSebastian Grimberg 459d15e85bSSebastian Grimberg mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC); 469d15e85bSSebastian Grimberg 479d15e85bSSebastian Grimberg // write C 48*833aa127SSebastian Grimberg if (id < nblocks) { 499d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC); 509d15e85bSSebastian Grimberg } 519d15e85bSSebastian Grimberg 529d15e85bSSebastian Grimberg dA += Q * P; 539d15e85bSSebastian Grimberg dC += Q * n; 549d15e85bSSebastian Grimberg 559d15e85bSSebastian Grimberg __syncthreads(); 569d15e85bSSebastian Grimberg } 579d15e85bSSebastian Grimberg } 589d15e85bSSebastian Grimberg 599d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 609d15e85bSSebastian Grimberg template <typename T, int P, int Q, int NB> 619d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 629d15e85bSSebastian Grimberg CeedScalar *shared_data) { 639d15e85bSSebastian Grimberg const int tx = threadIdx.x; 649d15e85bSSebastian Grimberg const int ty = threadIdx.y; 659d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 669d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 679d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 689d15e85bSSebastian Grimberg 699d15e85bSSebastian Grimberg dB += id * P * NB; 709d15e85bSSebastian Grimberg dC += id * Q * NB; 719d15e85bSSebastian Grimberg 729d15e85bSSebastian Grimberg // A is P x Q 739d15e85bSSebastian Grimberg CeedScalar *sA = shared_data; 749d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * P * NB; 759d15e85bSSebastian Grimberg 769d15e85bSSebastian Grimberg // init rA, rC 779d15e85bSSebastian Grimberg CeedScalar rA[P], rC[NB]; 789d15e85bSSebastian Grimberg 799d15e85bSSebastian Grimberg // read A using all threads 809d15e85bSSebastian Grimberg read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA); 819d15e85bSSebastian Grimberg __syncthreads(); 829d15e85bSSebastian Grimberg 839d15e85bSSebastian Grimberg // terminate threads with no work 849d15e85bSSebastian Grimberg if (id >= nblocks) return; 859d15e85bSSebastian Grimberg 86*833aa127SSebastian Grimberg // read B 879d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB); 889d15e85bSSebastian Grimberg __syncthreads(); 899d15e85bSSebastian Grimberg 909d15e85bSSebastian Grimberg mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC); 919d15e85bSSebastian Grimberg 929d15e85bSSebastian Grimberg // write C 939d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC); 949d15e85bSSebastian Grimberg } 959d15e85bSSebastian Grimberg 969d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 979d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB> 989d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, 999d15e85bSSebastian Grimberg CeedScalar *shared_data) { 1009d15e85bSSebastian Grimberg const int tx = threadIdx.x; 1019d15e85bSSebastian Grimberg const int ty = threadIdx.y; 1029d15e85bSSebastian Grimberg const int id = blockIdx.x * blockDim.y + ty; 1039d15e85bSSebastian Grimberg const int nblocks = (n + NB - 1) / NB; 1049d15e85bSSebastian Grimberg const int myn = min(NB, n - id * NB); 1059d15e85bSSebastian Grimberg 1069d15e85bSSebastian Grimberg dB += id * Q * NB; 1079d15e85bSSebastian Grimberg dC += id * P * NB; 1089d15e85bSSebastian Grimberg 1099d15e85bSSebastian Grimberg // A is P x Q 110*833aa127SSebastian Grimberg CeedScalar *sA = shared_data; 1119d15e85bSSebastian Grimberg CeedScalar *sB = shared_data + ty * Q * NB; 1129d15e85bSSebastian Grimberg 1139d15e85bSSebastian Grimberg // init rC 1149d15e85bSSebastian Grimberg CeedScalar rC[NB] = {0.0}; 1159d15e85bSSebastian Grimberg 1169d15e85bSSebastian Grimberg // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) 1179d15e85bSSebastian Grimberg for (int d = 0; d < Q_COMP; d++) { 1189d15e85bSSebastian Grimberg // init rA 1199d15e85bSSebastian Grimberg CeedScalar rA[Q]; 1209d15e85bSSebastian Grimberg 121*833aa127SSebastian Grimberg // read A using all threads 122*833aa127SSebastian Grimberg read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA); 123*833aa127SSebastian Grimberg __syncthreads(); 1249d15e85bSSebastian Grimberg 1259d15e85bSSebastian Grimberg // read B 126*833aa127SSebastian Grimberg if (id < nblocks) { 1279d15e85bSSebastian Grimberg read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB); 128*833aa127SSebastian Grimberg } 1299d15e85bSSebastian Grimberg __syncthreads(); 1309d15e85bSSebastian Grimberg 1319d15e85bSSebastian Grimberg addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC); 1329d15e85bSSebastian Grimberg 1339d15e85bSSebastian Grimberg dA += P * Q; 1349d15e85bSSebastian Grimberg dB += Q * n; 1359d15e85bSSebastian Grimberg 1369d15e85bSSebastian Grimberg __syncthreads(); 1379d15e85bSSebastian Grimberg } 1389d15e85bSSebastian Grimberg 1399d15e85bSSebastian Grimberg // write C 140*833aa127SSebastian Grimberg if (id < nblocks) { 1419d15e85bSSebastian Grimberg write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC); 1429d15e85bSSebastian Grimberg } 143*833aa127SSebastian Grimberg } 1449d15e85bSSebastian Grimberg 1459d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 1469d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ 1479d15e85bSSebastian Grimberg void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 1489d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 1499d15e85bSSebastian Grimberg 1509d15e85bSSebastian Grimberg if (BASIS_Q_COMP_INTERP == 1) { 1519d15e85bSSebastian Grimberg magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 1529d15e85bSSebastian Grimberg } else { 1539d15e85bSSebastian Grimberg magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 1549d15e85bSSebastian Grimberg } 1559d15e85bSSebastian Grimberg } 1569d15e85bSSebastian Grimberg 1579d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 1589d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ 1599d15e85bSSebastian Grimberg void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 1609d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 1619d15e85bSSebastian Grimberg 1629d15e85bSSebastian Grimberg magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data); 1639d15e85bSSebastian Grimberg } 1649d15e85bSSebastian Grimberg 1659d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 1669d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ 1679d15e85bSSebastian Grimberg void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 1689d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 1699d15e85bSSebastian Grimberg 1709d15e85bSSebastian Grimberg if (BASIS_Q_COMP_DERIV == 1) { 1719d15e85bSSebastian Grimberg magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 1729d15e85bSSebastian Grimberg } else { 1739d15e85bSSebastian Grimberg magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data); 1749d15e85bSSebastian Grimberg } 1759d15e85bSSebastian Grimberg } 1769d15e85bSSebastian Grimberg 1779d15e85bSSebastian Grimberg //////////////////////////////////////////////////////////////////////////////// 1789d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ 1799d15e85bSSebastian Grimberg void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { 1809d15e85bSSebastian Grimberg MAGMA_DEVICE_SHARED(CeedScalar, shared_data); 1819d15e85bSSebastian Grimberg 1829d15e85bSSebastian Grimberg magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data); 1839d15e85bSSebastian Grimberg } 1849d15e85bSSebastian Grimberg 1859d15e85bSSebastian Grimberg #endif // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H 186