// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "notrans" apply for Q_COMP > 1 components.
//
// Each (blockIdx.x, ty) pair owns one chunk of NB columns of B/C (chunk index
// `id`); `myn` is the chunk width, smaller than NB only for the tail chunk.
// Per component d: B (the same P x NB chunk for every d) is staged in shared
// memory once before the loop, A is read transposed into registers, and the
// Q x NB result chunk is written to the d-th Q x n slab of C (dC += Q * n).
//
// NOTE(review): the template parameter T is unused — the body hard-codes
// CeedScalar. Kept as-is since callers instantiate with <CeedScalar, ...>.
//
// NOTE(review): there is no explicit __syncthreads() between the sB fill
// (read_B_g2s_1D_nosync) and its first read (mul_rAsBrC_1D_nosync); this
// presumably relies on a barrier inside read_A_trans_g2r_1D_nosync between
// its shared-memory staging and its register reads — confirm against
// magma-common-nontensor.h. (Siblings below that overlap sA/sB sync explicitly.)
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;   // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;              // total number of chunks
  const int myn     = min(NB, n - id * NB);           // columns in this chunk (tail-safe)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // Shared layout: one P x NB B-slice per ty column, then a single A staging
  // area placed after all B slices (sA does not overlap any sB slice here).
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    // Tail chunks are NOT excluded here: every thread (even with id >= nblocks)
    // must participate in the cooperative A read and in the barriers below.
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // Advance to the next component: next Q x P slab of A, next Q x n slab of C.
    dA += Q * P;
    dC += Q * n;

    // Make the shared A staging area safe to overwrite in the next iteration.
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "trans" apply for Q_COMP > 1 components.
//
// Same chunking as device_n, with roles of P and Q swapped: B chunks are
// Q x NB, the single output chunk is P x NB. The result is ACCUMULATED over
// components (addmul into rC; dB advances by Q * n per component, dC does not
// move), and C is written once after the loop.
//
// sA (= shared_data) and the per-ty sB slices overlap, so explicit
// __syncthreads() calls separate the A read, the B staging, and the reuse of
// shared memory in the next iteration.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;   // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;              // total number of chunks
  const int myn     = min(NB, n - id * NB);           // columns in this chunk (tail-safe)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // Per-component contributions accumulate here; zero-initialized once.
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    // All threads must reach this barrier before sB (which aliases sA space)
    // is overwritten below.
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // Advance to the next component: next P x Q slab of A, next Q x n slab of B.
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of device_n: no component
// loop, so A is read once and idle tail threads may return early right after
// the last block-wide barrier.
//
// NOTE(review): template parameter T is unused here as well (CeedScalar is
// hard-coded); kept for call-site compatibility.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;   // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;              // total number of chunks
  const int myn     = min(NB, n - id * NB);           // columns in this chunk (tail-safe)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // Here sA starts at shared_data and the sB slices overlap it, so an explicit
  // barrier is required between the A read and the B staging.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // Safe only because no further block-wide __syncthreads() is required by the
  // remaining (per-ty-slice) operations below.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of device_t: no component
// loop and no accumulation needed (plain mul instead of addmul); otherwise
// mirrors device_n1 with P and Q swapped.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;   // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;              // total number of chunks
  const int myn     = min(NB, n - id * NB);           // columns in this chunk (tail-safe)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry points. Each dispatches at compile time on the number of
// components (BASIS_Q_COMP_INTERP / BASIS_Q_COMP_DERIV, JIT-defined macros):
// the *_1 device routine when there is exactly one component, the looping
// routine otherwise. Dynamic shared memory is obtained via
// MAGMA_DEVICE_SHARED; the launcher is expected to size it to match the
// sA/sB layouts above.

////////////////////////////////////////////////////////////////////////////////
// Interpolation, notrans: block width tied to BASIS_Q (output rows).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, trans: block width tied to BASIS_P (output rows).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Derivative, notrans: same dispatch pattern with the DERIV component count
// and blocking factor.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Derivative, trans: same dispatch pattern with the DERIV component count
// and blocking factor.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H