xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 9d15e85b4f78ffb2d2860753c87a3b1789cc3bb6)
1*9d15e85bSSebastian Grimberg // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9d15e85bSSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9d15e85bSSebastian Grimberg //
4*9d15e85bSSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause
5*9d15e85bSSebastian Grimberg //
6*9d15e85bSSebastian Grimberg // This file is part of CEED:  http://github.com/ceed
7*9d15e85bSSebastian Grimberg 
8*9d15e85bSSebastian Grimberg /// @file
9*9d15e85bSSebastian Grimberg /// Internal header for MAGMA non-tensor basis interpolation
10*9d15e85bSSebastian Grimberg #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
11*9d15e85bSSebastian Grimberg #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
12*9d15e85bSSebastian Grimberg 
13*9d15e85bSSebastian Grimberg #include "magma-common-nontensor.h"
14*9d15e85bSSebastian Grimberg 
15*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
16*9d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB>
17*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
18*9d15e85bSSebastian Grimberg                                                                  CeedScalar *shared_data) {
19*9d15e85bSSebastian Grimberg   const int tx      = threadIdx.x;
20*9d15e85bSSebastian Grimberg   const int ty      = threadIdx.y;
21*9d15e85bSSebastian Grimberg   const int id      = blockIdx.x * blockDim.y + ty;
22*9d15e85bSSebastian Grimberg   const int nblocks = (n + NB - 1) / NB;
23*9d15e85bSSebastian Grimberg   const int myn     = min(NB, n - id * NB);
24*9d15e85bSSebastian Grimberg 
25*9d15e85bSSebastian Grimberg   dB += id * P * NB;
26*9d15e85bSSebastian Grimberg   dC += id * Q * NB;
27*9d15e85bSSebastian Grimberg 
28*9d15e85bSSebastian Grimberg   // A is P x Q
29*9d15e85bSSebastian Grimberg   CeedScalar *sB = shared_data + ty * P * NB;
30*9d15e85bSSebastian Grimberg   CeedScalar *sA = shared_data + blockDim.y * P * NB;
31*9d15e85bSSebastian Grimberg 
32*9d15e85bSSebastian Grimberg   // read B once for all C's
33*9d15e85bSSebastian Grimberg   if (id < nblocks) {
34*9d15e85bSSebastian Grimberg     read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
35*9d15e85bSSebastian Grimberg   }
36*9d15e85bSSebastian Grimberg 
37*9d15e85bSSebastian Grimberg   // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
38*9d15e85bSSebastian Grimberg   for (int d = 0; d < Q_COMP; d++) {
39*9d15e85bSSebastian Grimberg     // init rA, rC
40*9d15e85bSSebastian Grimberg     CeedScalar rA[P], rC[NB];
41*9d15e85bSSebastian Grimberg 
42*9d15e85bSSebastian Grimberg     // read A using all threads
43*9d15e85bSSebastian Grimberg     read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
44*9d15e85bSSebastian Grimberg 
45*9d15e85bSSebastian Grimberg     if (id < nblocks) {
46*9d15e85bSSebastian Grimberg       mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
47*9d15e85bSSebastian Grimberg 
48*9d15e85bSSebastian Grimberg       // write C
49*9d15e85bSSebastian Grimberg       write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
50*9d15e85bSSebastian Grimberg     }
51*9d15e85bSSebastian Grimberg 
52*9d15e85bSSebastian Grimberg     dA += Q * P;
53*9d15e85bSSebastian Grimberg     dC += Q * n;
54*9d15e85bSSebastian Grimberg 
55*9d15e85bSSebastian Grimberg     __syncthreads();
56*9d15e85bSSebastian Grimberg   }
57*9d15e85bSSebastian Grimberg }
58*9d15e85bSSebastian Grimberg 
59*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
60*9d15e85bSSebastian Grimberg template <typename T, int P, int Q, int NB>
61*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
62*9d15e85bSSebastian Grimberg                                                                   CeedScalar *shared_data) {
63*9d15e85bSSebastian Grimberg   const int tx      = threadIdx.x;
64*9d15e85bSSebastian Grimberg   const int ty      = threadIdx.y;
65*9d15e85bSSebastian Grimberg   const int id      = blockIdx.x * blockDim.y + ty;
66*9d15e85bSSebastian Grimberg   const int nblocks = (n + NB - 1) / NB;
67*9d15e85bSSebastian Grimberg   const int myn     = min(NB, n - id * NB);
68*9d15e85bSSebastian Grimberg 
69*9d15e85bSSebastian Grimberg   dB += id * P * NB;
70*9d15e85bSSebastian Grimberg   dC += id * Q * NB;
71*9d15e85bSSebastian Grimberg 
72*9d15e85bSSebastian Grimberg   // A is P x Q
73*9d15e85bSSebastian Grimberg   CeedScalar *sA = shared_data;
74*9d15e85bSSebastian Grimberg   CeedScalar *sB = shared_data + ty * P * NB;
75*9d15e85bSSebastian Grimberg 
76*9d15e85bSSebastian Grimberg   // init rA, rC
77*9d15e85bSSebastian Grimberg   CeedScalar rA[P], rC[NB];
78*9d15e85bSSebastian Grimberg 
79*9d15e85bSSebastian Grimberg   // read A using all threads
80*9d15e85bSSebastian Grimberg   read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
81*9d15e85bSSebastian Grimberg   __syncthreads();
82*9d15e85bSSebastian Grimberg 
83*9d15e85bSSebastian Grimberg   // terminate threads with no work
84*9d15e85bSSebastian Grimberg   if (id >= nblocks) return;
85*9d15e85bSSebastian Grimberg 
86*9d15e85bSSebastian Grimberg   // read B once for all C's
87*9d15e85bSSebastian Grimberg   read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
88*9d15e85bSSebastian Grimberg   __syncthreads();
89*9d15e85bSSebastian Grimberg 
90*9d15e85bSSebastian Grimberg   mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
91*9d15e85bSSebastian Grimberg 
92*9d15e85bSSebastian Grimberg   // write C
93*9d15e85bSSebastian Grimberg   write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
94*9d15e85bSSebastian Grimberg }
95*9d15e85bSSebastian Grimberg 
96*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
97*9d15e85bSSebastian Grimberg template <typename T, int Q_COMP, int P, int Q, int NB>
98*9d15e85bSSebastian Grimberg static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
99*9d15e85bSSebastian Grimberg                                                                  CeedScalar *shared_data) {
100*9d15e85bSSebastian Grimberg   const int tx      = threadIdx.x;
101*9d15e85bSSebastian Grimberg   const int ty      = threadIdx.y;
102*9d15e85bSSebastian Grimberg   const int id      = blockIdx.x * blockDim.y + ty;
103*9d15e85bSSebastian Grimberg   const int nblocks = (n + NB - 1) / NB;
104*9d15e85bSSebastian Grimberg   const int myn     = min(NB, n - id * NB);
105*9d15e85bSSebastian Grimberg 
106*9d15e85bSSebastian Grimberg   // terminate threads with no work
107*9d15e85bSSebastian Grimberg   if (id >= nblocks) return;
108*9d15e85bSSebastian Grimberg 
109*9d15e85bSSebastian Grimberg   dB += id * Q * NB;
110*9d15e85bSSebastian Grimberg   dC += id * P * NB;
111*9d15e85bSSebastian Grimberg 
112*9d15e85bSSebastian Grimberg   // A is P x Q
113*9d15e85bSSebastian Grimberg   CeedScalar *sB = shared_data + ty * Q * NB;
114*9d15e85bSSebastian Grimberg 
115*9d15e85bSSebastian Grimberg   // init rC
116*9d15e85bSSebastian Grimberg   CeedScalar rC[NB] = {0.0};
117*9d15e85bSSebastian Grimberg 
118*9d15e85bSSebastian Grimberg   // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
119*9d15e85bSSebastian Grimberg   for (int d = 0; d < Q_COMP; d++) {
120*9d15e85bSSebastian Grimberg     // init rA
121*9d15e85bSSebastian Grimberg     CeedScalar rA[Q];
122*9d15e85bSSebastian Grimberg 
123*9d15e85bSSebastian Grimberg     // read A
124*9d15e85bSSebastian Grimberg     read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA);
125*9d15e85bSSebastian Grimberg 
126*9d15e85bSSebastian Grimberg     // read B
127*9d15e85bSSebastian Grimberg     read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
128*9d15e85bSSebastian Grimberg     __syncthreads();
129*9d15e85bSSebastian Grimberg 
130*9d15e85bSSebastian Grimberg     addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
131*9d15e85bSSebastian Grimberg 
132*9d15e85bSSebastian Grimberg     dA += P * Q;
133*9d15e85bSSebastian Grimberg     dB += Q * n;
134*9d15e85bSSebastian Grimberg 
135*9d15e85bSSebastian Grimberg     __syncthreads();
136*9d15e85bSSebastian Grimberg   }
137*9d15e85bSSebastian Grimberg 
138*9d15e85bSSebastian Grimberg   // write C
139*9d15e85bSSebastian Grimberg   write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
140*9d15e85bSSebastian Grimberg }
141*9d15e85bSSebastian Grimberg 
142*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
143*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
144*9d15e85bSSebastian Grimberg     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
145*9d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
146*9d15e85bSSebastian Grimberg 
147*9d15e85bSSebastian Grimberg   if (BASIS_Q_COMP_INTERP == 1) {
148*9d15e85bSSebastian Grimberg     magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
149*9d15e85bSSebastian Grimberg   } else {
150*9d15e85bSSebastian Grimberg     magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
151*9d15e85bSSebastian Grimberg   }
152*9d15e85bSSebastian Grimberg }
153*9d15e85bSSebastian Grimberg 
154*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
155*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
156*9d15e85bSSebastian Grimberg     void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
157*9d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
158*9d15e85bSSebastian Grimberg 
159*9d15e85bSSebastian Grimberg   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
160*9d15e85bSSebastian Grimberg }
161*9d15e85bSSebastian Grimberg 
162*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
163*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
164*9d15e85bSSebastian Grimberg     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
165*9d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
166*9d15e85bSSebastian Grimberg 
167*9d15e85bSSebastian Grimberg   if (BASIS_Q_COMP_DERIV == 1) {
168*9d15e85bSSebastian Grimberg     magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
169*9d15e85bSSebastian Grimberg   } else {
170*9d15e85bSSebastian Grimberg     magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
171*9d15e85bSSebastian Grimberg   }
172*9d15e85bSSebastian Grimberg }
173*9d15e85bSSebastian Grimberg 
174*9d15e85bSSebastian Grimberg ////////////////////////////////////////////////////////////////////////////////
175*9d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
176*9d15e85bSSebastian Grimberg     void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
177*9d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
178*9d15e85bSSebastian Grimberg 
179*9d15e85bSSebastian Grimberg   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
180*9d15e85bSSebastian Grimberg }
181*9d15e85bSSebastian Grimberg 
182*9d15e85bSSebastian Grimberg #endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
183