xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/hip/hip-gen-templates.h (revision 9e201c85545dd39529c090846df629a32c15659b)
1*9e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e201c85SYohann //
4*9e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
5*9e201c85SYohann //
6*9e201c85SYohann // This file is part of CEED:  http://github.com/ceed
7*9e201c85SYohann 
8*9e201c85SYohann /// @file
9*9e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source
10*9e201c85SYohann #ifndef _ceed_hip_gen_templates_h
11*9e201c85SYohann #define _ceed_hip_gen_templates_h
12*9e201c85SYohann 
13*9e201c85SYohann #include <ceed/types.h>
14*9e201c85SYohann 
15*9e201c85SYohann 
16*9e201c85SYohann //------------------------------------------------------------------------------
17*9e201c85SYohann // Load matrices for basis actions
18*9e201c85SYohann //------------------------------------------------------------------------------
19*9e201c85SYohann template <int P, int Q>
20*9e201c85SYohann inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) {
21*9e201c85SYohann   for (CeedInt i = data.t_id; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
22*9e201c85SYohann     B[i] = d_B[i];
23*9e201c85SYohann }
24*9e201c85SYohann 
25*9e201c85SYohann //------------------------------------------------------------------------------
26*9e201c85SYohann // 1D
27*9e201c85SYohann //------------------------------------------------------------------------------
28*9e201c85SYohann 
29*9e201c85SYohann //------------------------------------------------------------------------------
30*9e201c85SYohann // L-vector -> E-vector, offsets provided
31*9e201c85SYohann //------------------------------------------------------------------------------
32*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
33*9e201c85SYohann inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
34*9e201c85SYohann   if (data.t_id_x < P1d) {
35*9e201c85SYohann     const CeedInt node = data.t_id_x;
36*9e201c85SYohann     const CeedInt ind = indices[node + elem * P1d];
37*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
38*9e201c85SYohann       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
39*9e201c85SYohann   }
40*9e201c85SYohann }
41*9e201c85SYohann 
42*9e201c85SYohann //------------------------------------------------------------------------------
43*9e201c85SYohann // L-vector -> E-vector, strided
44*9e201c85SYohann //------------------------------------------------------------------------------
45*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
46*9e201c85SYohann inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
47*9e201c85SYohann   if (data.t_id_x < P1d) {
48*9e201c85SYohann     const CeedInt node = data.t_id_x;
49*9e201c85SYohann     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
50*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
51*9e201c85SYohann       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
52*9e201c85SYohann   }
53*9e201c85SYohann }
54*9e201c85SYohann 
55*9e201c85SYohann //------------------------------------------------------------------------------
56*9e201c85SYohann // E-vector -> L-vector, offsets provided
57*9e201c85SYohann //------------------------------------------------------------------------------
58*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
59*9e201c85SYohann inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
60*9e201c85SYohann   if (data.t_id_x < P1d) {
61*9e201c85SYohann     const CeedInt node = data.t_id_x;
62*9e201c85SYohann     const CeedInt ind = indices[node + elem * P1d];
63*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
64*9e201c85SYohann       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
65*9e201c85SYohann   }
66*9e201c85SYohann }
67*9e201c85SYohann 
68*9e201c85SYohann //------------------------------------------------------------------------------
69*9e201c85SYohann // E-vector -> L-vector, strided
70*9e201c85SYohann //------------------------------------------------------------------------------
71*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
72*9e201c85SYohann inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
73*9e201c85SYohann   if (data.t_id_x < P1d) {
74*9e201c85SYohann     const CeedInt node = data.t_id_x;
75*9e201c85SYohann     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
76*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
77*9e201c85SYohann       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
78*9e201c85SYohann   }
79*9e201c85SYohann }
80*9e201c85SYohann 
81*9e201c85SYohann //------------------------------------------------------------------------------
82*9e201c85SYohann // 2D
83*9e201c85SYohann //------------------------------------------------------------------------------
84*9e201c85SYohann 
85*9e201c85SYohann //------------------------------------------------------------------------------
86*9e201c85SYohann // L-vector -> E-vector, offsets provided
87*9e201c85SYohann //------------------------------------------------------------------------------
88*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
89*9e201c85SYohann inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
90*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d) {
91*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
92*9e201c85SYohann     const CeedInt ind = indices[node + elem * P1d*P1d];
93*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
94*9e201c85SYohann       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
95*9e201c85SYohann   }
96*9e201c85SYohann }
97*9e201c85SYohann 
98*9e201c85SYohann //------------------------------------------------------------------------------
99*9e201c85SYohann // L-vector -> E-vector, strided
100*9e201c85SYohann //------------------------------------------------------------------------------
101*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
102*9e201c85SYohann inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
103*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d) {
104*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
105*9e201c85SYohann     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
106*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
107*9e201c85SYohann       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
108*9e201c85SYohann   }
109*9e201c85SYohann }
110*9e201c85SYohann 
111*9e201c85SYohann //------------------------------------------------------------------------------
112*9e201c85SYohann // E-vector -> L-vector, offsets provided
113*9e201c85SYohann //------------------------------------------------------------------------------
114*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
115*9e201c85SYohann inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
116*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d) {
117*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
118*9e201c85SYohann     const CeedInt ind = indices[node + elem * P1d*P1d];
119*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
120*9e201c85SYohann       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
121*9e201c85SYohann   }
122*9e201c85SYohann }
123*9e201c85SYohann 
124*9e201c85SYohann //------------------------------------------------------------------------------
125*9e201c85SYohann // E-vector -> L-vector, strided
126*9e201c85SYohann //------------------------------------------------------------------------------
127*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
128*9e201c85SYohann inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
129*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d) {
130*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
131*9e201c85SYohann     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
132*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
133*9e201c85SYohann       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
134*9e201c85SYohann   }
135*9e201c85SYohann }
136*9e201c85SYohann 
137*9e201c85SYohann //------------------------------------------------------------------------------
138*9e201c85SYohann // 3D
139*9e201c85SYohann //------------------------------------------------------------------------------
140*9e201c85SYohann 
141*9e201c85SYohann //------------------------------------------------------------------------------
142*9e201c85SYohann // L-vector -> E-vector, offsets provided
143*9e201c85SYohann //------------------------------------------------------------------------------
144*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
145*9e201c85SYohann inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
146*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d)
147*9e201c85SYohann     for (CeedInt z = 0; z < P1d; ++z) {
148*9e201c85SYohann       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
149*9e201c85SYohann       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
150*9e201c85SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp)
151*9e201c85SYohann         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
152*9e201c85SYohann     }
153*9e201c85SYohann }
154*9e201c85SYohann 
155*9e201c85SYohann //------------------------------------------------------------------------------
156*9e201c85SYohann // L-vector -> E-vector, strided
157*9e201c85SYohann //------------------------------------------------------------------------------
158*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
159*9e201c85SYohann inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
160*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d)
161*9e201c85SYohann     for (CeedInt z = 0; z < P1d; ++z) {
162*9e201c85SYohann       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
163*9e201c85SYohann       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
164*9e201c85SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp)
165*9e201c85SYohann         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
166*9e201c85SYohann     }
167*9e201c85SYohann }
168*9e201c85SYohann 
169*9e201c85SYohann //------------------------------------------------------------------------------
170*9e201c85SYohann // E-vector -> Q-vector, offests provided
171*9e201c85SYohann //------------------------------------------------------------------------------
172*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int Q1d>
173*9e201c85SYohann inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
174*9e201c85SYohann   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
175*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
176*9e201c85SYohann     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
177*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
178*9e201c85SYohann       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
179*9e201c85SYohann   }
180*9e201c85SYohann }
181*9e201c85SYohann 
182*9e201c85SYohann //------------------------------------------------------------------------------
183*9e201c85SYohann // E-vector -> Q-vector, strided
184*9e201c85SYohann //------------------------------------------------------------------------------
185*9e201c85SYohann template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
186*9e201c85SYohann inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
187*9e201c85SYohann   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
188*9e201c85SYohann     const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
189*9e201c85SYohann     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
190*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp)
191*9e201c85SYohann       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
192*9e201c85SYohann   }
193*9e201c85SYohann }
194*9e201c85SYohann 
195*9e201c85SYohann //------------------------------------------------------------------------------
196*9e201c85SYohann // E-vector -> L-vector, offsets provided
197*9e201c85SYohann //------------------------------------------------------------------------------
198*9e201c85SYohann template <int NCOMP, int COMPSTRIDE, int P1d>
199*9e201c85SYohann inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
200*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d)
201*9e201c85SYohann     for (CeedInt z = 0; z < P1d; ++z) {
202*9e201c85SYohann       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
203*9e201c85SYohann       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
204*9e201c85SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp)
205*9e201c85SYohann         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
206*9e201c85SYohann     }
207*9e201c85SYohann }
208*9e201c85SYohann 
209*9e201c85SYohann //------------------------------------------------------------------------------
210*9e201c85SYohann // E-vector -> L-vector, strided
211*9e201c85SYohann //------------------------------------------------------------------------------
212*9e201c85SYohann template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
213*9e201c85SYohann inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
214*9e201c85SYohann   if (data.t_id_x < P1d && data.t_id_y < P1d)
215*9e201c85SYohann     for (CeedInt z = 0; z < P1d; ++z) {
216*9e201c85SYohann       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
217*9e201c85SYohann       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
218*9e201c85SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp)
219*9e201c85SYohann         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
220*9e201c85SYohann     }
221*9e201c85SYohann }
222*9e201c85SYohann 
223*9e201c85SYohann //------------------------------------------------------------------------------
224*9e201c85SYohann // 3D collocated derivatives computation
225*9e201c85SYohann //------------------------------------------------------------------------------
226*9e201c85SYohann template <int NCOMP, int Q1d>
227*9e201c85SYohann inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
228*9e201c85SYohann   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
229*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
230*9e201c85SYohann       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
231*9e201c85SYohann       __syncthreads();
232*9e201c85SYohann       // X derivative
233*9e201c85SYohann       r_V[comp+0*NCOMP] = 0.0;
234*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
235*9e201c85SYohann         r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
236*9e201c85SYohann       // Y derivative
237*9e201c85SYohann       r_V[comp+1*NCOMP] = 0.0;
238*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
239*9e201c85SYohann         r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
240*9e201c85SYohann       // Z derivative
241*9e201c85SYohann       r_V[comp+2*NCOMP] = 0.0;
242*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
243*9e201c85SYohann         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
244*9e201c85SYohann       __syncthreads();
245*9e201c85SYohann     }
246*9e201c85SYohann   }
247*9e201c85SYohann }
248*9e201c85SYohann 
249*9e201c85SYohann //------------------------------------------------------------------------------
250*9e201c85SYohann // 3D collocated derivatives transpose
251*9e201c85SYohann //------------------------------------------------------------------------------
252*9e201c85SYohann template <int NCOMP, int Q1d>
253*9e201c85SYohann inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
254*9e201c85SYohann   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
255*9e201c85SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
256*9e201c85SYohann       // X derivative
257*9e201c85SYohann       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
258*9e201c85SYohann       __syncthreads();
259*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
260*9e201c85SYohann         r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
261*9e201c85SYohann       __syncthreads();
262*9e201c85SYohann       // Y derivative
263*9e201c85SYohann       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
264*9e201c85SYohann       __syncthreads();
265*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
266*9e201c85SYohann         r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
267*9e201c85SYohann       __syncthreads();
268*9e201c85SYohann       // Z derivative
269*9e201c85SYohann       for (CeedInt i = 0; i < Q1d; ++i)
270*9e201c85SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
271*9e201c85SYohann     }
272*9e201c85SYohann   }
273*9e201c85SYohann }
274*9e201c85SYohann 
275*9e201c85SYohann #endif
276