xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 9e201c85545dd39529c090846df629a32c15659b)
1*9e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e201c85SYohann //
4*9e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
5*9e201c85SYohann //
6*9e201c85SYohann // This file is part of CEED:  http://github.com/ceed
7*9e201c85SYohann 
8*9e201c85SYohann /// @file
9*9e201c85SYohann /// Internal header for CUDA backend macro and type definitions for JiT source
10*9e201c85SYohann #ifndef _ceed_cuda_gen_templates_h
11*9e201c85SYohann #define _ceed_cuda_gen_templates_h
12*9e201c85SYohann 
13*9e201c85SYohann #include <ceed/types.h>
14*9e201c85SYohann 
15*9e201c85SYohann //------------------------------------------------------------------------------
16*9e201c85SYohann // Load matrices for basis actions
17*9e201c85SYohann //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix from global memory into B
  // (typically shared memory): each thread handles every blockSize-th entry,
  // starting at its flat thread id, so consecutive threads touch consecutive
  // (coalesced) global addresses.
  const CeedInt block_size = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt idx = data.t_id; idx < P * Q; idx += block_size) {
    B[idx] = d_B[idx];
  }
}
23*9e201c85SYohann 
24*9e201c85SYohann //------------------------------------------------------------------------------
25*9e201c85SYohann // 1D
26*9e201c85SYohann //------------------------------------------------------------------------------
27*9e201c85SYohann 
28*9e201c85SYohann //------------------------------------------------------------------------------
29*9e201c85SYohann // L-vector -> E-vector, offsets provided
30*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one node's values from the L-vector d_u into per-thread registers
  // r_u, using the element restriction offsets in `indices`.
  // Thread x-id selects the node; threads beyond P1d have nothing to do.
  // (nnodes is unused here; kept for signature parity with the strided variant.)
  if (data.t_id_x >= P1d) return;
  const CeedInt dof = indices[data.t_id_x + elem * P1d];  // L-vector index of this node
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[dof + COMPSTRIDE * comp];
  }
}
40*9e201c85SYohann 
41*9e201c85SYohann //------------------------------------------------------------------------------
42*9e201c85SYohann // L-vector -> E-vector, strided
43*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one node's values from the strided L-vector d_u into registers r_u.
  // Strides are compile-time constants so the addressing folds at JiT time.
  if (data.t_id_x >= P1d) return;
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
53*9e201c85SYohann 
54*9e201c85SYohann //------------------------------------------------------------------------------
55*9e201c85SYohann // E-vector -> L-vector, offsets provided
56*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-accumulate register values r_v into the L-vector d_v through the
  // restriction offsets. atomicAdd is required because nodes shared between
  // elements map to the same L-vector entry across concurrent threads.
  // NOTE(review): atomicAdd on double needs SM60+ if CeedScalar is double.
  if (data.t_id_x >= P1d) return;
  const CeedInt dof = indices[data.t_id_x + elem * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    atomicAdd(&d_v[dof + COMPSTRIDE * comp], r_v[comp]);
  }
}
66*9e201c85SYohann 
67*9e201c85SYohann //------------------------------------------------------------------------------
68*9e201c85SYohann // E-vector -> L-vector, strided
69*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate register values r_v into the strided L-vector d_v.
  // Plain += (no atomics): strided layouts give each (node, elem) a unique
  // destination, so no two threads write the same entry.
  if (data.t_id_x >= P1d) return;
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
79*9e201c85SYohann 
80*9e201c85SYohann //------------------------------------------------------------------------------
81*9e201c85SYohann // 2D
82*9e201c85SYohann //------------------------------------------------------------------------------
83*9e201c85SYohann 
84*9e201c85SYohann //------------------------------------------------------------------------------
85*9e201c85SYohann // L-vector -> E-vector, offsets provided
86*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one 2D node's values from the L-vector into registers via the
  // restriction offsets; the (x, y) thread pair selects the tensor node.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + P1d * data.t_id_y;
  const CeedInt dof  = indices[node + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[dof + COMPSTRIDE * comp];
  }
}
96*9e201c85SYohann 
97*9e201c85SYohann //------------------------------------------------------------------------------
98*9e201c85SYohann // L-vector -> E-vector, strided
99*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one 2D node's values from the strided L-vector into registers.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + P1d * data.t_id_y;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
109*9e201c85SYohann 
110*9e201c85SYohann //------------------------------------------------------------------------------
111*9e201c85SYohann // E-vector -> L-vector, offsets provided
112*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-accumulate one 2D node's register values into the L-vector.
  // atomicAdd guards against concurrent writes to nodes shared by elements.
  // NOTE(review): atomicAdd on double needs SM60+ if CeedScalar is double.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + P1d * data.t_id_y;
  const CeedInt dof  = indices[node + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    atomicAdd(&d_v[dof + COMPSTRIDE * comp], r_v[comp]);
  }
}
122*9e201c85SYohann 
123*9e201c85SYohann //------------------------------------------------------------------------------
124*9e201c85SYohann // E-vector -> L-vector, strided
125*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate one 2D node's register values into the strided L-vector.
  // No atomics needed: strided layouts give each (node, elem) a unique slot.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + P1d * data.t_id_y;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
135*9e201c85SYohann 
136*9e201c85SYohann //------------------------------------------------------------------------------
137*9e201c85SYohann // 3D
138*9e201c85SYohann //------------------------------------------------------------------------------
139*9e201c85SYohann 
140*9e201c85SYohann //------------------------------------------------------------------------------
141*9e201c85SYohann // L-vector -> E-vector, offsets provided
142*9e201c85SYohann //------------------------------------------------------------------------------
143*9e201c85SYohann // TODO: remove "Dofs" and "Quads" in the following function names?
144*9e201c85SYohann //   - readDofsOffset3d -> readOffset3d ?
145*9e201c85SYohann //   - readDofsStrided3d -> readStrided3d ?
146*9e201c85SYohann //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
147*9e201c85SYohann //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
148*9e201c85SYohann //   - writeDofsOffset3d -> writeOffset3d ?
149*9e201c85SYohann //   - writeDofsStrided3d -> writeStrided3d ?
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-line of nodes per (x, y) thread from the L-vector into
  // registers, via restriction offsets. Register layout: r_u[z + comp*P1d],
  // i.e. the z-line is contiguous within each component.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + P1d * (data.t_id_y + P1d * z);
    const CeedInt dof  = indices[node + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z + comp * P1d] = d_u[dof + COMPSTRIDE * comp];
    }
  }
}
160*9e201c85SYohann 
161*9e201c85SYohann //------------------------------------------------------------------------------
162*9e201c85SYohann // L-vector -> E-vector, strided
163*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-line of nodes per (x, y) thread from the strided L-vector
  // into registers; layout r_u[z + comp*P1d] matches the offset variant.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + P1d * (data.t_id_y + P1d * z);
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z + comp * P1d] = d_u[base + comp * STRIDES_COMP];
    }
  }
}
174*9e201c85SYohann 
175*9e201c85SYohann //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
177*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Read the z = q slice of quadrature-point values from the E-vector d_u
  // into registers r_u, one point per (x, y) thread, via the offsets in
  // `indices`. (nquads is unused; kept for signature parity.)
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];  // fixed: stray ';;' removed
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
187*9e201c85SYohann 
188*9e201c85SYohann //------------------------------------------------------------------------------
189*9e201c85SYohann // E-vector -> Q-vector, strided
190*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Read the z = q slice of quadrature-point values from the strided E-vector
  // into registers, one point per (x, y) thread.
  if (data.t_id_x >= Q1d || data.t_id_y >= Q1d) return;
  const CeedInt node = data.t_id_x + Q1d * (data.t_id_y + Q1d * q);
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
200*9e201c85SYohann 
201*9e201c85SYohann //------------------------------------------------------------------------------
202*9e201c85SYohann // E-vector -> L-vector, offsets provided
203*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-accumulate a full z-line of register values per (x, y) thread
  // into the L-vector through the restriction offsets. atomicAdd protects
  // L-vector entries shared between elements.
  // NOTE(review): atomicAdd on double needs SM60+ if CeedScalar is double.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + P1d * (data.t_id_y + P1d * z);
    const CeedInt dof  = indices[node + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[dof + COMPSTRIDE * comp], r_v[z + comp * P1d]);
    }
  }
}
214*9e201c85SYohann 
215*9e201c85SYohann //------------------------------------------------------------------------------
216*9e201c85SYohann // E-vector -> L-vector, strided
217*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate a full z-line of register values per (x, y) thread into the
  // strided L-vector. Plain += suffices: each (node, elem) slot is unique.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + P1d * (data.t_id_y + P1d * z);
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P1d];
    }
  }
}
228*9e201c85SYohann 
229*9e201c85SYohann //------------------------------------------------------------------------------
230*9e201c85SYohann // 3D collocated derivatives computation
231*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient at the z = q slice: for each component, contract the
  // collocated derivative matrix c_G in x, y (via a shared-memory slice of
  // r_U) and z (register-only), writing the three derivatives to
  // r_V[comp + d*NCOMP], d = 0 (x), 1 (y), 2 (z).
  //
  // FIX(review): the original placed __syncthreads() inside the
  // (t_id < Q1d) guard. __syncthreads() must be reached by every thread in
  // the block; whenever the block is wider than Q1d (e.g. T_1D > Q1d) that
  // is undefined behavior. The barriers are hoisted to block-uniform
  // positions; the per-thread work and results are unchanged.
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's z = q slice so neighbors can read it below
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      // Z derivative (register-only, no shared-memory staging needed)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    __syncthreads(); // protect slice before the next component overwrites it
  }
}
254*9e201c85SYohann 
255*9e201c85SYohann //------------------------------------------------------------------------------
256*9e201c85SYohann // 3D collocated derivatives transpose
257*9e201c85SYohann //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of the collocated gradient at the z = q slice: accumulates
  // G^T contractions of the three derivative components r_U[comp + d*NCOMP]
  // (d = 0 (x), 1 (y), 2 (z)) into r_V[.. + comp*Q1d].
  //
  // FIX(review): the original placed all four __syncthreads() inside the
  // (t_id < Q1d) guard. __syncthreads() must be reached by every thread in
  // the block; whenever the block is wider than Q1d (e.g. T_1D > Q1d) that
  // is undefined behavior. The barriers are hoisted to block-uniform
  // positions; the per-thread work and results are unchanged.
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative: stage the x-derivative component in shared memory
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative: stage the y-derivative component in shared memory
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      // Z derivative: register-only partial contraction, no staging needed
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
    __syncthreads(); // protect slice before the next component overwrites it
  }
}
280*9e201c85SYohann 
281*9e201c85SYohann #endif
282