xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 8b97b69a34af3dd9e1bc9b8fd8651212448dc9ec)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
21*8b97b69aSJeremy L Thompson // AtPoints
22*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
23*8b97b69aSJeremy L Thompson 
24*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
25*8b97b69aSJeremy L Thompson // L-vector -> single point
26*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
27*8b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
28*8b97b69aSJeremy L Thompson inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
29*8b97b69aSJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
30*8b97b69aSJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
31*8b97b69aSJeremy L Thompson 
32*8b97b69aSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
33*8b97b69aSJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
34*8b97b69aSJeremy L Thompson   }
35*8b97b69aSJeremy L Thompson }
36*8b97b69aSJeremy L Thompson 
37*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
38*8b97b69aSJeremy L Thompson // Single point -> L-vector
39*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
40*8b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
41*8b97b69aSJeremy L Thompson inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
42*8b97b69aSJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
43*8b97b69aSJeremy L Thompson   if (p < points_in_elem) {
44*8b97b69aSJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
45*8b97b69aSJeremy L Thompson 
46*8b97b69aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
47*8b97b69aSJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
48*8b97b69aSJeremy L Thompson     }
49*8b97b69aSJeremy L Thompson   }
50*8b97b69aSJeremy L Thompson }
51*8b97b69aSJeremy L Thompson 
52*8b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
579e201c85SYohann // L-vector -> E-vector, offsets provided
589e201c85SYohann //------------------------------------------------------------------------------
59ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
60f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
61672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
62ca735530SJeremy L Thompson   if (data.t_id_x < P_1d) {
639e201c85SYohann     const CeedInt node = data.t_id_x;
64ca735530SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1d];
65ca735530SJeremy L Thompson 
66ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
679e201c85SYohann   }
689e201c85SYohann }
699e201c85SYohann 
709e201c85SYohann //------------------------------------------------------------------------------
719e201c85SYohann // L-vector -> E-vector, strided
729e201c85SYohann //------------------------------------------------------------------------------
73ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
74f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
75672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
76ca735530SJeremy L Thompson   if (data.t_id_x < P_1d) {
779e201c85SYohann     const CeedInt node = data.t_id_x;
789e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
79ca735530SJeremy L Thompson 
80ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
819e201c85SYohann   }
829e201c85SYohann }
839e201c85SYohann 
849e201c85SYohann //------------------------------------------------------------------------------
859e201c85SYohann // E-vector -> L-vector, offsets provided
869e201c85SYohann //------------------------------------------------------------------------------
87ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
88f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
89672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
90ca735530SJeremy L Thompson   if (data.t_id_x < P_1d) {
919e201c85SYohann     const CeedInt node = data.t_id_x;
92ca735530SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1d];
93ca735530SJeremy L Thompson 
94ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
959e201c85SYohann   }
969e201c85SYohann }
979e201c85SYohann 
989e201c85SYohann //------------------------------------------------------------------------------
999e201c85SYohann // E-vector -> L-vector, strided
1009e201c85SYohann //------------------------------------------------------------------------------
101ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
102f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
103672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
104ca735530SJeremy L Thompson   if (data.t_id_x < P_1d) {
1059e201c85SYohann     const CeedInt node = data.t_id_x;
1069e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
107ca735530SJeremy L Thompson 
108ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1099e201c85SYohann   }
1109e201c85SYohann }
1119e201c85SYohann 
1129e201c85SYohann //------------------------------------------------------------------------------
1139e201c85SYohann // 2D
1149e201c85SYohann //------------------------------------------------------------------------------
1159e201c85SYohann 
1169e201c85SYohann //------------------------------------------------------------------------------
1179e201c85SYohann // L-vector -> E-vector, offsets provided
1189e201c85SYohann //------------------------------------------------------------------------------
119ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
120f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
121672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
122ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
123ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
124ca735530SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
125ca735530SJeremy L Thompson 
126ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
1279e201c85SYohann   }
1289e201c85SYohann }
1299e201c85SYohann 
1309e201c85SYohann //------------------------------------------------------------------------------
1319e201c85SYohann // L-vector -> E-vector, strided
1329e201c85SYohann //------------------------------------------------------------------------------
133ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
134f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
135672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
136ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
137ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
1389e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
139ca735530SJeremy L Thompson 
140ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1419e201c85SYohann   }
1429e201c85SYohann }
1439e201c85SYohann 
1449e201c85SYohann //------------------------------------------------------------------------------
1459e201c85SYohann // E-vector -> L-vector, offsets provided
1469e201c85SYohann //------------------------------------------------------------------------------
147ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
148f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
149672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
150ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
151ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
152ca735530SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
153ca735530SJeremy L Thompson 
154ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1559e201c85SYohann   }
1569e201c85SYohann }
1579e201c85SYohann 
1589e201c85SYohann //------------------------------------------------------------------------------
1599e201c85SYohann // E-vector -> L-vector, strided
1609e201c85SYohann //------------------------------------------------------------------------------
161ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
162f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
163672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
164ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
165ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
1669e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
167ca735530SJeremy L Thompson 
168ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1699e201c85SYohann   }
1709e201c85SYohann }
1719e201c85SYohann 
1729e201c85SYohann //------------------------------------------------------------------------------
1739e201c85SYohann // 3D
1749e201c85SYohann //------------------------------------------------------------------------------
1759e201c85SYohann 
1769e201c85SYohann //------------------------------------------------------------------------------
1779e201c85SYohann // L-vector -> E-vector, offsets provided
1789e201c85SYohann //------------------------------------------------------------------------------
179ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
180f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
181672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
182ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
183ca735530SJeremy L Thompson     for (CeedInt z = 0; z < P_1d; z++) {
184ca735530SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
185ca735530SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
186ca735530SJeremy L Thompson 
187ca735530SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
1889e201c85SYohann     }
1899e201c85SYohann }
1909e201c85SYohann 
1919e201c85SYohann //------------------------------------------------------------------------------
1929e201c85SYohann // L-vector -> E-vector, strided
1939e201c85SYohann //------------------------------------------------------------------------------
194ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
195f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
196672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
197ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
198ca735530SJeremy L Thompson     for (CeedInt z = 0; z < P_1d; z++) {
199ca735530SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
2009e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
201ca735530SJeremy L Thompson 
202ca735530SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
2039e201c85SYohann     }
2049e201c85SYohann }
2059e201c85SYohann 
2069e201c85SYohann //------------------------------------------------------------------------------
2079e201c85SYohann // E-vector -> Q-vector, offests provided
2089e201c85SYohann //------------------------------------------------------------------------------
209ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
210f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
211f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
212f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
213ca735530SJeremy L Thompson   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
214ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
215ca735530SJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
216ca735530SJeremy L Thompson 
217ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
2189e201c85SYohann   }
2199e201c85SYohann }
2209e201c85SYohann 
2219e201c85SYohann //------------------------------------------------------------------------------
2229e201c85SYohann // E-vector -> Q-vector, strided
2239e201c85SYohann //------------------------------------------------------------------------------
224ca735530SJeremy L Thompson template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
225f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
226672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
227ca735530SJeremy L Thompson   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
228ca735530SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
2299e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
230ca735530SJeremy L Thompson 
231ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2329e201c85SYohann   }
2339e201c85SYohann }
2349e201c85SYohann 
2359e201c85SYohann //------------------------------------------------------------------------------
2369e201c85SYohann // E-vector -> L-vector, offsets provided
2379e201c85SYohann //------------------------------------------------------------------------------
238ca735530SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1d>
239f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
240672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
241ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
242ca735530SJeremy L Thompson     for (CeedInt z = 0; z < P_1d; z++) {
243ca735530SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
244ca735530SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
245ca735530SJeremy L Thompson 
246ca735530SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
2479e201c85SYohann     }
2489e201c85SYohann }
2499e201c85SYohann 
2509e201c85SYohann //------------------------------------------------------------------------------
2519e201c85SYohann // E-vector -> L-vector, strided
2529e201c85SYohann //------------------------------------------------------------------------------
253ca735530SJeremy L Thompson template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
254f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
255672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
256ca735530SJeremy L Thompson   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
257ca735530SJeremy L Thompson     for (CeedInt z = 0; z < P_1d; z++) {
258ca735530SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
2599e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
260ca735530SJeremy L Thompson 
261ca735530SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
2629e201c85SYohann     }
2639e201c85SYohann }
2649e201c85SYohann 
2659e201c85SYohann //------------------------------------------------------------------------------
2669e201c85SYohann // 3D collocated derivatives computation
2679e201c85SYohann //------------------------------------------------------------------------------
268ca735530SJeremy L Thompson template <int NUM_COMP, int Q_1d>
269f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
2702b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
271ca735530SJeremy L Thompson   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
272ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
273ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
2749e201c85SYohann       __syncthreads();
2759e201c85SYohann       // X derivative
276ca735530SJeremy L Thompson       r_V[comp + 0 * NUM_COMP] = 0.0;
277ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++)
278ca735530SJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
2799e201c85SYohann       // Y derivative
280ca735530SJeremy L Thompson       r_V[comp + 1 * NUM_COMP] = 0.0;
281ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++)
282ca735530SJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
2839e201c85SYohann       // Z derivative
284ca735530SJeremy L Thompson       r_V[comp + 2 * NUM_COMP] = 0.0;
285ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
2869e201c85SYohann       __syncthreads();
2879e201c85SYohann     }
2889e201c85SYohann   }
2899e201c85SYohann }
2909e201c85SYohann 
2919e201c85SYohann //------------------------------------------------------------------------------
2929e201c85SYohann // 3D collocated derivatives transpose
2939e201c85SYohann //------------------------------------------------------------------------------
294ca735530SJeremy L Thompson template <int NUM_COMP, int Q_1d>
295f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
2962b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
297ca735530SJeremy L Thompson   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
298ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2999e201c85SYohann       // X derivative
300ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
3019e201c85SYohann       __syncthreads();
302ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++)
303ca735530SJeremy L Thompson         r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
3049e201c85SYohann       __syncthreads();
3059e201c85SYohann       // Y derivative
306ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
3079e201c85SYohann       __syncthreads();
308ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++)
309ca735530SJeremy L Thompson         r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
3109e201c85SYohann       __syncthreads();
3119e201c85SYohann       // Z derivative
312ca735530SJeremy L Thompson       for (CeedInt i = 0; i < Q_1d; i++)
313ca735530SJeremy L Thompson         r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
3149e201c85SYohann     }
3159e201c85SYohann   }
3169e201c85SYohann }
317