xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/hip/hip-gen-templates.h (revision 692716b783f81dd51daef618726177f4c6d7441d)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
213a2968d6SJeremy L Thompson // AtPoints
223a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
233a2968d6SJeremy L Thompson 
243a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
253a2968d6SJeremy L Thompson // L-vector -> single point
263a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
273a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
283a2968d6SJeremy L Thompson inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
293a2968d6SJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
303a2968d6SJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
313a2968d6SJeremy L Thompson 
323a2968d6SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
333a2968d6SJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
343a2968d6SJeremy L Thompson   }
353a2968d6SJeremy L Thompson }
363a2968d6SJeremy L Thompson 
373a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
383a2968d6SJeremy L Thompson // Single point -> L-vector
393a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
403a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
413a2968d6SJeremy L Thompson inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
423a2968d6SJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
433a2968d6SJeremy L Thompson   if (p < points_in_elem) {
443a2968d6SJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
453a2968d6SJeremy L Thompson 
463a2968d6SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
473a2968d6SJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
483a2968d6SJeremy L Thompson     }
493a2968d6SJeremy L Thompson   }
503a2968d6SJeremy L Thompson }
513a2968d6SJeremy L Thompson 
523a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
570183ed61SJeremy L Thompson // Set E-vector value
580183ed61SJeremy L Thompson //------------------------------------------------------------------------------
590183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
600183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
610183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
620183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
630183ed61SJeremy L Thompson 
640183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
650183ed61SJeremy L Thompson     r_v[target_comp] = value;
660183ed61SJeremy L Thompson   }
670183ed61SJeremy L Thompson }
680183ed61SJeremy L Thompson 
690183ed61SJeremy L Thompson //------------------------------------------------------------------------------
709e201c85SYohann // L-vector -> E-vector, offsets provided
719e201c85SYohann //------------------------------------------------------------------------------
726b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
756b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
769e201c85SYohann     const CeedInt node = data.t_id_x;
776b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
78672b0f2aSSebastian Grimberg 
79672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // L-vector -> E-vector, strided
859e201c85SYohann //------------------------------------------------------------------------------
866b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
886b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
899e201c85SYohann     const CeedInt node = data.t_id_x;
909e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
91672b0f2aSSebastian Grimberg 
92672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
939e201c85SYohann   }
949e201c85SYohann }
959e201c85SYohann 
969e201c85SYohann //------------------------------------------------------------------------------
979e201c85SYohann // E-vector -> L-vector, offsets provided
989e201c85SYohann //------------------------------------------------------------------------------
996b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
100f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
101672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
1026b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
1039e201c85SYohann     const CeedInt node = data.t_id_x;
1046b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
105672b0f2aSSebastian Grimberg 
106672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1079e201c85SYohann   }
1089e201c85SYohann }
1099e201c85SYohann 
1100183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
1110183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
1120183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
1130183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
1140183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
1150183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
1160183ed61SJeremy L Thompson 
1170183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
1180183ed61SJeremy L Thompson     const CeedInt ind = indices[target_node + elem * P_1D];
1190183ed61SJeremy L Thompson 
1200183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
1210183ed61SJeremy L Thompson   }
1220183ed61SJeremy L Thompson }
1230183ed61SJeremy L Thompson 
1249e201c85SYohann //------------------------------------------------------------------------------
125*692716b7SZach Atkins // E-vector -> L-vector, full assembly
126*692716b7SZach Atkins //------------------------------------------------------------------------------
127*692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
128*692716b7SZach Atkins inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
129*692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
130*692716b7SZach Atkins   const CeedInt in_comp    = in / P_1D;
131*692716b7SZach Atkins   const CeedInt in_node    = in % P_1D;
132*692716b7SZach Atkins   const CeedInt e_vec_size = P_1D * NUM_COMP;
133*692716b7SZach Atkins 
134*692716b7SZach Atkins   if (data.t_id_x < P_1D) {
135*692716b7SZach Atkins     const CeedInt out_node = data.t_id_x;
136*692716b7SZach Atkins 
137*692716b7SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138*692716b7SZach Atkins       d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
139*692716b7SZach Atkins     }
140*692716b7SZach Atkins   }
141*692716b7SZach Atkins }
142*692716b7SZach Atkins 
143*692716b7SZach Atkins //------------------------------------------------------------------------------
1449e201c85SYohann // E-vector -> L-vector, strided
1459e201c85SYohann //------------------------------------------------------------------------------
1466b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
147f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
148672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
1496b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
1509e201c85SYohann     const CeedInt node = data.t_id_x;
1519e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
152672b0f2aSSebastian Grimberg 
153672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1549e201c85SYohann   }
1559e201c85SYohann }
1569e201c85SYohann 
1579e201c85SYohann //------------------------------------------------------------------------------
1589e201c85SYohann // 2D
1599e201c85SYohann //------------------------------------------------------------------------------
1609e201c85SYohann 
1619e201c85SYohann //------------------------------------------------------------------------------
1620183ed61SJeremy L Thompson // Set E-vector value
1630183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1640183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
1650183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
1660183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
1670183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
1680183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
1690183ed61SJeremy L Thompson 
1700183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
1710183ed61SJeremy L Thompson     r_v[target_comp] = value;
1720183ed61SJeremy L Thompson   }
1730183ed61SJeremy L Thompson }
1740183ed61SJeremy L Thompson 
1750183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1769e201c85SYohann // L-vector -> E-vector, offsets provided
1779e201c85SYohann //------------------------------------------------------------------------------
1786b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
179f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
180672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
1816b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1826b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
1836b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
184672b0f2aSSebastian Grimberg 
185672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
1869e201c85SYohann   }
1879e201c85SYohann }
1889e201c85SYohann 
1899e201c85SYohann //------------------------------------------------------------------------------
1909e201c85SYohann // L-vector -> E-vector, strided
1919e201c85SYohann //------------------------------------------------------------------------------
1926b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
193f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
1946b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1956b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
1969e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
197672b0f2aSSebastian Grimberg 
198672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1999e201c85SYohann   }
2009e201c85SYohann }
2019e201c85SYohann 
2029e201c85SYohann //------------------------------------------------------------------------------
2039e201c85SYohann // E-vector -> L-vector, offsets provided
2049e201c85SYohann //------------------------------------------------------------------------------
2056b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
206f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
207672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
2086b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2096b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2106b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
211672b0f2aSSebastian Grimberg 
212672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
2139e201c85SYohann   }
2149e201c85SYohann }
2159e201c85SYohann 
2160183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
2170183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
2180183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
2190183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
2200183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
2210183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2220183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
2230183ed61SJeremy L Thompson 
2240183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2250183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2260183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
2270183ed61SJeremy L Thompson 
2280183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
2290183ed61SJeremy L Thompson   }
2300183ed61SJeremy L Thompson }
2310183ed61SJeremy L Thompson 
2329e201c85SYohann //------------------------------------------------------------------------------
233*692716b7SZach Atkins // E-vector -> L-vector, full assembly
234*692716b7SZach Atkins //------------------------------------------------------------------------------
235*692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
236*692716b7SZach Atkins inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
237*692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
238*692716b7SZach Atkins   const CeedInt elem_size  = P_1D * P_1D;
239*692716b7SZach Atkins   const CeedInt in_comp    = in / elem_size;
240*692716b7SZach Atkins   const CeedInt in_node_x  = in % P_1D;
241*692716b7SZach Atkins   const CeedInt in_node_y  = (in % elem_size) / P_1D;
242*692716b7SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
243*692716b7SZach Atkins 
244*692716b7SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
245*692716b7SZach Atkins     const CeedInt in_node  = in_node_x + in_node_y * P_1D;
246*692716b7SZach Atkins     const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
247*692716b7SZach Atkins 
248*692716b7SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
249*692716b7SZach Atkins       const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
250*692716b7SZach Atkins 
251*692716b7SZach Atkins       d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
252*692716b7SZach Atkins     }
253*692716b7SZach Atkins   }
254*692716b7SZach Atkins }
255*692716b7SZach Atkins 
256*692716b7SZach Atkins //------------------------------------------------------------------------------
2579e201c85SYohann // E-vector -> L-vector, strided
2589e201c85SYohann //------------------------------------------------------------------------------
2596b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
260f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
261672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
2626b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2636b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2649e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
265672b0f2aSSebastian Grimberg 
266672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2679e201c85SYohann   }
2689e201c85SYohann }
2699e201c85SYohann 
2709e201c85SYohann //------------------------------------------------------------------------------
2719e201c85SYohann // 3D
2729e201c85SYohann //------------------------------------------------------------------------------
2739e201c85SYohann 
2749e201c85SYohann //------------------------------------------------------------------------------
2750183ed61SJeremy L Thompson // Set E-vector value
2760183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2770183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
2780183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
2790183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
2800183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2810183ed61SJeremy L Thompson   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
2820183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
2830183ed61SJeremy L Thompson 
2840183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2850183ed61SJeremy L Thompson     r_v[target_node_z + target_comp * P_1D] = value;
2860183ed61SJeremy L Thompson   }
2870183ed61SJeremy L Thompson }
2880183ed61SJeremy L Thompson 
2890183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2909e201c85SYohann // L-vector -> E-vector, offsets provided
2919e201c85SYohann //------------------------------------------------------------------------------
2926b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
293f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
294672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
2956b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2966b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
2976b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
2986b92dc4bSJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
299672b0f2aSSebastian Grimberg 
3006b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
3019e201c85SYohann     }
3029e201c85SYohann   }
303688b5473SJeremy L Thompson }
3049e201c85SYohann 
3059e201c85SYohann //------------------------------------------------------------------------------
3069e201c85SYohann // L-vector -> E-vector, strided
3079e201c85SYohann //------------------------------------------------------------------------------
3086b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
309f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
3106b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3116b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
3126b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3139e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
314672b0f2aSSebastian Grimberg 
3156b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
3169e201c85SYohann     }
3179e201c85SYohann   }
318688b5473SJeremy L Thompson }
3199e201c85SYohann 
3209e201c85SYohann //------------------------------------------------------------------------------
3219e201c85SYohann // E-vector -> Q-vector, offests provided
3229e201c85SYohann //------------------------------------------------------------------------------
3236b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
324f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
325f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
326f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
3276b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3286b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3296b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
330672b0f2aSSebastian Grimberg 
331672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
3329e201c85SYohann   }
3339e201c85SYohann }
3349e201c85SYohann 
3359e201c85SYohann //------------------------------------------------------------------------------
3369e201c85SYohann // E-vector -> Q-vector, strided
3379e201c85SYohann //------------------------------------------------------------------------------
3386b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
339f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
340672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
3416b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3426b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3439e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
344672b0f2aSSebastian Grimberg 
345672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
3469e201c85SYohann   }
3479e201c85SYohann }
3489e201c85SYohann 
3499e201c85SYohann //------------------------------------------------------------------------------
3509e201c85SYohann // E-vector -> L-vector, offsets provided
3519e201c85SYohann //------------------------------------------------------------------------------
3526b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
353f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
354672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
3556b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3566b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
3576b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3586b92dc4bSJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
359672b0f2aSSebastian Grimberg 
3606b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
3619e201c85SYohann     }
3629e201c85SYohann   }
363688b5473SJeremy L Thompson }
3649e201c85SYohann 
3650183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
3660183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
3670183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
3680183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
3690183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
3700183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
3710183ed61SJeremy L Thompson   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
3720183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
3730183ed61SJeremy L Thompson 
3740183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
3750183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
3760183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
3770183ed61SJeremy L Thompson 
3780183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
3790183ed61SJeremy L Thompson   }
3800183ed61SJeremy L Thompson }
3810183ed61SJeremy L Thompson 
3829e201c85SYohann //------------------------------------------------------------------------------
383*692716b7SZach Atkins // E-vector -> L-vector, full assembly
384*692716b7SZach Atkins //------------------------------------------------------------------------------
385*692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
386*692716b7SZach Atkins inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
387*692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
388*692716b7SZach Atkins   const CeedInt elem_size  = P_1D * P_1D * P_1D;
389*692716b7SZach Atkins   const CeedInt in_comp    = in / elem_size;
390*692716b7SZach Atkins   const CeedInt in_node_x  = in % P_1D;
391*692716b7SZach Atkins   const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
392*692716b7SZach Atkins   const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
393*692716b7SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
394*692716b7SZach Atkins 
395*692716b7SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
396*692716b7SZach Atkins     const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
397*692716b7SZach Atkins     for (CeedInt z = 0; z < P_1D; z++) {
398*692716b7SZach Atkins       const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
399*692716b7SZach Atkins 
400*692716b7SZach Atkins       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
401*692716b7SZach Atkins         const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
402*692716b7SZach Atkins 
403*692716b7SZach Atkins         d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
404*692716b7SZach Atkins       }
405*692716b7SZach Atkins     }
406*692716b7SZach Atkins   }
407*692716b7SZach Atkins }
408*692716b7SZach Atkins 
409*692716b7SZach Atkins //------------------------------------------------------------------------------
4109e201c85SYohann // E-vector -> L-vector, strided
4119e201c85SYohann //------------------------------------------------------------------------------
4126b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
413f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
414672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
4156b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4166b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
4176b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
4189e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
419672b0f2aSSebastian Grimberg 
4206b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
4219e201c85SYohann     }
4229e201c85SYohann   }
423688b5473SJeremy L Thompson }
4249e201c85SYohann 
4259e201c85SYohann //------------------------------------------------------------------------------
4269e201c85SYohann // 3D collocated derivatives computation
4279e201c85SYohann //------------------------------------------------------------------------------
4286b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
429f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4302b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
4316b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
432672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
433d6c19ee8SJeremy L Thompson       __syncthreads();
4346b92dc4bSJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
4359e201c85SYohann       __syncthreads();
4369e201c85SYohann       // X derivative
437672b0f2aSSebastian Grimberg       r_V[comp + 0 * NUM_COMP] = 0.0;
4386b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4396b92dc4bSJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
440688b5473SJeremy L Thompson       }
4419e201c85SYohann       // Y derivative
442672b0f2aSSebastian Grimberg       r_V[comp + 1 * NUM_COMP] = 0.0;
4436b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4446b92dc4bSJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
445688b5473SJeremy L Thompson       }
4469e201c85SYohann       // Z derivative
447672b0f2aSSebastian Grimberg       r_V[comp + 2 * NUM_COMP] = 0.0;
4486b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4496b92dc4bSJeremy L Thompson         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
450688b5473SJeremy L Thompson       }
4519e201c85SYohann     }
4529e201c85SYohann   }
4539e201c85SYohann }
4549e201c85SYohann 
4559e201c85SYohann //------------------------------------------------------------------------------
4569e201c85SYohann // 3D collocated derivatives transpose
4579e201c85SYohann //------------------------------------------------------------------------------
4586b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
459f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4602b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
4616b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
462672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4639e201c85SYohann       // X derivative
464d6c19ee8SJeremy L Thompson       __syncthreads();
465672b0f2aSSebastian Grimberg       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
4669e201c85SYohann       __syncthreads();
4676b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4686b92dc4bSJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
469688b5473SJeremy L Thompson       }
4709e201c85SYohann       // Y derivative
471d6c19ee8SJeremy L Thompson       __syncthreads();
472672b0f2aSSebastian Grimberg       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
4739e201c85SYohann       __syncthreads();
4746b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4756b92dc4bSJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
476688b5473SJeremy L Thompson       }
4779e201c85SYohann       // Z derivative
4786b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4796b92dc4bSJeremy L Thompson         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
480688b5473SJeremy L Thompson       }
4819e201c85SYohann     }
4829e201c85SYohann   }
4839e201c85SYohann }
484