xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 38293ee66094a4bc140c5a2101071dba903c8073)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
849aac155SJeremy L Thompson #include <ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
122b730f8bSJeremy L Thompson 
13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
14f6af633fSnbeams #include "../hip/ceed-hip-common.h"
15f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
16f6af633fSnbeams #else
17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
19f6af633fSnbeams #endif
2000fb7a04SSebastian Grimberg #include "ceed-magma-common.h"
2100fb7a04SSebastian Grimberg #include "ceed-magma.h"
227f5b9731SStan Tomov 
237f5b9731SStan Tomov #ifdef __cplusplus
247f5b9731SStan Tomov CEED_INTERN "C"
257f5b9731SStan Tomov #endif
262b730f8bSJeremy L Thompson     int
27*38293ee6SJeremy L Thompson     CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
287f5b9731SStan Tomov   Ceed              ceed;
29e0582403Sabdelfattah83   Ceed_Magma       *data;
30*38293ee6SJeremy L Thompson   CeedInt           dim, num_comp, num_dof, P_1d, Q_1d;
316574a04fSJeremy L Thompson   const CeedScalar *du;
326574a04fSJeremy L Thompson   CeedScalar       *dv;
33*38293ee6SJeremy L Thompson   CeedBasis_Magma  *impl;
34*38293ee6SJeremy L Thompson 
35*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
36*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
37*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
38*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
39*38293ee6SJeremy L Thompson 
40*38293ee6SJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
41*38293ee6SJeremy L Thompson 
426574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
43*38293ee6SJeremy L Thompson   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
446574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
457f5b9731SStan Tomov 
462b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
477f5b9731SStan Tomov 
48*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
49*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
507f5b9731SStan Tomov 
51*38293ee6SJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * CeedIntPow(P_1d, dim), num_comp);
527f5b9731SStan Tomov 
53*38293ee6SJeremy L Thompson   if (t_mode == CEED_TRANSPOSE) {
541f9221feSJeremy L Thompson     CeedSize length;
55*38293ee6SJeremy L Thompson 
562b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
5780a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
586574a04fSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
5980a9ef05SNatalie Beams     } else {
606574a04fSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
6180a9ef05SNatalie Beams     }
62e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
637f5b9731SStan Tomov   }
64f6af633fSnbeams 
65*38293ee6SJeremy L Thompson   switch (e_mode) {
663513a710Sjeremylt     case CEED_EVAL_INTERP: {
67*38293ee6SJeremy L Thompson       CeedInt P = P_1d, Q = Q_1d;
68*38293ee6SJeremy L Thompson 
69*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
70*38293ee6SJeremy L Thompson         P = Q_1d;
71*38293ee6SJeremy L Thompson         Q = P_1d;
727f5b9731SStan Tomov       }
737f5b9731SStan Tomov 
747f5b9731SStan Tomov       // Define element sizes for dofs/quad
75*38293ee6SJeremy L Thompson       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
76*38293ee6SJeremy L Thompson       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);
777f5b9731SStan Tomov 
787f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
79868539c2SNatalie Beams       //  component                        component
80868539c2SNatalie Beams       //    elem                             elem
817f5b9731SStan Tomov       //       node                            node
827f5b9731SStan Tomov 
837f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
846574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
857f5b9731SStan Tomov 
867f5b9731SStan Tomov       // Element strides
87*38293ee6SJeremy L Thompson       CeedInt u_elem_stride = elem_dofs_size;
88*38293ee6SJeremy L Thompson       CeedInt v_elem_stride = elem_qpts_size;
897f5b9731SStan Tomov       // Component strides
90*38293ee6SJeremy L Thompson       CeedInt u_comp_stride = num_elem * elem_dofs_size;
91*38293ee6SJeremy L Thompson       CeedInt v_comp_stride = num_elem * elem_qpts_size;
927f5b9731SStan Tomov 
937f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
94*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
956574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
967f5b9731SStan Tomov         // Element strides
97*38293ee6SJeremy L Thompson         v_elem_stride = elem_dofs_size;
98*38293ee6SJeremy L Thompson         u_elem_stride = elem_qpts_size;
997f5b9731SStan Tomov         // Component strides
100*38293ee6SJeremy L Thompson         v_comp_stride = num_elem * elem_dofs_size;
101*38293ee6SJeremy L Thompson         u_comp_stride = num_elem * elem_qpts_size;
1027f5b9731SStan Tomov       }
103*38293ee6SJeremy L Thompson       CeedInt num_threads = 1;
104*38293ee6SJeremy L Thompson       CeedInt num_t_col   = 1;
105*38293ee6SJeremy L Thompson       CeedInt shared_mem  = 0;
106*38293ee6SJeremy L Thompson       CeedInt max_P_Q     = CeedIntMax(P, Q);
107f6af633fSnbeams 
108f6af633fSnbeams       switch (dim) {
109f6af633fSnbeams         case 1:
110*38293ee6SJeremy L Thompson           num_threads = max_P_Q;
111*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
112*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
113*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * (P * Q);
114f6af633fSnbeams           break;
115f6af633fSnbeams         case 2:
116*38293ee6SJeremy L Thompson           num_threads = max_P_Q;
117*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
118*38293ee6SJeremy L Thompson           shared_mem += P * Q * sizeof(CeedScalar);                      // for sT
119*38293ee6SJeremy L Thompson           shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
120f6af633fSnbeams           break;
121f6af633fSnbeams         case 3:
122*38293ee6SJeremy L Thompson           num_threads = max_P_Q * max_P_Q;
123*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
124*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * (P * Q);  // for sT
125*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col *
126*38293ee6SJeremy L Thompson                         (CeedIntMax(P * P * max_P_Q,
127f6af633fSnbeams                                     P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
128f6af633fSnbeams       }
129*38293ee6SJeremy L Thompson       CeedInt grid   = (num_elem + num_t_col - 1) / num_t_col;
130*38293ee6SJeremy L Thompson       void   *args[] = {&impl->d_interp_1d, &du, &u_elem_stride, &u_comp_stride, &dv, &v_elem_stride, &v_comp_stride, &num_elem};
131f6af633fSnbeams 
132*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
133*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
134f6af633fSnbeams       } else {
135*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, num_threads, num_t_col, 1, shared_mem, args));
136f6af633fSnbeams       }
1372b730f8bSJeremy L Thompson     } break;
1383513a710Sjeremylt     case CEED_EVAL_GRAD: {
139*38293ee6SJeremy L Thompson       CeedInt P = P_1d, Q = Q_1d;
140*38293ee6SJeremy L Thompson 
1417f5b9731SStan Tomov       // In CEED_NOTRANSPOSE mode:
142*38293ee6SJeremy L Thompson       // du is (P^dim x nc), column-major layout (nc = num_comp)
143*38293ee6SJeremy L Thompson       // dv is (Q^dim x nc x dim), column-major layout (nc = num_comp)
1446574a04fSJeremy L Thompson       // In CEED_TRANSPOSE mode, the sizes of du and dv are switched.
145*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
146*38293ee6SJeremy L Thompson         P = Q_1d;
147*38293ee6SJeremy L Thompson         Q = P_1d;
1487f5b9731SStan Tomov       }
1497f5b9731SStan Tomov 
1507f5b9731SStan Tomov       // Define element sizes for dofs/quad
151*38293ee6SJeremy L Thompson       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
152*38293ee6SJeremy L Thompson       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);
1537f5b9731SStan Tomov 
1547f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
1557f5b9731SStan Tomov       //                                  dim
156868539c2SNatalie Beams       //  component                        component
157868539c2SNatalie Beams       //    elem                              elem
1587f5b9731SStan Tomov       //       node                            node
1597f5b9731SStan Tomov 
1607f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
1616574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
1627f5b9731SStan Tomov 
1637f5b9731SStan Tomov       // Element strides
164*38293ee6SJeremy L Thompson       CeedInt u_elem_stride = elem_dofs_size;
165*38293ee6SJeremy L Thompson       CeedInt v_elem_stride = elem_qpts_size;
1667f5b9731SStan Tomov       // Component strides
167*38293ee6SJeremy L Thompson       CeedInt u_comp_stride = num_elem * elem_dofs_size;
168*38293ee6SJeremy L Thompson       CeedInt v_comp_stride = num_elem * elem_qpts_size;
1697f5b9731SStan Tomov       // Dimension strides
170*38293ee6SJeremy L Thompson       CeedInt u_dim_stride = 0;
171*38293ee6SJeremy L Thompson       CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp;
1727f5b9731SStan Tomov 
1737f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
174*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
1756574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
1767f5b9731SStan Tomov         // Element strides
177*38293ee6SJeremy L Thompson         v_elem_stride = elem_dofs_size;
178*38293ee6SJeremy L Thompson         u_elem_stride = elem_qpts_size;
1797f5b9731SStan Tomov         // Component strides
180*38293ee6SJeremy L Thompson         v_comp_stride = num_elem * elem_dofs_size;
181*38293ee6SJeremy L Thompson         u_comp_stride = num_elem * elem_qpts_size;
1827f5b9731SStan Tomov         // Dimension strides
183*38293ee6SJeremy L Thompson         v_dim_stride = 0;
184*38293ee6SJeremy L Thompson         u_dim_stride = num_elem * elem_qpts_size * num_comp;
1857f5b9731SStan Tomov       }
186*38293ee6SJeremy L Thompson       CeedInt num_threads = 1;
187*38293ee6SJeremy L Thompson       CeedInt num_t_col   = 1;
188*38293ee6SJeremy L Thompson       CeedInt shared_mem  = 0;
189*38293ee6SJeremy L Thompson       CeedInt max_P_Q     = CeedIntMax(P, Q);
190f6af633fSnbeams 
191f6af633fSnbeams       switch (dim) {
192f6af633fSnbeams         case 1:
193*38293ee6SJeremy L Thompson           num_threads = max_P_Q;
194*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
195*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
196*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * (P * Q);
197f6af633fSnbeams           break;
198f6af633fSnbeams         case 2:
199*38293ee6SJeremy L Thompson           num_threads = max_P_Q;
200*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
201*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * 2 * P * Q;                  // for sTinterp and sTgrad
202*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
203f6af633fSnbeams           break;
204f6af633fSnbeams         case 3:
205*38293ee6SJeremy L Thompson           num_threads = max_P_Q * max_P_Q;
206*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
207*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
208*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col *
2092b730f8bSJeremy L Thompson                         CeedIntMax(P * P * P,
2102b730f8bSJeremy L Thompson                                    (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
211f6af633fSnbeams       }
212*38293ee6SJeremy L Thompson       CeedInt grid   = (num_elem + num_t_col - 1) / num_t_col;
213*38293ee6SJeremy L Thompson       void   *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &du,           &u_elem_stride, &u_comp_stride, &u_dim_stride, &dv,
214*38293ee6SJeremy L Thompson                         &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};
215f6af633fSnbeams 
216*38293ee6SJeremy L Thompson       if (t_mode == CEED_TRANSPOSE) {
217*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
218f6af633fSnbeams       } else {
219*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, num_threads, num_t_col, 1, shared_mem, args));
220f6af633fSnbeams       }
2212b730f8bSJeremy L Thompson     } break;
2223513a710Sjeremylt     case CEED_EVAL_WEIGHT: {
223*38293ee6SJeremy L Thompson       CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
224*38293ee6SJeremy L Thompson       CeedInt Q              = Q_1d;
225*38293ee6SJeremy L Thompson       CeedInt elem_dofs_size = CeedIntPow(Q, dim);
226*38293ee6SJeremy L Thompson       CeedInt num_threads    = 1;
227*38293ee6SJeremy L Thompson       CeedInt num_t_col      = 1;
228*38293ee6SJeremy L Thompson       CeedInt shared_mem     = 0;
229f6af633fSnbeams 
230f6af633fSnbeams       switch (dim) {
231f6af633fSnbeams         case 1:
232*38293ee6SJeremy L Thompson           num_threads = Q;
233*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
234*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * Q;              // for d_q_weight_1d
235*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * num_t_col * Q;  // for output
236f6af633fSnbeams           break;
237f6af633fSnbeams         case 2:
238*38293ee6SJeremy L Thompson           num_threads = Q;
239*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
240*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
241f6af633fSnbeams           break;
242f6af633fSnbeams         case 3:
243*38293ee6SJeremy L Thompson           num_threads = Q * Q;
244*38293ee6SJeremy L Thompson           num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
245*38293ee6SJeremy L Thompson           shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
246f6af633fSnbeams       }
247*38293ee6SJeremy L Thompson       CeedInt grid   = (num_elem + num_t_col - 1) / num_t_col;
248*38293ee6SJeremy L Thompson       void   *args[] = {&impl->d_q_weight_1d, &dv, &elem_dofs_size, &num_elem};
249f6af633fSnbeams 
250*38293ee6SJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, num_threads, num_t_col, 1, shared_mem, args));
2512b730f8bSJeremy L Thompson     } break;
2523513a710Sjeremylt     // LCOV_EXCL_START
2533513a710Sjeremylt     case CEED_EVAL_DIV:
254e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2553513a710Sjeremylt     case CEED_EVAL_CURL:
256e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2573513a710Sjeremylt     case CEED_EVAL_NONE:
2582b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
2593513a710Sjeremylt       // LCOV_EXCL_STOP
2603513a710Sjeremylt   }
2617f5b9731SStan Tomov 
262e0582403Sabdelfattah83   // must sync to ensure completeness
263e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
264e0582403Sabdelfattah83 
265*38293ee6SJeremy L Thompson   if (e_mode != CEED_EVAL_WEIGHT) {
2666574a04fSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
2677f5b9731SStan Tomov   }
2686574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
269e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2707f5b9731SStan Tomov }
2717f5b9731SStan Tomov 
2727f5b9731SStan Tomov #ifdef __cplusplus
2737f5b9731SStan Tomov CEED_INTERN "C"
2747f5b9731SStan Tomov #endif
2752b730f8bSJeremy L Thompson     int
276*38293ee6SJeremy L Thompson     CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
277868539c2SNatalie Beams   Ceed                      ceed;
278e0582403Sabdelfattah83   Ceed_Magma               *data;
279*38293ee6SJeremy L Thompson   CeedInt                   dim, num_comp, num_dof, num_qpts, NB = 1;
280868539c2SNatalie Beams   const CeedScalar         *du;
281868539c2SNatalie Beams   CeedScalar               *dv;
282*38293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
283*38293ee6SJeremy L Thompson   CeedMagmaFunction        *interp, *grad;
284*38293ee6SJeremy L Thompson 
285*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
286*38293ee6SJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
287*38293ee6SJeremy L Thompson   magma_int_t arch = magma_getdevice_arch();
288*38293ee6SJeremy L Thompson 
289*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
290*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
291*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
292*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
293*38293ee6SJeremy L Thompson   CeedInt P = num_dof, Q = num_qpts, N = num_elem * num_comp;
294*38293ee6SJeremy L Thompson 
2956574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
296*38293ee6SJeremy L Thompson   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
2972b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
298868539c2SNatalie Beams 
2992b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
300868539c2SNatalie Beams 
301*38293ee6SJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * num_dof, num_comp);
302868539c2SNatalie Beams 
303*38293ee6SJeremy L Thompson   if (t_mode == CEED_TRANSPOSE) {
3041f9221feSJeremy L Thompson     CeedSize length;
305*38293ee6SJeremy L Thompson 
3062b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
30780a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
3082b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
30980a9ef05SNatalie Beams     } else {
3102b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
31180a9ef05SNatalie Beams     }
312e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
313868539c2SNatalie Beams   }
31480a9ef05SNatalie Beams 
315*38293ee6SJeremy L Thompson   CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
316023b8a51Sabdelfattah83   CeedInt iN                                        = 0;
317*38293ee6SJeremy L Thompson   CeedInt diff                                      = abs(n_array[iN] - N);
318*38293ee6SJeremy L Thompson 
319023b8a51Sabdelfattah83   for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
320*38293ee6SJeremy L Thompson     CeedInt idiff = abs(n_array[in] - N);
321023b8a51Sabdelfattah83     if (idiff < diff) {
322023b8a51Sabdelfattah83       iN   = in;
323023b8a51Sabdelfattah83       diff = idiff;
324868539c2SNatalie Beams     }
32580a9ef05SNatalie Beams   }
32680a9ef05SNatalie Beams 
327*38293ee6SJeremy L Thompson   NB     = nontensor_rtc_get_nb(arch, 'd', e_mode, t_mode, P, n_array[iN], Q);
328*38293ee6SJeremy L Thompson   interp = (t_mode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
329*38293ee6SJeremy L Thompson   grad   = (t_mode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
33080a9ef05SNatalie Beams 
331*38293ee6SJeremy L Thompson   switch (e_mode) {
33280a9ef05SNatalie Beams     case CEED_EVAL_INTERP: {
333*38293ee6SJeremy L Thompson       CeedInt P = num_dof, Q = num_qpts;
334023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
335*38293ee6SJeremy L Thompson         CeedInt M          = (t_mode == CEED_TRANSPOSE) ? P : Q;
336*38293ee6SJeremy L Thompson         CeedInt K          = (t_mode == CEED_TRANSPOSE) ? Q : P;
337*38293ee6SJeremy L Thompson         CeedInt num_t_col  = MAGMA_NONTENSOR_BASIS_NTCOL(M);
338*38293ee6SJeremy L Thompson         CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
339*38293ee6SJeremy L Thompson         shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
340*38293ee6SJeremy L Thompson         shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
341*38293ee6SJeremy L Thompson         shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
342023b8a51Sabdelfattah83 
343*38293ee6SJeremy L Thompson         CeedInt       grid    = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
344*38293ee6SJeremy L Thompson         magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
345*38293ee6SJeremy L Thompson         magma_trans_t trans_B = MagmaNoTrans;
346023b8a51Sabdelfattah83         CeedScalar    alpha = 1.0, beta = 0.0;
347023b8a51Sabdelfattah83 
348*38293ee6SJeremy L Thompson         void *args[] = {&trans_A, &trans_B, &N, &alpha, &impl->d_interp, &P, &du, &K, &beta, &dv, &M};
349*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, num_t_col, 1, shared_mem, args));
350023b8a51Sabdelfattah83       } else {
351*38293ee6SJeremy L Thompson         if (t_mode == CEED_TRANSPOSE) {
352*38293ee6SJeremy L Thompson           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_interp, P, du, Q, 0.0, dv, P, data->queue);
353*38293ee6SJeremy L Thompson         } else {
354*38293ee6SJeremy L Thompson           magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_interp, P, du, P, 0.0, dv, Q, data->queue);
355*38293ee6SJeremy L Thompson         }
356023b8a51Sabdelfattah83       }
3572b730f8bSJeremy L Thompson     } break;
35880a9ef05SNatalie Beams 
35980a9ef05SNatalie Beams     case CEED_EVAL_GRAD: {
360*38293ee6SJeremy L Thompson       CeedInt P = num_dof, Q = num_qpts;
361023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
362*38293ee6SJeremy L Thompson         CeedInt M          = (t_mode == CEED_TRANSPOSE) ? P : Q;
363*38293ee6SJeremy L Thompson         CeedInt K          = (t_mode == CEED_TRANSPOSE) ? Q : P;
364*38293ee6SJeremy L Thompson         CeedInt num_t_col  = MAGMA_NONTENSOR_BASIS_NTCOL(M);
365*38293ee6SJeremy L Thompson         CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
366*38293ee6SJeremy L Thompson         shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
367*38293ee6SJeremy L Thompson         shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
368*38293ee6SJeremy L Thompson         shared_mem = shared_mem_A + shared_mem_B;
369023b8a51Sabdelfattah83 
370*38293ee6SJeremy L Thompson         CeedInt       grid    = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
371*38293ee6SJeremy L Thompson         magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
372*38293ee6SJeremy L Thompson         magma_trans_t trans_B = MagmaNoTrans;
373023b8a51Sabdelfattah83 
374*38293ee6SJeremy L Thompson         void *args[] = {&trans_A, &trans_B, &N, &impl->d_grad, &P, &du, &K, &dv, &M};
375*38293ee6SJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, num_t_col, 1, shared_mem, args));
376023b8a51Sabdelfattah83       } else {
377*38293ee6SJeremy L Thompson         if (t_mode == CEED_TRANSPOSE) {
37880a9ef05SNatalie Beams           CeedScalar beta = 0.0;
37980a9ef05SNatalie Beams           for (int d = 0; d < dim; d++) {
3802b730f8bSJeremy L Thompson             if (d > 0) beta = 1.0;
381*38293ee6SJeremy L Thompson             magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_grad + d * P * Q, P,
382*38293ee6SJeremy L Thompson                                  du + d * num_elem * num_comp * Q, Q, beta, dv, P, data->queue);
38380a9ef05SNatalie Beams           }
38480a9ef05SNatalie Beams         } else {
38580a9ef05SNatalie Beams           for (int d = 0; d < dim; d++)
386*38293ee6SJeremy L Thompson             magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_grad + d * P * Q, P, du, P, 0.0,
387*38293ee6SJeremy L Thompson                                  dv + d * num_elem * num_comp * Q, Q, data->queue);
388023b8a51Sabdelfattah83         }
389868539c2SNatalie Beams       }
3902b730f8bSJeremy L Thompson     } break;
391868539c2SNatalie Beams 
392868539c2SNatalie Beams     case CEED_EVAL_WEIGHT: {
393*38293ee6SJeremy L Thompson       CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
394868539c2SNatalie Beams 
395*38293ee6SJeremy L Thompson       int elemsPerBlock = 1;  // basis->Q_1d < 7 ? optElems[basis->Q_1d] : 1;
396*38293ee6SJeremy L Thompson       int grid          = num_elem / elemsPerBlock + ((num_elem / elemsPerBlock * elemsPerBlock < num_elem) ? 1 : 0);
397*38293ee6SJeremy L Thompson 
398*38293ee6SJeremy L Thompson       magma_weight_nontensor(grid, num_qpts, num_elem, num_qpts, impl->d_q_weight, dv, data->queue);
3992b730f8bSJeremy L Thompson     } break;
400868539c2SNatalie Beams 
401868539c2SNatalie Beams     // LCOV_EXCL_START
402868539c2SNatalie Beams     case CEED_EVAL_DIV:
403e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
404868539c2SNatalie Beams     case CEED_EVAL_CURL:
405e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
406868539c2SNatalie Beams     case CEED_EVAL_NONE:
4072b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
408868539c2SNatalie Beams       // LCOV_EXCL_STOP
409868539c2SNatalie Beams   }
410868539c2SNatalie Beams 
411e0582403Sabdelfattah83   // must sync to ensure completeness
412e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
413e0582403Sabdelfattah83 
414*38293ee6SJeremy L Thompson   if (e_mode != CEED_EVAL_WEIGHT) {
4152b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
416868539c2SNatalie Beams   }
4172b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
418e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
419868539c2SNatalie Beams }
420868539c2SNatalie Beams 
421868539c2SNatalie Beams #ifdef __cplusplus
422868539c2SNatalie Beams CEED_INTERN "C"
423868539c2SNatalie Beams #endif
4242b730f8bSJeremy L Thompson     int
4252b730f8bSJeremy L Thompson     CeedBasisDestroy_Magma(CeedBasis basis) {
426f6af633fSnbeams   Ceed             ceed;
427*38293ee6SJeremy L Thompson   CeedBasis_Magma *impl;
428*38293ee6SJeremy L Thompson 
429*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
430*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_q_ref_1d));
431*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_interp_1d));
432*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_grad_1d));
433*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_q_weight_1d));
4342b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
435e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
4362b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
437f6af633fSnbeams #else
4382b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
439f6af633fSnbeams #endif
4402b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
441e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4427f5b9731SStan Tomov }
4437f5b9731SStan Tomov 
4447f5b9731SStan Tomov #ifdef __cplusplus
4457f5b9731SStan Tomov CEED_INTERN "C"
4467f5b9731SStan Tomov #endif
4472b730f8bSJeremy L Thompson     int
4482b730f8bSJeremy L Thompson     CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
449023b8a51Sabdelfattah83   Ceed                      ceed;
450*38293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
451*38293ee6SJeremy L Thompson 
452*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
453*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_q_ref));
454*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_interp));
455*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_grad));
456*38293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_q_weight));
457023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
458023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP
459023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
460023b8a51Sabdelfattah83     CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
461023b8a51Sabdelfattah83   }
462023b8a51Sabdelfattah83 #else
463023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
464023b8a51Sabdelfattah83     CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
465023b8a51Sabdelfattah83   }
466023b8a51Sabdelfattah83 #endif
4672b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
468e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
469868539c2SNatalie Beams }
470868539c2SNatalie Beams 
471868539c2SNatalie Beams #ifdef __cplusplus
472868539c2SNatalie Beams CEED_INTERN "C"
473868539c2SNatalie Beams #endif
4742b730f8bSJeremy L Thompson     int
475*38293ee6SJeremy L Thompson     CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
476*38293ee6SJeremy L Thompson                                   const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
477*38293ee6SJeremy L Thompson   Ceed             ceed, ceed_delegate;
478*38293ee6SJeremy L Thompson   Ceed_Magma      *data;
479*38293ee6SJeremy L Thompson   char            *magma_common_path, *interp_path, *grad_path, *weight_path, *basis_kernel_source;
480*38293ee6SJeremy L Thompson   CeedInt          num_comp = 0;
4817f5b9731SStan Tomov   CeedBasis_Magma *impl;
482*38293ee6SJeremy L Thompson 
4832b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
4842b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
4857f5b9731SStan Tomov 
486c9f8acf2SJeremy L Thompson   // Check for supported parameters
487*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
4882b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
489e0582403Sabdelfattah83 
490f6af633fSnbeams   // Compile kernels
491023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
49223d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
4932b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
494023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
495023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
496f6af633fSnbeams   char   *interp_name_base = "ceed/jit-source/magma/interp";
497f6af633fSnbeams   CeedInt interp_name_len  = strlen(interp_name_base) + 6;
498f6af633fSnbeams   char    interp_name[interp_name_len];
499*38293ee6SJeremy L Thompson 
5002b730f8bSJeremy L Thompson   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
5012b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
5022b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
503f6af633fSnbeams   char   *grad_name_base = "ceed/jit-source/magma/grad";
504f6af633fSnbeams   CeedInt grad_name_len  = strlen(grad_name_base) + 6;
505f6af633fSnbeams   char    grad_name[grad_name_len];
506*38293ee6SJeremy L Thompson 
5072b730f8bSJeremy L Thompson   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
5082b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
5092b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
510f6af633fSnbeams   char   *weight_name_base = "ceed/jit-source/magma/weight";
511f6af633fSnbeams   CeedInt weight_name_len  = strlen(weight_name_base) + 6;
512f6af633fSnbeams   char    weight_name[weight_name_len];
513*38293ee6SJeremy L Thompson 
5142b730f8bSJeremy L Thompson   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
5152b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
5162b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
51723d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
518f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
519f6af633fSnbeams   // data
520*38293ee6SJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
521*38293ee6SJeremy L Thompson   CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", num_comp, "P", P_1d, "Q", Q_1d, "MAXPQ",
522*38293ee6SJeremy L Thompson                                    CeedIntMax(P_1d, Q_1d)));
523f6af633fSnbeams 
524f6af633fSnbeams   // Kernel setup
525f6af633fSnbeams   switch (dim) {
526f6af633fSnbeams     case 1:
5272b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
5282b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
5292b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
5302b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
5312b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
532f6af633fSnbeams       break;
533f6af633fSnbeams     case 2:
5342b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
5352b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
5362b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
5372b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
5382b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
539f6af633fSnbeams       break;
540f6af633fSnbeams     case 3:
5412b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
5422b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
5432b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
5442b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
5452b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
546f6af633fSnbeams   }
547f6af633fSnbeams 
5482b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
5492b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
5507f5b9731SStan Tomov 
551*38293ee6SJeremy L Thompson   // Copy q_ref_1d to the GPU
552*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_q_ref_1d, Q_1d * sizeof(q_ref_1d[0])));
553*38293ee6SJeremy L Thompson   magma_setvector(Q_1d, sizeof(q_ref_1d[0]), q_ref_1d, 1, impl->d_q_ref_1d, 1, data->queue);
5547f5b9731SStan Tomov 
555*38293ee6SJeremy L Thompson   // Copy interp_1d to the GPU
556*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
557*38293ee6SJeremy L Thompson   magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
5587f5b9731SStan Tomov 
559*38293ee6SJeremy L Thompson   // Copy grad_1d to the GPU
560*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
561*38293ee6SJeremy L Thompson   magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
5627f5b9731SStan Tomov 
563*38293ee6SJeremy L Thompson   // Copy q_weight_1d to the GPU
564*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
565*38293ee6SJeremy L Thompson   magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
5667f5b9731SStan Tomov 
5672b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
5682b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&magma_common_path));
5692b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&interp_path));
5702b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&grad_path));
5712b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&weight_path));
5722b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
573e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5747f5b9731SStan Tomov }
5757f5b9731SStan Tomov 
5767f5b9731SStan Tomov #ifdef __cplusplus
5777f5b9731SStan Tomov CEED_INTERN "C"
5787f5b9731SStan Tomov #endif
5792b730f8bSJeremy L Thompson     int
580*38293ee6SJeremy L Thompson     CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
581*38293ee6SJeremy L Thompson                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
582*38293ee6SJeremy L Thompson   Ceed                      ceed, ceed_delegate;
583e0582403Sabdelfattah83   Ceed_Magma               *data;
584*38293ee6SJeremy L Thompson   char                     *magma_common_path, *interp_path, *grad_path, *basis_kernel_source;
585*38293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
586*38293ee6SJeremy L Thompson 
587*38293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5882b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
589023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
590*38293ee6SJeremy L Thompson 
5912b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
592023b8a51Sabdelfattah83   // Compile kernels
593023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
59423d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
595023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
596023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
597023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
598023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
599023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
600023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
601023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
602023b8a51Sabdelfattah83 
603023b8a51Sabdelfattah83   // tuning parameters for nb
604023b8a51Sabdelfattah83   CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
605023b8a51Sabdelfattah83   CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
606023b8a51Sabdelfattah83   CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
607023b8a51Sabdelfattah83   CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
608*38293ee6SJeremy L Thompson   CeedInt P = num_dof, Q = num_qpts;
609*38293ee6SJeremy L Thompson   CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
610*38293ee6SJeremy L Thompson 
611023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
612*38293ee6SJeremy L Thompson     nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, n_array[in], Q);
613*38293ee6SJeremy L Thompson     nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, n_array[in], Q);
614*38293ee6SJeremy L Thompson     nb_grad_n[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, n_array[in], Q);
615*38293ee6SJeremy L Thompson     nb_grad_t[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, n_array[in], Q);
616023b8a51Sabdelfattah83   }
617023b8a51Sabdelfattah83 
618023b8a51Sabdelfattah83   // compile
619*38293ee6SJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
620023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
621*38293ee6SJeremy L Thompson     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N",
622*38293ee6SJeremy L Thompson                                      nb_interp_n[in], "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
623023b8a51Sabdelfattah83   }
624023b8a51Sabdelfattah83 
625023b8a51Sabdelfattah83   // get kernels
626023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
627023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
628023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
629023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
630023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
631023b8a51Sabdelfattah83   }
632023b8a51Sabdelfattah83 
633023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
634023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
635868539c2SNatalie Beams 
636*38293ee6SJeremy L Thompson   // Copy q_ref to the GPU
637*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_q_ref, num_qpts * sizeof(q_ref[0])));
638*38293ee6SJeremy L Thompson   magma_setvector(num_qpts, sizeof(q_ref[0]), q_ref, 1, impl->d_q_ref, 1, data->queue);
639868539c2SNatalie Beams 
640868539c2SNatalie Beams   // Copy interp to the GPU
641*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_dof * sizeof(interp[0])));
642*38293ee6SJeremy L Thompson   magma_setvector(num_qpts * num_dof, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
643868539c2SNatalie Beams 
644868539c2SNatalie Beams   // Copy grad to the GPU
645*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_dof * dim * sizeof(grad[0])));
646*38293ee6SJeremy L Thompson   magma_setvector(num_qpts * num_dof * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
647868539c2SNatalie Beams 
648*38293ee6SJeremy L Thompson   // Copy q_weight to the GPU
649*38293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
650*38293ee6SJeremy L Thompson   magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
651868539c2SNatalie Beams 
652023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisSetData(basis, impl));
653023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&magma_common_path));
654023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&interp_path));
655023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&grad_path));
656023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&basis_kernel_source));
657e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6587f5b9731SStan Tomov }
659