xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 6574a04ff2135c3834f1b6ef9a4ec7566c4782db)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
849aac155SJeremy L Thompson #include <ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
122b730f8bSJeremy L Thompson 
137f5b9731SStan Tomov #include "ceed-magma.h"
14e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
15f6af633fSnbeams #include "../hip/ceed-hip-common.h"
16f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
17f6af633fSnbeams #else
18f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
19f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
20f6af633fSnbeams #endif
217f5b9731SStan Tomov 
227f5b9731SStan Tomov #ifdef __cplusplus
237f5b9731SStan Tomov CEED_INTERN "C"
247f5b9731SStan Tomov #endif
252b730f8bSJeremy L Thompson     int
262b730f8bSJeremy L Thompson     CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
277f5b9731SStan Tomov   Ceed ceed;
282b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
29e0582403Sabdelfattah83   CeedInt dim, ncomp, ndof;
302b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
312b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
322b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
33e0582403Sabdelfattah83 
34e0582403Sabdelfattah83   Ceed_Magma *data;
352b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
36e0582403Sabdelfattah83 
37*6574a04fSJeremy L Thompson   const CeedScalar *du;
38*6574a04fSJeremy L Thompson   CeedScalar       *dv;
39*6574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
40*6574a04fSJeremy L Thompson   else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
41*6574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
427f5b9731SStan Tomov 
437f5b9731SStan Tomov   CeedBasis_Magma *impl;
442b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
457f5b9731SStan Tomov 
467f5b9731SStan Tomov   CeedInt P1d, Q1d;
472b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d));
482b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d));
497f5b9731SStan Tomov 
502b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp);
517f5b9731SStan Tomov 
527f5b9731SStan Tomov   if (tmode == CEED_TRANSPOSE) {
531f9221feSJeremy L Thompson     CeedSize length;
542b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
5580a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
56*6574a04fSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
5780a9ef05SNatalie Beams     } else {
58*6574a04fSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
5980a9ef05SNatalie Beams     }
60e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
617f5b9731SStan Tomov   }
62f6af633fSnbeams 
633513a710Sjeremylt   switch (emode) {
643513a710Sjeremylt     case CEED_EVAL_INTERP: {
657f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
667f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
672b730f8bSJeremy L Thompson         P = Q1d;
682b730f8bSJeremy L Thompson         Q = P1d;
697f5b9731SStan Tomov       }
707f5b9731SStan Tomov 
717f5b9731SStan Tomov       // Define element sizes for dofs/quad
727f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
737f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
747f5b9731SStan Tomov 
757f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
76868539c2SNatalie Beams       //  component                        component
77868539c2SNatalie Beams       //    elem                             elem
787f5b9731SStan Tomov       //       node                            node
797f5b9731SStan Tomov 
807f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
81*6574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
827f5b9731SStan Tomov 
837f5b9731SStan Tomov       // Element strides
84868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
857f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
867f5b9731SStan Tomov       // Component strides
87868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
887f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
897f5b9731SStan Tomov 
907f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
917f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
92*6574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
937f5b9731SStan Tomov         // Element strides
94868539c2SNatalie Beams         v_elstride = eldofssize;
957f5b9731SStan Tomov         u_elstride = elquadsize;
967f5b9731SStan Tomov         // Component strides
97868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
987f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
997f5b9731SStan Tomov       }
1007f5b9731SStan Tomov 
101f6af633fSnbeams       CeedInt nthreads = 1;
102f6af633fSnbeams       CeedInt ntcol    = 1;
103f6af633fSnbeams       CeedInt shmem    = 0;
104f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
105f6af633fSnbeams 
106f6af633fSnbeams       switch (dim) {
107f6af633fSnbeams         case 1:
108f6af633fSnbeams           nthreads = maxPQ;
109f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
110f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
111f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
112f6af633fSnbeams           break;
113f6af633fSnbeams         case 2:
114f6af633fSnbeams           nthreads = maxPQ;
115f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
116f6af633fSnbeams           shmem += P * Q * sizeof(CeedScalar);                // for sT
1172b730f8bSJeremy L Thompson           shmem += ntcol * (P * maxPQ * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
118f6af633fSnbeams           break;
119f6af633fSnbeams         case 3:
120f6af633fSnbeams           nthreads = maxPQ * maxPQ;
121f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
122f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);  // for sT
1232b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
1242b730f8bSJeremy L Thompson                    (CeedIntMax(P * P * maxPQ,
125f6af633fSnbeams                                P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
126f6af633fSnbeams       }
127f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
128*6574a04fSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &du, &u_elstride, &u_compstride, &dv, &v_elstride, &v_compstride, &nelem};
129f6af633fSnbeams 
130f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
1312b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args));
132f6af633fSnbeams       } else {
1332b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args));
134f6af633fSnbeams       }
1352b730f8bSJeremy L Thompson     } break;
1363513a710Sjeremylt     case CEED_EVAL_GRAD: {
1377f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
1387f5b9731SStan Tomov       // In CEED_NOTRANSPOSE mode:
139*6574a04fSJeremy L Thompson       // du is (P^dim x nc), column-major layout (nc = ncomp)
140*6574a04fSJeremy L Thompson       // dv is (Q^dim x nc x dim), column-major layout (nc = ncomp)
141*6574a04fSJeremy L Thompson       // In CEED_TRANSPOSE mode, the sizes of du and dv are switched.
1427f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1437f5b9731SStan Tomov         P = Q1d, Q = P1d;
1447f5b9731SStan Tomov       }
1457f5b9731SStan Tomov 
1467f5b9731SStan Tomov       // Define element sizes for dofs/quad
1477f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
1487f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
1497f5b9731SStan Tomov 
1507f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
1517f5b9731SStan Tomov       //                                  dim
152868539c2SNatalie Beams       //  component                        component
153868539c2SNatalie Beams       //    elem                              elem
1547f5b9731SStan Tomov       //       node                            node
1557f5b9731SStan Tomov 
1567f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
157*6574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
1587f5b9731SStan Tomov 
1597f5b9731SStan Tomov       // Element strides
160868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
1617f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
1627f5b9731SStan Tomov       // Component strides
163868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
1647f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
1657f5b9731SStan Tomov       // Dimension strides
1667f5b9731SStan Tomov       CeedInt u_dimstride = 0;
1677f5b9731SStan Tomov       CeedInt v_dimstride = nelem * elquadsize * ncomp;
1687f5b9731SStan Tomov 
1697f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
1707f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
171*6574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
1727f5b9731SStan Tomov         // Element strides
173868539c2SNatalie Beams         v_elstride = eldofssize;
1747f5b9731SStan Tomov         u_elstride = elquadsize;
1757f5b9731SStan Tomov         // Component strides
176868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1777f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1787f5b9731SStan Tomov         // Dimension strides
1797f5b9731SStan Tomov         v_dimstride = 0;
1807f5b9731SStan Tomov         u_dimstride = nelem * elquadsize * ncomp;
1817f5b9731SStan Tomov       }
1827f5b9731SStan Tomov 
183f6af633fSnbeams       CeedInt nthreads = 1;
184f6af633fSnbeams       CeedInt ntcol    = 1;
185f6af633fSnbeams       CeedInt shmem    = 0;
186f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
187f6af633fSnbeams 
188f6af633fSnbeams       switch (dim) {
189f6af633fSnbeams         case 1:
190f6af633fSnbeams           nthreads = maxPQ;
191f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
192f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
193f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
194f6af633fSnbeams           break;
195f6af633fSnbeams         case 2:
196f6af633fSnbeams           nthreads = maxPQ;
197f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
198f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;            // for sTinterp and sTgrad
1992b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
200f6af633fSnbeams           break;
201f6af633fSnbeams         case 3:
202f6af633fSnbeams           nthreads = maxPQ * maxPQ;
203f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
204f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
2052b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
2062b730f8bSJeremy L Thompson                    CeedIntMax(P * P * P,
2072b730f8bSJeremy L Thompson                               (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
208f6af633fSnbeams       }
209f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
210*6574a04fSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &impl->dgrad1d, &du,          &u_elstride, &u_compstride, &u_dimstride, &dv,
2112b730f8bSJeremy L Thompson                         &v_elstride,      &v_compstride,  &v_dimstride, &nelem};
212f6af633fSnbeams 
213f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
2142b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args));
215f6af633fSnbeams       } else {
2162b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args));
217f6af633fSnbeams       }
2182b730f8bSJeremy L Thompson     } break;
2193513a710Sjeremylt     case CEED_EVAL_WEIGHT: {
220*6574a04fSJeremy L Thompson       CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
2217f5b9731SStan Tomov       CeedInt Q          = Q1d;
222f6af633fSnbeams       CeedInt eldofssize = CeedIntPow(Q, dim);
223f6af633fSnbeams       CeedInt nthreads   = 1;
224f6af633fSnbeams       CeedInt ntcol      = 1;
225f6af633fSnbeams       CeedInt shmem      = 0;
226f6af633fSnbeams 
227f6af633fSnbeams       switch (dim) {
228f6af633fSnbeams         case 1:
229f6af633fSnbeams           nthreads = Q;
230f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
231f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;          // for dqweight1d
232f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * Q;  // for output
233f6af633fSnbeams           break;
234f6af633fSnbeams         case 2:
235f6af633fSnbeams           nthreads = Q;
236f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
237f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
238f6af633fSnbeams           break;
239f6af633fSnbeams         case 3:
240f6af633fSnbeams           nthreads = Q * Q;
241f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
242f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
243f6af633fSnbeams       }
244f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
245*6574a04fSJeremy L Thompson       void   *args[] = {&impl->dqweight1d, &dv, &eldofssize, &nelem};
246f6af633fSnbeams 
2472b730f8bSJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args));
2482b730f8bSJeremy L Thompson     } break;
2493513a710Sjeremylt     // LCOV_EXCL_START
2503513a710Sjeremylt     case CEED_EVAL_DIV:
251e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2523513a710Sjeremylt     case CEED_EVAL_CURL:
253e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2543513a710Sjeremylt     case CEED_EVAL_NONE:
2552b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
2563513a710Sjeremylt       // LCOV_EXCL_STOP
2573513a710Sjeremylt   }
2587f5b9731SStan Tomov 
259e0582403Sabdelfattah83   // must sync to ensure completeness
260e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
261e0582403Sabdelfattah83 
2627f5b9731SStan Tomov   if (emode != CEED_EVAL_WEIGHT) {
263*6574a04fSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
2647f5b9731SStan Tomov   }
265*6574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
266e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2677f5b9731SStan Tomov }
2687f5b9731SStan Tomov 
2697f5b9731SStan Tomov #ifdef __cplusplus
2707f5b9731SStan Tomov CEED_INTERN "C"
2717f5b9731SStan Tomov #endif
2722b730f8bSJeremy L Thompson     int
273023b8a51Sabdelfattah83     CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
274868539c2SNatalie Beams   Ceed ceed;
2752b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
276e0582403Sabdelfattah83 
277e0582403Sabdelfattah83   Ceed_Magma *data;
2782b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
279e0582403Sabdelfattah83 
280023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
281023b8a51Sabdelfattah83 
282868539c2SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
2832b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2842b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
2852b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
2862b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt));
287868539c2SNatalie Beams   const CeedScalar *du;
288868539c2SNatalie Beams   CeedScalar       *dv;
289*6574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
290*6574a04fSJeremy L Thompson   else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
2912b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
292868539c2SNatalie Beams 
293868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
2942b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
295868539c2SNatalie Beams 
2962b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp);
297868539c2SNatalie Beams 
298868539c2SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
2991f9221feSJeremy L Thompson     CeedSize length;
3002b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
30180a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
3022b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
30380a9ef05SNatalie Beams     } else {
3042b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
30580a9ef05SNatalie Beams     }
306e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
307868539c2SNatalie Beams   }
30880a9ef05SNatalie Beams 
309023b8a51Sabdelfattah83   CeedInt            P = ndof, Q = nqpt, N = nelem * ncomp;
310023b8a51Sabdelfattah83   CeedInt            NB = 1;
311023b8a51Sabdelfattah83   CeedMagmaFunction *interp, *grad;
312868539c2SNatalie Beams 
313023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
314023b8a51Sabdelfattah83   CeedInt iN                                       = 0;
315023b8a51Sabdelfattah83   CeedInt diff                                     = abs(Narray[iN] - N);
316023b8a51Sabdelfattah83   for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
317023b8a51Sabdelfattah83     CeedInt idiff = abs(Narray[in] - N);
318023b8a51Sabdelfattah83     if (idiff < diff) {
319023b8a51Sabdelfattah83       iN   = in;
320023b8a51Sabdelfattah83       diff = idiff;
321868539c2SNatalie Beams     }
32280a9ef05SNatalie Beams   }
32380a9ef05SNatalie Beams 
324023b8a51Sabdelfattah83   NB     = nontensor_rtc_get_nb(arch, 'd', emode, tmode, P, Narray[iN], Q);
325023b8a51Sabdelfattah83   interp = (tmode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
326023b8a51Sabdelfattah83   grad   = (tmode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
32780a9ef05SNatalie Beams 
32880a9ef05SNatalie Beams   switch (emode) {
32980a9ef05SNatalie Beams     case CEED_EVAL_INTERP: {
33080a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
331023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
332023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
333023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
334023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
335023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
336023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
337023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
338023b8a51Sabdelfattah83         shmem = (tmode == CEED_TRANSPOSE) ? (shmemA + shmemB) : CeedIntMax(shmemA, shmemB);
339023b8a51Sabdelfattah83 
340023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
341023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
342023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
343023b8a51Sabdelfattah83         CeedScalar    alpha = 1.0, beta = 0.0;
344023b8a51Sabdelfattah83 
345023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &alpha, &impl->dinterp, &P, &du, &K, &beta, &dv, &M};
346023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, ntcol, 1, shmem, args));
347023b8a51Sabdelfattah83       } else {
34880a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE)
349023b8a51Sabdelfattah83           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dinterp, P, du, Q, 0.0, dv, P, data->queue);
350023b8a51Sabdelfattah83         else magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dinterp, P, du, P, 0.0, dv, Q, data->queue);
351023b8a51Sabdelfattah83       }
3522b730f8bSJeremy L Thompson     } break;
35380a9ef05SNatalie Beams 
35480a9ef05SNatalie Beams     case CEED_EVAL_GRAD: {
35580a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
356023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
357023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
358023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
359023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
360023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
361023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
362023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
363023b8a51Sabdelfattah83         shmem = shmemA + shmemB;
364023b8a51Sabdelfattah83 
365023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
366023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
367023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
368023b8a51Sabdelfattah83 
369023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &impl->dgrad, &P, &du, &K, &dv, &M};
370023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, ntcol, 1, shmem, args));
371023b8a51Sabdelfattah83       } else {
37280a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE) {
37380a9ef05SNatalie Beams           CeedScalar beta = 0.0;
37480a9ef05SNatalie Beams           for (int d = 0; d < dim; d++) {
3752b730f8bSJeremy L Thompson             if (d > 0) beta = 1.0;
376023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dgrad + d * P * Q, P, du + d * nelem * ncomp * Q, Q,
377023b8a51Sabdelfattah83                                  beta, dv, P, data->queue);
37880a9ef05SNatalie Beams           }
37980a9ef05SNatalie Beams         } else {
38080a9ef05SNatalie Beams           for (int d = 0; d < dim; d++)
381023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dgrad + d * P * Q, P, du, P, 0.0,
382023b8a51Sabdelfattah83                                  dv + d * nelem * ncomp * Q, Q, data->queue);
383023b8a51Sabdelfattah83         }
384868539c2SNatalie Beams       }
3852b730f8bSJeremy L Thompson     } break;
386868539c2SNatalie Beams 
387868539c2SNatalie Beams     case CEED_EVAL_WEIGHT: {
388*6574a04fSJeremy L Thompson       CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
389868539c2SNatalie Beams 
390868539c2SNatalie Beams       int elemsPerBlock = 1;  // basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
3912b730f8bSJeremy L Thompson       int grid          = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
3922b730f8bSJeremy L Thompson       magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue);
3932b730f8bSJeremy L Thompson     } break;
394868539c2SNatalie Beams 
395868539c2SNatalie Beams     // LCOV_EXCL_START
396868539c2SNatalie Beams     case CEED_EVAL_DIV:
397e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
398868539c2SNatalie Beams     case CEED_EVAL_CURL:
399e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
400868539c2SNatalie Beams     case CEED_EVAL_NONE:
4012b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
402868539c2SNatalie Beams       // LCOV_EXCL_STOP
403868539c2SNatalie Beams   }
404868539c2SNatalie Beams 
405e0582403Sabdelfattah83   // must sync to ensure completeness
406e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
407e0582403Sabdelfattah83 
408868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
4092b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
410868539c2SNatalie Beams   }
4112b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
412e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
413868539c2SNatalie Beams }
414868539c2SNatalie Beams 
415868539c2SNatalie Beams #ifdef __cplusplus
416868539c2SNatalie Beams CEED_INTERN "C"
417868539c2SNatalie Beams #endif
4182b730f8bSJeremy L Thompson     int
4192b730f8bSJeremy L Thompson     CeedBasisDestroy_Magma(CeedBasis basis) {
4207f5b9731SStan Tomov   CeedBasis_Magma *impl;
4212b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
4227f5b9731SStan Tomov 
4232b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref1d));
4242b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp1d));
4252b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad1d));
4262b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight1d));
427f6af633fSnbeams   Ceed ceed;
4282b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
429e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
4302b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
431f6af633fSnbeams #else
4322b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
433f6af633fSnbeams #endif
4347f5b9731SStan Tomov 
4352b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
4367f5b9731SStan Tomov 
437e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4387f5b9731SStan Tomov }
4397f5b9731SStan Tomov 
4407f5b9731SStan Tomov #ifdef __cplusplus
4417f5b9731SStan Tomov CEED_INTERN "C"
4427f5b9731SStan Tomov #endif
4432b730f8bSJeremy L Thompson     int
4442b730f8bSJeremy L Thompson     CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
445868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
4462b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
447868539c2SNatalie Beams 
4482b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref));
4492b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp));
4502b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad));
4512b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight));
452023b8a51Sabdelfattah83   Ceed ceed;
453023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
454023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP
455023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
456023b8a51Sabdelfattah83     CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
457023b8a51Sabdelfattah83   }
458023b8a51Sabdelfattah83 #else
459023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
460023b8a51Sabdelfattah83     CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
461023b8a51Sabdelfattah83   }
462023b8a51Sabdelfattah83 #endif
4632b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
464868539c2SNatalie Beams 
465e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
466868539c2SNatalie Beams }
467868539c2SNatalie Beams 
468868539c2SNatalie Beams #ifdef __cplusplus
469868539c2SNatalie Beams CEED_INTERN "C"
470868539c2SNatalie Beams #endif
4712b730f8bSJeremy L Thompson     int
4722b730f8bSJeremy L Thompson     CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
4732b730f8bSJeremy L Thompson                                   const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) {
4747f5b9731SStan Tomov   CeedBasis_Magma *impl;
4752b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
4767f5b9731SStan Tomov   Ceed ceed;
4772b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
4787f5b9731SStan Tomov 
479c9f8acf2SJeremy L Thompson   // Check for supported parameters
480c9f8acf2SJeremy L Thompson   CeedInt ncomp = 0;
4812b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
482e0582403Sabdelfattah83   Ceed_Magma *data;
4832b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
484e0582403Sabdelfattah83 
485f6af633fSnbeams   // Compile kernels
486f6af633fSnbeams   char *magma_common_path;
487f6af633fSnbeams   char *interp_path, *grad_path, *weight_path;
488f6af633fSnbeams   char *basis_kernel_source;
489023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
490f6af633fSnbeams   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
4912b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
492023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
493023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
494f6af633fSnbeams   char   *interp_name_base = "ceed/jit-source/magma/interp";
495f6af633fSnbeams   CeedInt interp_name_len  = strlen(interp_name_base) + 6;
496f6af633fSnbeams   char    interp_name[interp_name_len];
4972b730f8bSJeremy L Thompson   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
4982b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
4992b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
500f6af633fSnbeams   char   *grad_name_base = "ceed/jit-source/magma/grad";
501f6af633fSnbeams   CeedInt grad_name_len  = strlen(grad_name_base) + 6;
502f6af633fSnbeams   char    grad_name[grad_name_len];
5032b730f8bSJeremy L Thompson   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
5042b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
5052b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
506f6af633fSnbeams   char   *weight_name_base = "ceed/jit-source/magma/weight";
507f6af633fSnbeams   CeedInt weight_name_len  = strlen(weight_name_base) + 6;
508f6af633fSnbeams   char    weight_name[weight_name_len];
5092b730f8bSJeremy L Thompson   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
5102b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
5112b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
5122b730f8bSJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
513f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
514f6af633fSnbeams   // data
515f6af633fSnbeams   Ceed delegate;
5162b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
5172b730f8bSJeremy L Thompson   CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ",
5182b730f8bSJeremy L Thompson                                    CeedIntMax(P1d, Q1d)));
519f6af633fSnbeams 
520f6af633fSnbeams   // Kernel setup
521f6af633fSnbeams   switch (dim) {
522f6af633fSnbeams     case 1:
5232b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
5242b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
5252b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
5262b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
5272b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
528f6af633fSnbeams       break;
529f6af633fSnbeams     case 2:
5302b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
5312b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
5322b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
5332b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
5342b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
535f6af633fSnbeams       break;
536f6af633fSnbeams     case 3:
5372b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
5382b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
5392b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
5402b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
5412b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
542f6af633fSnbeams   }
543f6af633fSnbeams 
5442b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
5452b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
5467f5b9731SStan Tomov 
5477f5b9731SStan Tomov   // Copy qref1d to the GPU
5482b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0])));
5492b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue);
5507f5b9731SStan Tomov 
5517f5b9731SStan Tomov   // Copy interp1d to the GPU
5522b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0])));
5532b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue);
5547f5b9731SStan Tomov 
5557f5b9731SStan Tomov   // Copy grad1d to the GPU
5562b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0])));
5572b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue);
5587f5b9731SStan Tomov 
5597f5b9731SStan Tomov   // Copy qweight1d to the GPU
5602b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0])));
5612b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue);
5627f5b9731SStan Tomov 
5632b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
5642b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&magma_common_path));
5652b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&interp_path));
5662b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&grad_path));
5672b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&weight_path));
5682b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
569f6af633fSnbeams 
570e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5717f5b9731SStan Tomov }
5727f5b9731SStan Tomov 
5737f5b9731SStan Tomov #ifdef __cplusplus
5747f5b9731SStan Tomov CEED_INTERN "C"
5757f5b9731SStan Tomov #endif
5762b730f8bSJeremy L Thompson     int
5772b730f8bSJeremy L Thompson     CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad,
5782b730f8bSJeremy L Thompson                             const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) {
579868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
5807f5b9731SStan Tomov   Ceed                      ceed;
5812b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5827f5b9731SStan Tomov 
583e0582403Sabdelfattah83   Ceed_Magma *data;
5842b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
585023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
5862b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
587023b8a51Sabdelfattah83   // Compile kernels
588023b8a51Sabdelfattah83   char *magma_common_path;
589023b8a51Sabdelfattah83   char *interp_path, *grad_path;
590023b8a51Sabdelfattah83   char *basis_kernel_source;
591023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
592023b8a51Sabdelfattah83   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
593023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
594023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
595023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
596023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
597023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
598023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
599023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
600023b8a51Sabdelfattah83 
601023b8a51Sabdelfattah83   // tuning parameters for nb
602023b8a51Sabdelfattah83   CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
603023b8a51Sabdelfattah83   CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
604023b8a51Sabdelfattah83   CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
605023b8a51Sabdelfattah83   CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
606023b8a51Sabdelfattah83   CeedInt P = ndof, Q = nqpts;
607023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
608023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
609023b8a51Sabdelfattah83     nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, Narray[in], Q);
610023b8a51Sabdelfattah83     nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, Narray[in], Q);
611023b8a51Sabdelfattah83     nb_grad_n[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, Narray[in], Q);
612023b8a51Sabdelfattah83     nb_grad_t[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, Narray[in], Q);
613023b8a51Sabdelfattah83   }
614023b8a51Sabdelfattah83 
615023b8a51Sabdelfattah83   // compile
616023b8a51Sabdelfattah83   Ceed delegate;
617023b8a51Sabdelfattah83   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
618023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
619023b8a51Sabdelfattah83     CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", nb_interp_n[in],
620023b8a51Sabdelfattah83                                      "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
621023b8a51Sabdelfattah83   }
622023b8a51Sabdelfattah83 
623023b8a51Sabdelfattah83   // get kernels
624023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
625023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
626023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
627023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
628023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
629023b8a51Sabdelfattah83   }
630023b8a51Sabdelfattah83 
631023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
632023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
633868539c2SNatalie Beams 
634868539c2SNatalie Beams   // Copy qref to the GPU
6352b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0])));
636e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
637868539c2SNatalie Beams 
638868539c2SNatalie Beams   // Copy interp to the GPU
6392b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0])));
6402b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue);
641868539c2SNatalie Beams 
642868539c2SNatalie Beams   // Copy grad to the GPU
6432b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0])));
6442b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue);
645868539c2SNatalie Beams 
646868539c2SNatalie Beams   // Copy qweight to the GPU
6472b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0])));
6482b730f8bSJeremy L Thompson   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue);
649868539c2SNatalie Beams 
650023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisSetData(basis, impl));
651023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&magma_common_path));
652023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&interp_path));
653023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&grad_path));
654023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&basis_kernel_source));
655e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6567f5b9731SStan Tomov }
657