xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 2b730f8b5a9c809740a0b3b302db43a719c636b1)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
8ec3da8bcSJed Brown #include <ceed/backend.h>
9*2b730f8bSJeremy L Thompson #include <ceed/ceed.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
12*2b730f8bSJeremy L Thompson 
137f5b9731SStan Tomov #include "ceed-magma.h"
14e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
15f6af633fSnbeams #include "../hip/ceed-hip-common.h"
16f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
17f6af633fSnbeams #else
18f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
19f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
20f6af633fSnbeams #endif
217f5b9731SStan Tomov 
227f5b9731SStan Tomov #ifdef __cplusplus
237f5b9731SStan Tomov CEED_INTERN "C"
247f5b9731SStan Tomov #endif
25*2b730f8bSJeremy L Thompson     int
26*2b730f8bSJeremy L Thompson     CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
277f5b9731SStan Tomov   Ceed ceed;
28*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
29e0582403Sabdelfattah83   CeedInt dim, ncomp, ndof;
30*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
31*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
32*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
33e0582403Sabdelfattah83 
34e0582403Sabdelfattah83   Ceed_Magma *data;
35*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
36e0582403Sabdelfattah83 
377f5b9731SStan Tomov   const CeedScalar *u;
387f5b9731SStan Tomov   CeedScalar       *v;
39868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
40*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u));
417f5b9731SStan Tomov   } else if (emode != CEED_EVAL_WEIGHT) {
427f5b9731SStan Tomov     // LCOV_EXCL_START
43*2b730f8bSJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
447f5b9731SStan Tomov     // LCOV_EXCL_STOP
457f5b9731SStan Tomov   }
46*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v));
477f5b9731SStan Tomov 
487f5b9731SStan Tomov   CeedBasis_Magma *impl;
49*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
507f5b9731SStan Tomov 
517f5b9731SStan Tomov   CeedInt P1d, Q1d;
52*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d));
53*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d));
547f5b9731SStan Tomov 
55*2b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp);
567f5b9731SStan Tomov 
577f5b9731SStan Tomov   if (tmode == CEED_TRANSPOSE) {
581f9221feSJeremy L Thompson     CeedSize length;
59*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
6080a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
61*2b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)v, length, data->queue);
6280a9ef05SNatalie Beams     } else {
63*2b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)v, length, data->queue);
6480a9ef05SNatalie Beams     }
65e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
667f5b9731SStan Tomov   }
67f6af633fSnbeams 
683513a710Sjeremylt   switch (emode) {
693513a710Sjeremylt     case CEED_EVAL_INTERP: {
707f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
717f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
72*2b730f8bSJeremy L Thompson         P = Q1d;
73*2b730f8bSJeremy L Thompson         Q = P1d;
747f5b9731SStan Tomov       }
757f5b9731SStan Tomov 
767f5b9731SStan Tomov       // Define element sizes for dofs/quad
777f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
787f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
797f5b9731SStan Tomov 
807f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
81868539c2SNatalie Beams       //  component                        component
82868539c2SNatalie Beams       //    elem                             elem
837f5b9731SStan Tomov       //       node                            node
847f5b9731SStan Tomov 
857f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
867f5b9731SStan Tomov       // Input (u) is E-vector, output (v) is Q-vector
877f5b9731SStan Tomov 
887f5b9731SStan Tomov       // Element strides
89868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
907f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
917f5b9731SStan Tomov       // Component strides
92868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
937f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
947f5b9731SStan Tomov 
957f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
967f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
977f5b9731SStan Tomov         // Input (u) is Q-vector, output (v) is E-vector
987f5b9731SStan Tomov         // Element strides
99868539c2SNatalie Beams         v_elstride = eldofssize;
1007f5b9731SStan Tomov         u_elstride = elquadsize;
1017f5b9731SStan Tomov         // Component strides
102868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1037f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1047f5b9731SStan Tomov       }
1057f5b9731SStan Tomov 
106f6af633fSnbeams       CeedInt nthreads = 1;
107f6af633fSnbeams       CeedInt ntcol    = 1;
108f6af633fSnbeams       CeedInt shmem    = 0;
109f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
110f6af633fSnbeams 
111f6af633fSnbeams       switch (dim) {
112f6af633fSnbeams         case 1:
113f6af633fSnbeams           nthreads = maxPQ;
114f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
115f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
116f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
117f6af633fSnbeams           break;
118f6af633fSnbeams         case 2:
119f6af633fSnbeams           nthreads = maxPQ;
120f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
121f6af633fSnbeams           shmem += P * Q * sizeof(CeedScalar);                // for sT
122*2b730f8bSJeremy L Thompson           shmem += ntcol * (P * maxPQ * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
123f6af633fSnbeams           break;
124f6af633fSnbeams         case 3:
125f6af633fSnbeams           nthreads = maxPQ * maxPQ;
126f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
127f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);  // for sT
128*2b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
129*2b730f8bSJeremy L Thompson                    (CeedIntMax(P * P * maxPQ,
130f6af633fSnbeams                                P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
131f6af633fSnbeams       }
132f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
133*2b730f8bSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &u, &u_elstride, &u_compstride, &v, &v_elstride, &v_compstride, &nelem};
134f6af633fSnbeams 
135f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
136*2b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args));
137f6af633fSnbeams       } else {
138*2b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args));
139f6af633fSnbeams       }
140*2b730f8bSJeremy L Thompson     } break;
1413513a710Sjeremylt     case CEED_EVAL_GRAD: {
1427f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
1437f5b9731SStan Tomov       // In CEED_NOTRANSPOSE mode:
1447f5b9731SStan Tomov       // u is (P^dim x nc), column-major layout (nc = ncomp)
1457f5b9731SStan Tomov       // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
1467f5b9731SStan Tomov       // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
1477f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1487f5b9731SStan Tomov         P = Q1d, Q = P1d;
1497f5b9731SStan Tomov       }
1507f5b9731SStan Tomov 
1517f5b9731SStan Tomov       // Define element sizes for dofs/quad
1527f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
1537f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
1547f5b9731SStan Tomov 
1557f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
1567f5b9731SStan Tomov       //                                  dim
157868539c2SNatalie Beams       //  component                        component
158868539c2SNatalie Beams       //    elem                              elem
1597f5b9731SStan Tomov       //       node                            node
1607f5b9731SStan Tomov 
1617f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
1627f5b9731SStan Tomov       // Input (u) is E-vector, output (v) is Q-vector
1637f5b9731SStan Tomov 
1647f5b9731SStan Tomov       // Element strides
165868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
1667f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
1677f5b9731SStan Tomov       // Component strides
168868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
1697f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
1707f5b9731SStan Tomov       // Dimension strides
1717f5b9731SStan Tomov       CeedInt u_dimstride = 0;
1727f5b9731SStan Tomov       CeedInt v_dimstride = nelem * elquadsize * ncomp;
1737f5b9731SStan Tomov 
1747f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
1757f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1767f5b9731SStan Tomov         // Input (u) is Q-vector, output (v) is E-vector
1777f5b9731SStan Tomov         // Element strides
178868539c2SNatalie Beams         v_elstride = eldofssize;
1797f5b9731SStan Tomov         u_elstride = elquadsize;
1807f5b9731SStan Tomov         // Component strides
181868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1827f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1837f5b9731SStan Tomov         // Dimension strides
1847f5b9731SStan Tomov         v_dimstride = 0;
1857f5b9731SStan Tomov         u_dimstride = nelem * elquadsize * ncomp;
1867f5b9731SStan Tomov       }
1877f5b9731SStan Tomov 
188f6af633fSnbeams       CeedInt nthreads = 1;
189f6af633fSnbeams       CeedInt ntcol    = 1;
190f6af633fSnbeams       CeedInt shmem    = 0;
191f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
192f6af633fSnbeams 
193f6af633fSnbeams       switch (dim) {
194f6af633fSnbeams         case 1:
195f6af633fSnbeams           nthreads = maxPQ;
196f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
197f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
198f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
199f6af633fSnbeams           break;
200f6af633fSnbeams         case 2:
201f6af633fSnbeams           nthreads = maxPQ;
202f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
203f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;            // for sTinterp and sTgrad
204*2b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
205f6af633fSnbeams           break;
206f6af633fSnbeams         case 3:
207f6af633fSnbeams           nthreads = maxPQ * maxPQ;
208f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
209f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
210*2b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
211*2b730f8bSJeremy L Thompson                    CeedIntMax(P * P * P,
212*2b730f8bSJeremy L Thompson                               (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
213f6af633fSnbeams       }
214f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
215*2b730f8bSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &impl->dgrad1d, &u,           &u_elstride, &u_compstride, &u_dimstride, &v,
216*2b730f8bSJeremy L Thompson                         &v_elstride,      &v_compstride,  &v_dimstride, &nelem};
217f6af633fSnbeams 
218f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
219*2b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args));
220f6af633fSnbeams       } else {
221*2b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args));
222f6af633fSnbeams       }
223*2b730f8bSJeremy L Thompson     } break;
2243513a710Sjeremylt     case CEED_EVAL_WEIGHT: {
2257f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE)
2267f5b9731SStan Tomov         // LCOV_EXCL_START
227*2b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
2287f5b9731SStan Tomov       // LCOV_EXCL_STOP
2297f5b9731SStan Tomov       CeedInt Q          = Q1d;
230f6af633fSnbeams       CeedInt eldofssize = CeedIntPow(Q, dim);
231f6af633fSnbeams       CeedInt nthreads   = 1;
232f6af633fSnbeams       CeedInt ntcol      = 1;
233f6af633fSnbeams       CeedInt shmem      = 0;
234f6af633fSnbeams 
235f6af633fSnbeams       switch (dim) {
236f6af633fSnbeams         case 1:
237f6af633fSnbeams           nthreads = Q;
238f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
239f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;          // for dqweight1d
240f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * Q;  // for output
241f6af633fSnbeams           break;
242f6af633fSnbeams         case 2:
243f6af633fSnbeams           nthreads = Q;
244f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
245f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
246f6af633fSnbeams           break;
247f6af633fSnbeams         case 3:
248f6af633fSnbeams           nthreads = Q * Q;
249f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
250f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
251f6af633fSnbeams       }
252f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
253f6af633fSnbeams       void   *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem};
254f6af633fSnbeams 
255*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args));
256*2b730f8bSJeremy L Thompson     } break;
2573513a710Sjeremylt     // LCOV_EXCL_START
2583513a710Sjeremylt     case CEED_EVAL_DIV:
259e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2603513a710Sjeremylt     case CEED_EVAL_CURL:
261e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2623513a710Sjeremylt     case CEED_EVAL_NONE:
263*2b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
2643513a710Sjeremylt       // LCOV_EXCL_STOP
2653513a710Sjeremylt   }
2667f5b9731SStan Tomov 
267e0582403Sabdelfattah83   // must sync to ensure completeness
268e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
269e0582403Sabdelfattah83 
2707f5b9731SStan Tomov   if (emode != CEED_EVAL_WEIGHT) {
271*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &u));
2727f5b9731SStan Tomov   }
273*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &v));
274e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2757f5b9731SStan Tomov }
2767f5b9731SStan Tomov 
2777f5b9731SStan Tomov #ifdef __cplusplus
2787f5b9731SStan Tomov CEED_INTERN "C"
2797f5b9731SStan Tomov #endif
280*2b730f8bSJeremy L Thompson     int
281*2b730f8bSJeremy L Thompson     CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
282868539c2SNatalie Beams   Ceed ceed;
283*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
284e0582403Sabdelfattah83 
285e0582403Sabdelfattah83   Ceed_Magma *data;
286*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
287e0582403Sabdelfattah83 
288868539c2SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
289*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
290*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
291*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
292*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt));
293868539c2SNatalie Beams   const CeedScalar *du;
294868539c2SNatalie Beams   CeedScalar       *dv;
295868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
296*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
297868539c2SNatalie Beams   } else if (emode != CEED_EVAL_WEIGHT) {
298868539c2SNatalie Beams     // LCOV_EXCL_START
299*2b730f8bSJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
300868539c2SNatalie Beams     // LCOV_EXCL_STOP
301868539c2SNatalie Beams   }
302*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
303868539c2SNatalie Beams 
304868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
305*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
306868539c2SNatalie Beams 
307*2b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp);
308868539c2SNatalie Beams 
309868539c2SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
3101f9221feSJeremy L Thompson     CeedSize length;
311*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
31280a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
313*2b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
31480a9ef05SNatalie Beams     } else {
315*2b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
31680a9ef05SNatalie Beams     }
317e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
318868539c2SNatalie Beams   }
31980a9ef05SNatalie Beams 
320868539c2SNatalie Beams   switch (emode) {
321868539c2SNatalie Beams     case CEED_EVAL_INTERP: {
322868539c2SNatalie Beams       CeedInt P = ndof, Q = nqpt;
323868539c2SNatalie Beams       if (tmode == CEED_TRANSPOSE)
324*2b730f8bSJeremy L Thompson         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)impl->dinterp, P, (double *)du, Q, 0.0, (double *)dv, P,
325*2b730f8bSJeremy L Thompson                               data->queue);
326868539c2SNatalie Beams       else
327*2b730f8bSJeremy L Thompson         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)impl->dinterp, P, (double *)du, P, 0.0, (double *)dv, Q,
328*2b730f8bSJeremy L Thompson                               data->queue);
329*2b730f8bSJeremy L Thompson     } break;
330868539c2SNatalie Beams 
331868539c2SNatalie Beams     case CEED_EVAL_GRAD: {
332868539c2SNatalie Beams       CeedInt P = ndof, Q = nqpt;
333868539c2SNatalie Beams       if (tmode == CEED_TRANSPOSE) {
33480a9ef05SNatalie Beams         CeedScalar beta = 0.0;
335868539c2SNatalie Beams         for (int d = 0; d < dim; d++) {
336*2b730f8bSJeremy L Thompson           if (d > 0) beta = 1.0;
337*2b730f8bSJeremy L Thompson           magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)(impl->dgrad + d * P * Q), P,
338*2b730f8bSJeremy L Thompson                                 (double *)(du + d * nelem * ncomp * Q), Q, beta, (double *)dv, P, data->queue);
339868539c2SNatalie Beams         }
340868539c2SNatalie Beams       } else {
341868539c2SNatalie Beams         for (int d = 0; d < dim; d++)
342*2b730f8bSJeremy L Thompson           magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)(impl->dgrad + d * P * Q), P, (double *)du, P, 0.0,
343*2b730f8bSJeremy L Thompson                                 (double *)(dv + d * nelem * ncomp * Q), Q, data->queue);
34480a9ef05SNatalie Beams       }
345*2b730f8bSJeremy L Thompson     } break;
34680a9ef05SNatalie Beams 
34780a9ef05SNatalie Beams     case CEED_EVAL_WEIGHT: {
34880a9ef05SNatalie Beams       if (tmode == CEED_TRANSPOSE)
34980a9ef05SNatalie Beams         // LCOV_EXCL_START
350*2b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
35180a9ef05SNatalie Beams       // LCOV_EXCL_STOP
35280a9ef05SNatalie Beams 
35380a9ef05SNatalie Beams       int elemsPerBlock = 1;  // basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
354*2b730f8bSJeremy L Thompson       int grid          = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
355*2b730f8bSJeremy L Thompson       magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue);
356*2b730f8bSJeremy L Thompson     } break;
35780a9ef05SNatalie Beams 
35880a9ef05SNatalie Beams     // LCOV_EXCL_START
35980a9ef05SNatalie Beams     case CEED_EVAL_DIV:
36080a9ef05SNatalie Beams       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
36180a9ef05SNatalie Beams     case CEED_EVAL_CURL:
36280a9ef05SNatalie Beams       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
36380a9ef05SNatalie Beams     case CEED_EVAL_NONE:
364*2b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
36580a9ef05SNatalie Beams       // LCOV_EXCL_STOP
36680a9ef05SNatalie Beams   }
36780a9ef05SNatalie Beams 
36880a9ef05SNatalie Beams   // must sync to ensure completeness
36980a9ef05SNatalie Beams   ceed_magma_queue_sync(data->queue);
37080a9ef05SNatalie Beams 
37180a9ef05SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
372*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
37380a9ef05SNatalie Beams   }
374*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
37580a9ef05SNatalie Beams   return CEED_ERROR_SUCCESS;
37680a9ef05SNatalie Beams }
37780a9ef05SNatalie Beams 
378*2b730f8bSJeremy L Thompson int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
37980a9ef05SNatalie Beams   Ceed ceed;
380*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
38180a9ef05SNatalie Beams 
38280a9ef05SNatalie Beams   Ceed_Magma *data;
383*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
38480a9ef05SNatalie Beams 
38580a9ef05SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
386*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
387*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
388*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
389*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt));
39080a9ef05SNatalie Beams   const CeedScalar *du;
39180a9ef05SNatalie Beams   CeedScalar       *dv;
39280a9ef05SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
393*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
39480a9ef05SNatalie Beams   } else if (emode != CEED_EVAL_WEIGHT) {
39580a9ef05SNatalie Beams     // LCOV_EXCL_START
396*2b730f8bSJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
39780a9ef05SNatalie Beams     // LCOV_EXCL_STOP
39880a9ef05SNatalie Beams   }
399*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
40080a9ef05SNatalie Beams 
40180a9ef05SNatalie Beams   CeedBasisNonTensor_Magma *impl;
402*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
40380a9ef05SNatalie Beams 
404*2b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp);
40580a9ef05SNatalie Beams 
40680a9ef05SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
4071f9221feSJeremy L Thompson     CeedSize length;
408*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
40980a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
410*2b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
41180a9ef05SNatalie Beams     } else {
412*2b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
41380a9ef05SNatalie Beams     }
41480a9ef05SNatalie Beams     ceed_magma_queue_sync(data->queue);
41580a9ef05SNatalie Beams   }
41680a9ef05SNatalie Beams 
41780a9ef05SNatalie Beams   switch (emode) {
41880a9ef05SNatalie Beams     case CEED_EVAL_INTERP: {
41980a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
42080a9ef05SNatalie Beams       if (tmode == CEED_TRANSPOSE)
421*2b730f8bSJeremy L Thompson         magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)impl->dinterp, P, (float *)du, Q, 0.0, (float *)dv, P,
422*2b730f8bSJeremy L Thompson                               data->queue);
42380a9ef05SNatalie Beams       else
424*2b730f8bSJeremy L Thompson         magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)impl->dinterp, P, (float *)du, P, 0.0, (float *)dv, Q,
425*2b730f8bSJeremy L Thompson                               data->queue);
426*2b730f8bSJeremy L Thompson     } break;
42780a9ef05SNatalie Beams 
42880a9ef05SNatalie Beams     case CEED_EVAL_GRAD: {
42980a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
43080a9ef05SNatalie Beams       if (tmode == CEED_TRANSPOSE) {
43180a9ef05SNatalie Beams         CeedScalar beta = 0.0;
43280a9ef05SNatalie Beams         for (int d = 0; d < dim; d++) {
433*2b730f8bSJeremy L Thompson           if (d > 0) beta = 1.0;
434*2b730f8bSJeremy L Thompson           magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)(impl->dgrad + d * P * Q), P,
435*2b730f8bSJeremy L Thompson                                 (float *)(du + d * nelem * ncomp * Q), Q, beta, (float *)dv, P, data->queue);
43680a9ef05SNatalie Beams         }
43780a9ef05SNatalie Beams       } else {
43880a9ef05SNatalie Beams         for (int d = 0; d < dim; d++)
439*2b730f8bSJeremy L Thompson           magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)(impl->dgrad + d * P * Q), P, (float *)du, P, 0.0,
440*2b730f8bSJeremy L Thompson                                 (float *)(dv + d * nelem * ncomp * Q), Q, data->queue);
441868539c2SNatalie Beams       }
442*2b730f8bSJeremy L Thompson     } break;
443868539c2SNatalie Beams 
444868539c2SNatalie Beams     case CEED_EVAL_WEIGHT: {
445868539c2SNatalie Beams       if (tmode == CEED_TRANSPOSE)
446868539c2SNatalie Beams         // LCOV_EXCL_START
447*2b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
448868539c2SNatalie Beams       // LCOV_EXCL_STOP
449868539c2SNatalie Beams 
450868539c2SNatalie Beams       int elemsPerBlock = 1;  // basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
451*2b730f8bSJeremy L Thompson       int grid          = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
452*2b730f8bSJeremy L Thompson       magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue);
453*2b730f8bSJeremy L Thompson     } break;
454868539c2SNatalie Beams 
455868539c2SNatalie Beams     // LCOV_EXCL_START
456868539c2SNatalie Beams     case CEED_EVAL_DIV:
457e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
458868539c2SNatalie Beams     case CEED_EVAL_CURL:
459e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
460868539c2SNatalie Beams     case CEED_EVAL_NONE:
461*2b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
462868539c2SNatalie Beams       // LCOV_EXCL_STOP
463868539c2SNatalie Beams   }
464868539c2SNatalie Beams 
465e0582403Sabdelfattah83   // must sync to ensure completeness
466e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
467e0582403Sabdelfattah83 
468868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
469*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
470868539c2SNatalie Beams   }
471*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
472e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
473868539c2SNatalie Beams }
474868539c2SNatalie Beams 
475868539c2SNatalie Beams #ifdef __cplusplus
476868539c2SNatalie Beams CEED_INTERN "C"
477868539c2SNatalie Beams #endif
478*2b730f8bSJeremy L Thompson     int
479*2b730f8bSJeremy L Thompson     CeedBasisDestroy_Magma(CeedBasis basis) {
4807f5b9731SStan Tomov   CeedBasis_Magma *impl;
481*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
4827f5b9731SStan Tomov 
483*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref1d));
484*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp1d));
485*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad1d));
486*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight1d));
487f6af633fSnbeams   Ceed ceed;
488*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
489e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
490*2b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
491f6af633fSnbeams #else
492*2b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
493f6af633fSnbeams #endif
4947f5b9731SStan Tomov 
495*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
4967f5b9731SStan Tomov 
497e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4987f5b9731SStan Tomov }
4997f5b9731SStan Tomov 
5007f5b9731SStan Tomov #ifdef __cplusplus
5017f5b9731SStan Tomov CEED_INTERN "C"
5027f5b9731SStan Tomov #endif
503*2b730f8bSJeremy L Thompson     int
504*2b730f8bSJeremy L Thompson     CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
505868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
506*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
507868539c2SNatalie Beams 
508*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref));
509*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp));
510*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad));
511*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight));
512868539c2SNatalie Beams 
513*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
514868539c2SNatalie Beams 
515e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
516868539c2SNatalie Beams }
517868539c2SNatalie Beams 
518868539c2SNatalie Beams #ifdef __cplusplus
519868539c2SNatalie Beams CEED_INTERN "C"
520868539c2SNatalie Beams #endif
521*2b730f8bSJeremy L Thompson     int
522*2b730f8bSJeremy L Thompson     CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
523*2b730f8bSJeremy L Thompson                                   const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) {
5247f5b9731SStan Tomov   CeedBasis_Magma *impl;
525*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
5267f5b9731SStan Tomov   Ceed ceed;
527*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5287f5b9731SStan Tomov 
529c9f8acf2SJeremy L Thompson   // Check for supported parameters
530c9f8acf2SJeremy L Thompson   CeedInt ncomp = 0;
531*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
532e0582403Sabdelfattah83   Ceed_Magma *data;
533*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
534e0582403Sabdelfattah83 
535f6af633fSnbeams   // Compile kernels
536f6af633fSnbeams   char *magma_common_path;
537f6af633fSnbeams   char *interp_path, *grad_path, *weight_path;
538f6af633fSnbeams   char *basis_kernel_source;
539*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_device.h", &magma_common_path));
540f6af633fSnbeams   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
541*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
542f6af633fSnbeams   char   *interp_name_base = "ceed/jit-source/magma/interp";
543f6af633fSnbeams   CeedInt interp_name_len  = strlen(interp_name_base) + 6;
544f6af633fSnbeams   char    interp_name[interp_name_len];
545*2b730f8bSJeremy L Thompson   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
546*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
547*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
548f6af633fSnbeams   char   *grad_name_base = "ceed/jit-source/magma/grad";
549f6af633fSnbeams   CeedInt grad_name_len  = strlen(grad_name_base) + 6;
550f6af633fSnbeams   char    grad_name[grad_name_len];
551*2b730f8bSJeremy L Thompson   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
552*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
553*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
554f6af633fSnbeams   char   *weight_name_base = "ceed/jit-source/magma/weight";
555f6af633fSnbeams   CeedInt weight_name_len  = strlen(weight_name_base) + 6;
556f6af633fSnbeams   char    weight_name[weight_name_len];
557*2b730f8bSJeremy L Thompson   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
558*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
559*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
560*2b730f8bSJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
561f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
562f6af633fSnbeams   // data
563f6af633fSnbeams   Ceed delegate;
564*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
565*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ",
566*2b730f8bSJeremy L Thompson                                    CeedIntMax(P1d, Q1d)));
567f6af633fSnbeams 
568f6af633fSnbeams   // Kernel setup
569f6af633fSnbeams   switch (dim) {
570f6af633fSnbeams     case 1:
571*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
572*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
573*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
574*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
575*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
576f6af633fSnbeams       break;
577f6af633fSnbeams     case 2:
578*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
579*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
580*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
581*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
582*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
583f6af633fSnbeams       break;
584f6af633fSnbeams     case 3:
585*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
586*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
587*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
588*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
589*2b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
590f6af633fSnbeams   }
591f6af633fSnbeams 
592*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
593*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
5947f5b9731SStan Tomov 
5957f5b9731SStan Tomov   // Copy qref1d to the GPU
596*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0])));
597*2b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue);
5987f5b9731SStan Tomov 
5997f5b9731SStan Tomov   // Copy interp1d to the GPU
600*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0])));
601*2b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue);
6027f5b9731SStan Tomov 
6037f5b9731SStan Tomov   // Copy grad1d to the GPU
604*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0])));
605*2b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue);
6067f5b9731SStan Tomov 
6077f5b9731SStan Tomov   // Copy qweight1d to the GPU
608*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0])));
609*2b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue);
6107f5b9731SStan Tomov 
611*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
612*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
613*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&magma_common_path));
614*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&interp_path));
615*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&grad_path));
616*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&weight_path));
617*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
618f6af633fSnbeams 
619e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6207f5b9731SStan Tomov }
6217f5b9731SStan Tomov 
6227f5b9731SStan Tomov #ifdef __cplusplus
6237f5b9731SStan Tomov CEED_INTERN "C"
6247f5b9731SStan Tomov #endif
625*2b730f8bSJeremy L Thompson     int
626*2b730f8bSJeremy L Thompson     CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad,
627*2b730f8bSJeremy L Thompson                             const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) {
628868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
6297f5b9731SStan Tomov   Ceed                      ceed;
630*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
6317f5b9731SStan Tomov 
632e0582403Sabdelfattah83   Ceed_Magma *data;
633*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
634e0582403Sabdelfattah83 
63580a9ef05SNatalie Beams   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
636*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_f64_Magma));
63780a9ef05SNatalie Beams   } else {
638*2b730f8bSJeremy L Thompson     CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_f32_Magma));
63980a9ef05SNatalie Beams   }
640*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
641868539c2SNatalie Beams 
642*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
643*2b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
644868539c2SNatalie Beams 
645868539c2SNatalie Beams   // Copy qref to the GPU
646*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0])));
647e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
648868539c2SNatalie Beams 
649868539c2SNatalie Beams   // Copy interp to the GPU
650*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0])));
651*2b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue);
652868539c2SNatalie Beams 
653868539c2SNatalie Beams   // Copy grad to the GPU
654*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0])));
655*2b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue);
656868539c2SNatalie Beams 
657868539c2SNatalie Beams   // Copy qweight to the GPU
658*2b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0])));
659*2b730f8bSJeremy L Thompson   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue);
660868539c2SNatalie Beams 
661e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6627f5b9731SStan Tomov }
663