xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision e5f091eb2082fd2e5a436aed5b3c40dee25ac3c3)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
8ec3da8bcSJed Brown #include <ceed/ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
127f5b9731SStan Tomov #include "ceed-magma.h"
13*e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
14f6af633fSnbeams #include "../hip/ceed-hip-common.h"
15f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
16f6af633fSnbeams #else
17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
19f6af633fSnbeams #endif
207f5b9731SStan Tomov 
217f5b9731SStan Tomov #ifdef __cplusplus
227f5b9731SStan Tomov CEED_INTERN "C"
237f5b9731SStan Tomov #endif
247f5b9731SStan Tomov int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
257f5b9731SStan Tomov                          CeedTransposeMode tmode, CeedEvalMode emode,
263513a710Sjeremylt                          CeedVector U, CeedVector V) {
277f5b9731SStan Tomov   int ierr;
287f5b9731SStan Tomov   Ceed ceed;
29e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
30e0582403Sabdelfattah83   CeedInt dim, ncomp, ndof;
31e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
32e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
33e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
34e0582403Sabdelfattah83 
35e0582403Sabdelfattah83   Ceed_Magma *data;
36e15f9bd0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
37e0582403Sabdelfattah83 
387f5b9731SStan Tomov   const CeedScalar *u;
397f5b9731SStan Tomov   CeedScalar *v;
40868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
41e15f9bd0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr);
427f5b9731SStan Tomov   } else if (emode != CEED_EVAL_WEIGHT) {
437f5b9731SStan Tomov     // LCOV_EXCL_START
44e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
457f5b9731SStan Tomov                      "An input vector is required for this CeedEvalMode");
467f5b9731SStan Tomov     // LCOV_EXCL_STOP
477f5b9731SStan Tomov   }
489c774eddSJeremy L Thompson   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr);
497f5b9731SStan Tomov 
507f5b9731SStan Tomov   CeedBasis_Magma *impl;
51e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
527f5b9731SStan Tomov 
537f5b9731SStan Tomov   CeedInt P1d, Q1d;
54e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
55e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
567f5b9731SStan Tomov 
573f21f6b1SJeremy L Thompson   CeedDebug(ceed, "\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d",
587f5b9731SStan Tomov             ncomp*CeedIntPow(P1d, dim), ncomp);
597f5b9731SStan Tomov 
607f5b9731SStan Tomov   if (tmode == CEED_TRANSPOSE) {
611f9221feSJeremy L Thompson     CeedSize length;
62e15f9bd0SJeremy L Thompson     ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr);
6380a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
6480a9ef05SNatalie Beams       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length,
6580a9ef05SNatalie Beams                        data->queue);
6680a9ef05SNatalie Beams     } else {
6780a9ef05SNatalie Beams       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length,
6880a9ef05SNatalie Beams                        data->queue);
6980a9ef05SNatalie Beams     }
70e0582403Sabdelfattah83     ceed_magma_queue_sync( data->queue );
717f5b9731SStan Tomov   }
72f6af633fSnbeams 
733513a710Sjeremylt   switch (emode) {
743513a710Sjeremylt   case CEED_EVAL_INTERP: {
757f5b9731SStan Tomov     CeedInt P = P1d, Q = Q1d;
767f5b9731SStan Tomov     if (tmode == CEED_TRANSPOSE) {
777f5b9731SStan Tomov       P = Q1d; Q = P1d;
787f5b9731SStan Tomov     }
797f5b9731SStan Tomov 
807f5b9731SStan Tomov     // Define element sizes for dofs/quad
817f5b9731SStan Tomov     CeedInt elquadsize = CeedIntPow(Q1d, dim);
827f5b9731SStan Tomov     CeedInt eldofssize = CeedIntPow(P1d, dim);
837f5b9731SStan Tomov 
847f5b9731SStan Tomov     // E-vector ordering -------------- Q-vector ordering
85868539c2SNatalie Beams     //  component                        component
86868539c2SNatalie Beams     //    elem                             elem
877f5b9731SStan Tomov     //       node                            node
887f5b9731SStan Tomov 
897f5b9731SStan Tomov     // ---  Define strides for NOTRANSPOSE mode: ---
907f5b9731SStan Tomov     // Input (u) is E-vector, output (v) is Q-vector
917f5b9731SStan Tomov 
927f5b9731SStan Tomov     // Element strides
93868539c2SNatalie Beams     CeedInt u_elstride = eldofssize;
947f5b9731SStan Tomov     CeedInt v_elstride = elquadsize;
957f5b9731SStan Tomov     // Component strides
96868539c2SNatalie Beams     CeedInt u_compstride = nelem * eldofssize;
977f5b9731SStan Tomov     CeedInt v_compstride = nelem * elquadsize;
987f5b9731SStan Tomov 
997f5b9731SStan Tomov     // ---  Swap strides for TRANSPOSE mode: ---
1007f5b9731SStan Tomov     if (tmode == CEED_TRANSPOSE) {
1017f5b9731SStan Tomov       // Input (u) is Q-vector, output (v) is E-vector
1027f5b9731SStan Tomov       // Element strides
103868539c2SNatalie Beams       v_elstride = eldofssize;
1047f5b9731SStan Tomov       u_elstride = elquadsize;
1057f5b9731SStan Tomov       // Component strides
106868539c2SNatalie Beams       v_compstride = nelem * eldofssize;
1077f5b9731SStan Tomov       u_compstride = nelem * elquadsize;
1087f5b9731SStan Tomov     }
1097f5b9731SStan Tomov 
110f6af633fSnbeams     CeedInt nthreads = 1;
111f6af633fSnbeams     CeedInt ntcol = 1;
112f6af633fSnbeams     CeedInt shmem = 0;
113f6af633fSnbeams     CeedInt maxPQ = CeedIntMax(P, Q);
114f6af633fSnbeams 
115f6af633fSnbeams     switch (dim) {
116f6af633fSnbeams     case 1:
117f6af633fSnbeams       nthreads = maxPQ;
118f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
119f6af633fSnbeams       shmem += sizeof(CeedScalar) * ntcol * ( ncomp * (1*P + 1*Q) );
120f6af633fSnbeams       shmem += sizeof(CeedScalar) * (P*Q);
121f6af633fSnbeams       break;
122f6af633fSnbeams     case 2:
123f6af633fSnbeams       nthreads = maxPQ;
124f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
125f6af633fSnbeams       shmem += P*Q    *sizeof(CeedScalar);  // for sT
126f6af633fSnbeams       shmem += ntcol * ( P*maxPQ*sizeof(
127f6af633fSnbeams                            CeedScalar) );  // for reforming rU we need PxP, and for the intermediate output we need PxQ
128f6af633fSnbeams       break;
129f6af633fSnbeams     case 3:
130f6af633fSnbeams       nthreads = maxPQ*maxPQ;
131f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
132f6af633fSnbeams       shmem += sizeof(CeedScalar)* (P*Q);  // for sT
133f6af633fSnbeams       shmem += sizeof(CeedScalar)* ntcol * (CeedIntMax(P*P*maxPQ,
134f6af633fSnbeams                                             P*Q*Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
135f6af633fSnbeams     }
136f6af633fSnbeams     CeedInt grid = (nelem + ntcol-1) / ntcol;
137f6af633fSnbeams     void *args[] = {&impl->dinterp1d,
138f6af633fSnbeams                     &u, &u_elstride, &u_compstride,
139f6af633fSnbeams                     &v, &v_elstride, &v_compstride,
140f6af633fSnbeams                     &nelem
141f6af633fSnbeams                    };
142f6af633fSnbeams 
143f6af633fSnbeams     if (tmode == CEED_TRANSPOSE) {
144f6af633fSnbeams       ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_interp_tr, grid,
145f6af633fSnbeams                                          nthreads, ntcol, 1, shmem,
146f6af633fSnbeams                                          args); CeedChkBackend(ierr);
147f6af633fSnbeams     } else {
148f6af633fSnbeams       ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_interp, grid,
149f6af633fSnbeams                                          nthreads, ntcol, 1, shmem,
150f6af633fSnbeams                                          args); CeedChkBackend(ierr);
151f6af633fSnbeams     }
1527f5b9731SStan Tomov   }
1533513a710Sjeremylt   break;
1543513a710Sjeremylt   case CEED_EVAL_GRAD: {
1557f5b9731SStan Tomov     CeedInt P = P1d, Q = Q1d;
1567f5b9731SStan Tomov     // In CEED_NOTRANSPOSE mode:
1577f5b9731SStan Tomov     // u is (P^dim x nc), column-major layout (nc = ncomp)
1587f5b9731SStan Tomov     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
1597f5b9731SStan Tomov     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
1607f5b9731SStan Tomov     if (tmode == CEED_TRANSPOSE) {
1617f5b9731SStan Tomov       P = Q1d, Q = P1d;
1627f5b9731SStan Tomov     }
1637f5b9731SStan Tomov 
1647f5b9731SStan Tomov     // Define element sizes for dofs/quad
1657f5b9731SStan Tomov     CeedInt elquadsize = CeedIntPow(Q1d, dim);
1667f5b9731SStan Tomov     CeedInt eldofssize = CeedIntPow(P1d, dim);
1677f5b9731SStan Tomov 
1687f5b9731SStan Tomov     // E-vector ordering -------------- Q-vector ordering
1697f5b9731SStan Tomov     //                                  dim
170868539c2SNatalie Beams     //  component                        component
171868539c2SNatalie Beams     //    elem                              elem
1727f5b9731SStan Tomov     //       node                            node
1737f5b9731SStan Tomov 
1747f5b9731SStan Tomov     // ---  Define strides for NOTRANSPOSE mode: ---
1757f5b9731SStan Tomov     // Input (u) is E-vector, output (v) is Q-vector
1767f5b9731SStan Tomov 
1777f5b9731SStan Tomov     // Element strides
178868539c2SNatalie Beams     CeedInt u_elstride = eldofssize;
1797f5b9731SStan Tomov     CeedInt v_elstride = elquadsize;
1807f5b9731SStan Tomov     // Component strides
181868539c2SNatalie Beams     CeedInt u_compstride = nelem * eldofssize;
1827f5b9731SStan Tomov     CeedInt v_compstride = nelem * elquadsize;
1837f5b9731SStan Tomov     // Dimension strides
1847f5b9731SStan Tomov     CeedInt u_dimstride = 0;
1857f5b9731SStan Tomov     CeedInt v_dimstride = nelem * elquadsize * ncomp;
1867f5b9731SStan Tomov 
1877f5b9731SStan Tomov     // ---  Swap strides for TRANSPOSE mode: ---
1887f5b9731SStan Tomov     if (tmode == CEED_TRANSPOSE) {
1897f5b9731SStan Tomov       // Input (u) is Q-vector, output (v) is E-vector
1907f5b9731SStan Tomov       // Element strides
191868539c2SNatalie Beams       v_elstride = eldofssize;
1927f5b9731SStan Tomov       u_elstride = elquadsize;
1937f5b9731SStan Tomov       // Component strides
194868539c2SNatalie Beams       v_compstride = nelem * eldofssize;
1957f5b9731SStan Tomov       u_compstride = nelem * elquadsize;
1967f5b9731SStan Tomov       // Dimension strides
1977f5b9731SStan Tomov       v_dimstride = 0;
1987f5b9731SStan Tomov       u_dimstride = nelem * elquadsize * ncomp;
1997f5b9731SStan Tomov 
2007f5b9731SStan Tomov     }
2017f5b9731SStan Tomov 
202f6af633fSnbeams     CeedInt nthreads = 1;
203f6af633fSnbeams     CeedInt ntcol = 1;
204f6af633fSnbeams     CeedInt shmem = 0;
205f6af633fSnbeams     CeedInt maxPQ = CeedIntMax(P, Q);
206f6af633fSnbeams 
207f6af633fSnbeams     switch (dim) {
208f6af633fSnbeams     case 1:
209f6af633fSnbeams       nthreads = maxPQ;
210f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
211f6af633fSnbeams       shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1*P + 1*Q));
212f6af633fSnbeams       shmem += sizeof(CeedScalar) * (P*Q);
213f6af633fSnbeams       break;
214f6af633fSnbeams     case 2:
215f6af633fSnbeams       nthreads = maxPQ;
216f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
217f6af633fSnbeams       shmem += sizeof(CeedScalar) * 2*P*Q;  // for sTinterp and sTgrad
218f6af633fSnbeams       shmem += sizeof(CeedScalar) * ntcol *
219f6af633fSnbeams                (P*maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
220f6af633fSnbeams       break;
221f6af633fSnbeams     case 3:
222f6af633fSnbeams       nthreads = maxPQ * maxPQ;
223f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
224f6af633fSnbeams       shmem += sizeof(CeedScalar) * 2*P*Q;  // for sTinterp and sTgrad
225f6af633fSnbeams       shmem += sizeof(CeedScalar) * ntcol * CeedIntMax(P*P*P,
226f6af633fSnbeams                (P*P*Q) +
227f6af633fSnbeams                (P*Q*Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
228f6af633fSnbeams     }
229f6af633fSnbeams     CeedInt grid = (nelem + ntcol-1) / ntcol;
230f6af633fSnbeams     void *args[] = {&impl->dinterp1d, &impl->dgrad1d,
231f6af633fSnbeams                     &u, &u_elstride, &u_compstride, &u_dimstride,
232f6af633fSnbeams                     &v, &v_elstride, &v_compstride, &v_dimstride,
233f6af633fSnbeams                     &nelem
234f6af633fSnbeams                    };
235f6af633fSnbeams 
236f6af633fSnbeams     if (tmode == CEED_TRANSPOSE) {
237f6af633fSnbeams       ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_grad_tr, grid,
238f6af633fSnbeams                                          nthreads, ntcol, 1, shmem,
239f6af633fSnbeams                                          args); CeedChkBackend(ierr);
240f6af633fSnbeams     } else {
241f6af633fSnbeams       ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_grad, grid,
242f6af633fSnbeams                                          nthreads, ntcol, 1, shmem,
243f6af633fSnbeams                                          args); CeedChkBackend(ierr);
244f6af633fSnbeams     }
2457f5b9731SStan Tomov   }
2463513a710Sjeremylt   break;
2473513a710Sjeremylt   case CEED_EVAL_WEIGHT: {
2487f5b9731SStan Tomov     if (tmode == CEED_TRANSPOSE)
2497f5b9731SStan Tomov       // LCOV_EXCL_START
250e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
2517f5b9731SStan Tomov                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
2527f5b9731SStan Tomov     // LCOV_EXCL_STOP
2537f5b9731SStan Tomov     CeedInt Q = Q1d;
254f6af633fSnbeams     CeedInt eldofssize = CeedIntPow(Q, dim);
255f6af633fSnbeams     CeedInt nthreads = 1;
256f6af633fSnbeams     CeedInt ntcol = 1;
257f6af633fSnbeams     CeedInt shmem = 0;
258f6af633fSnbeams 
259f6af633fSnbeams     switch (dim) {
260f6af633fSnbeams     case 1:
261f6af633fSnbeams       nthreads = Q;
262f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
263f6af633fSnbeams       shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
264f6af633fSnbeams       shmem += sizeof(CeedScalar) * ntcol * Q; // for output
265f6af633fSnbeams       break;
266f6af633fSnbeams     case 2:
267f6af633fSnbeams       nthreads = Q;
268f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
269f6af633fSnbeams       shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
270f6af633fSnbeams       break;
271f6af633fSnbeams     case 3:
272f6af633fSnbeams       nthreads = Q * Q;
273f6af633fSnbeams       ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
274f6af633fSnbeams       shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
275f6af633fSnbeams     }
276f6af633fSnbeams     CeedInt grid = (nelem + ntcol-1) / ntcol;
277f6af633fSnbeams     void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem};
278f6af633fSnbeams 
279f6af633fSnbeams     ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_weight, grid,
280f6af633fSnbeams                                        nthreads, ntcol, 1, shmem,
281f6af633fSnbeams                                        args); CeedChkBackend(ierr);
2827f5b9731SStan Tomov   }
2833513a710Sjeremylt   break;
2843513a710Sjeremylt   // LCOV_EXCL_START
2853513a710Sjeremylt   case CEED_EVAL_DIV:
286e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2873513a710Sjeremylt   case CEED_EVAL_CURL:
288e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2893513a710Sjeremylt   case CEED_EVAL_NONE:
290e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
2913513a710Sjeremylt                      "CEED_EVAL_NONE does not make sense in this context");
2923513a710Sjeremylt     // LCOV_EXCL_STOP
2933513a710Sjeremylt   }
2947f5b9731SStan Tomov 
295e0582403Sabdelfattah83   // must sync to ensure completeness
296e0582403Sabdelfattah83   ceed_magma_queue_sync( data->queue );
297e0582403Sabdelfattah83 
2987f5b9731SStan Tomov   if (emode!=CEED_EVAL_WEIGHT) {
299e15f9bd0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr);
3007f5b9731SStan Tomov   }
301e15f9bd0SJeremy L Thompson   ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr);
302e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3037f5b9731SStan Tomov }
3047f5b9731SStan Tomov 
3057f5b9731SStan Tomov #ifdef __cplusplus
3067f5b9731SStan Tomov CEED_INTERN "C"
3077f5b9731SStan Tomov #endif
30880a9ef05SNatalie Beams int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem,
309868539c2SNatalie Beams                                       CeedTransposeMode tmode, CeedEvalMode emode,
310868539c2SNatalie Beams                                       CeedVector U, CeedVector V) {
311868539c2SNatalie Beams   int ierr;
312868539c2SNatalie Beams   Ceed ceed;
313e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
314e0582403Sabdelfattah83 
315e0582403Sabdelfattah83   Ceed_Magma *data;
316e15f9bd0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
317e0582403Sabdelfattah83 
318868539c2SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
319e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
320e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
321e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
322e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
323868539c2SNatalie Beams   const CeedScalar *du;
324868539c2SNatalie Beams   CeedScalar *dv;
325868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
326e15f9bd0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
327868539c2SNatalie Beams   } else if (emode != CEED_EVAL_WEIGHT) {
328868539c2SNatalie Beams     // LCOV_EXCL_START
329e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
330868539c2SNatalie Beams                      "An input vector is required for this CeedEvalMode");
331868539c2SNatalie Beams     // LCOV_EXCL_STOP
332868539c2SNatalie Beams   }
3339c774eddSJeremy L Thompson   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
334868539c2SNatalie Beams 
335868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
336e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
337868539c2SNatalie Beams 
3383f21f6b1SJeremy L Thompson   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
339868539c2SNatalie Beams             ncomp*ndof, ncomp);
340868539c2SNatalie Beams 
341868539c2SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
3421f9221feSJeremy L Thompson     CeedSize length;
343868539c2SNatalie Beams     ierr = CeedVectorGetLength(V, &length);
34480a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
34580a9ef05SNatalie Beams       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
34680a9ef05SNatalie Beams                        data->queue);
34780a9ef05SNatalie Beams     } else {
34880a9ef05SNatalie Beams       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
34980a9ef05SNatalie Beams                        data->queue);
35080a9ef05SNatalie Beams     }
351e0582403Sabdelfattah83     ceed_magma_queue_sync( data->queue );
352868539c2SNatalie Beams   }
35380a9ef05SNatalie Beams 
354868539c2SNatalie Beams   switch (emode) {
355868539c2SNatalie Beams   case CEED_EVAL_INTERP: {
356868539c2SNatalie Beams     CeedInt P = ndof, Q = nqpt;
357868539c2SNatalie Beams     if (tmode == CEED_TRANSPOSE)
358e0582403Sabdelfattah83       magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
359868539c2SNatalie Beams                             P, nelem*ncomp, Q,
36080a9ef05SNatalie Beams                             1.0, (double *)impl->dinterp, P,
36180a9ef05SNatalie Beams                             (double *)du, Q,
36280a9ef05SNatalie Beams                             0.0, (double *)dv, P, data->queue);
363868539c2SNatalie Beams     else
364e0582403Sabdelfattah83       magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
365868539c2SNatalie Beams                             Q, nelem*ncomp, P,
36680a9ef05SNatalie Beams                             1.0, (double *)impl->dinterp, P,
36780a9ef05SNatalie Beams                             (double *)du, P,
36880a9ef05SNatalie Beams                             0.0, (double *)dv, Q, data->queue);
369868539c2SNatalie Beams   }
370868539c2SNatalie Beams   break;
371868539c2SNatalie Beams 
372868539c2SNatalie Beams   case CEED_EVAL_GRAD: {
373868539c2SNatalie Beams     CeedInt P = ndof, Q = nqpt;
374868539c2SNatalie Beams     if (tmode == CEED_TRANSPOSE) {
37580a9ef05SNatalie Beams       CeedScalar beta = 0.0;
376868539c2SNatalie Beams       for(int d=0; d<dim; d++) {
377868539c2SNatalie Beams         if (d>0)
378868539c2SNatalie Beams           beta = 1.0;
379e0582403Sabdelfattah83         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
380868539c2SNatalie Beams                               P, nelem*ncomp, Q,
38180a9ef05SNatalie Beams                               1.0, (double *)(impl->dgrad + d*P*Q), P,
38280a9ef05SNatalie Beams                               (double *)(du + d*nelem*ncomp*Q), Q,
38380a9ef05SNatalie Beams                               beta, (double *)dv, P, data->queue);
384868539c2SNatalie Beams       }
385868539c2SNatalie Beams     } else {
386868539c2SNatalie Beams       for(int d=0; d< dim; d++)
387e0582403Sabdelfattah83         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
388868539c2SNatalie Beams                               Q, nelem*ncomp, P,
38980a9ef05SNatalie Beams                               1.0, (double *)(impl->dgrad + d*P*Q), P,
39080a9ef05SNatalie Beams                               (double *)du, P,
39180a9ef05SNatalie Beams                               0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue);
39280a9ef05SNatalie Beams     }
39380a9ef05SNatalie Beams   }
39480a9ef05SNatalie Beams   break;
39580a9ef05SNatalie Beams 
39680a9ef05SNatalie Beams   case CEED_EVAL_WEIGHT: {
39780a9ef05SNatalie Beams     if (tmode == CEED_TRANSPOSE)
39880a9ef05SNatalie Beams       // LCOV_EXCL_START
39980a9ef05SNatalie Beams       return CeedError(ceed, CEED_ERROR_BACKEND,
40080a9ef05SNatalie Beams                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
40180a9ef05SNatalie Beams     // LCOV_EXCL_STOP
40280a9ef05SNatalie Beams 
40380a9ef05SNatalie Beams     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
40480a9ef05SNatalie Beams     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
40580a9ef05SNatalie Beams                                        1 : 0 );
40680a9ef05SNatalie Beams     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
40780a9ef05SNatalie Beams                            data->queue);
40880a9ef05SNatalie Beams     CeedChkBackend(ierr);
40980a9ef05SNatalie Beams   }
41080a9ef05SNatalie Beams   break;
41180a9ef05SNatalie Beams 
41280a9ef05SNatalie Beams   // LCOV_EXCL_START
41380a9ef05SNatalie Beams   case CEED_EVAL_DIV:
41480a9ef05SNatalie Beams     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
41580a9ef05SNatalie Beams   case CEED_EVAL_CURL:
41680a9ef05SNatalie Beams     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
41780a9ef05SNatalie Beams   case CEED_EVAL_NONE:
41880a9ef05SNatalie Beams     return CeedError(ceed, CEED_ERROR_BACKEND,
41980a9ef05SNatalie Beams                      "CEED_EVAL_NONE does not make sense in this context");
42080a9ef05SNatalie Beams     // LCOV_EXCL_STOP
42180a9ef05SNatalie Beams   }
42280a9ef05SNatalie Beams 
42380a9ef05SNatalie Beams   // must sync to ensure completeness
42480a9ef05SNatalie Beams   ceed_magma_queue_sync( data->queue );
42580a9ef05SNatalie Beams 
42680a9ef05SNatalie Beams   if (emode!=CEED_EVAL_WEIGHT) {
42780a9ef05SNatalie Beams     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
42880a9ef05SNatalie Beams   }
42980a9ef05SNatalie Beams   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
43080a9ef05SNatalie Beams   return CEED_ERROR_SUCCESS;
43180a9ef05SNatalie Beams }
43280a9ef05SNatalie Beams 
43380a9ef05SNatalie Beams int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem,
43480a9ef05SNatalie Beams                                       CeedTransposeMode tmode, CeedEvalMode emode,
43580a9ef05SNatalie Beams                                       CeedVector U, CeedVector V) {
43680a9ef05SNatalie Beams   int ierr;
43780a9ef05SNatalie Beams   Ceed ceed;
43880a9ef05SNatalie Beams   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
43980a9ef05SNatalie Beams 
44080a9ef05SNatalie Beams   Ceed_Magma *data;
44180a9ef05SNatalie Beams   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
44280a9ef05SNatalie Beams 
44380a9ef05SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
44480a9ef05SNatalie Beams   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
44580a9ef05SNatalie Beams   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
44680a9ef05SNatalie Beams   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
44780a9ef05SNatalie Beams   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
44880a9ef05SNatalie Beams   const CeedScalar *du;
44980a9ef05SNatalie Beams   CeedScalar *dv;
45080a9ef05SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
45180a9ef05SNatalie Beams     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
45280a9ef05SNatalie Beams   } else if (emode != CEED_EVAL_WEIGHT) {
45380a9ef05SNatalie Beams     // LCOV_EXCL_START
45480a9ef05SNatalie Beams     return CeedError(ceed, CEED_ERROR_BACKEND,
45580a9ef05SNatalie Beams                      "An input vector is required for this CeedEvalMode");
45680a9ef05SNatalie Beams     // LCOV_EXCL_STOP
45780a9ef05SNatalie Beams   }
4589c774eddSJeremy L Thompson   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
45980a9ef05SNatalie Beams 
46080a9ef05SNatalie Beams   CeedBasisNonTensor_Magma *impl;
46180a9ef05SNatalie Beams   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
46280a9ef05SNatalie Beams 
4633f21f6b1SJeremy L Thompson   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
46480a9ef05SNatalie Beams             ncomp*ndof, ncomp);
46580a9ef05SNatalie Beams 
46680a9ef05SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
4671f9221feSJeremy L Thompson     CeedSize length;
46880a9ef05SNatalie Beams     ierr = CeedVectorGetLength(V, &length);
46980a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
47080a9ef05SNatalie Beams       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
47180a9ef05SNatalie Beams                        data->queue);
47280a9ef05SNatalie Beams     } else {
47380a9ef05SNatalie Beams       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
47480a9ef05SNatalie Beams                        data->queue);
47580a9ef05SNatalie Beams     }
47680a9ef05SNatalie Beams     ceed_magma_queue_sync( data->queue );
47780a9ef05SNatalie Beams   }
47880a9ef05SNatalie Beams 
47980a9ef05SNatalie Beams   switch (emode) {
48080a9ef05SNatalie Beams   case CEED_EVAL_INTERP: {
48180a9ef05SNatalie Beams     CeedInt P = ndof, Q = nqpt;
48280a9ef05SNatalie Beams     if (tmode == CEED_TRANSPOSE)
48380a9ef05SNatalie Beams       magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
48480a9ef05SNatalie Beams                             P, nelem*ncomp, Q,
48580a9ef05SNatalie Beams                             1.0, (float *)impl->dinterp, P,
48680a9ef05SNatalie Beams                             (float *)du, Q,
48780a9ef05SNatalie Beams                             0.0, (float *)dv, P, data->queue);
48880a9ef05SNatalie Beams     else
48980a9ef05SNatalie Beams       magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
49080a9ef05SNatalie Beams                             Q, nelem*ncomp, P,
49180a9ef05SNatalie Beams                             1.0, (float *)impl->dinterp, P,
49280a9ef05SNatalie Beams                             (float *)du, P,
49380a9ef05SNatalie Beams                             0.0, (float *)dv, Q, data->queue);
49480a9ef05SNatalie Beams   }
49580a9ef05SNatalie Beams   break;
49680a9ef05SNatalie Beams 
49780a9ef05SNatalie Beams   case CEED_EVAL_GRAD: {
49880a9ef05SNatalie Beams     CeedInt P = ndof, Q = nqpt;
49980a9ef05SNatalie Beams     if (tmode == CEED_TRANSPOSE) {
50080a9ef05SNatalie Beams       CeedScalar beta = 0.0;
50180a9ef05SNatalie Beams       for(int d=0; d<dim; d++) {
50280a9ef05SNatalie Beams         if (d>0)
50380a9ef05SNatalie Beams           beta = 1.0;
50480a9ef05SNatalie Beams         magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
50580a9ef05SNatalie Beams                               P, nelem*ncomp, Q,
50680a9ef05SNatalie Beams                               1.0, (float *)(impl->dgrad + d*P*Q), P,
50780a9ef05SNatalie Beams                               (float *)(du + d*nelem*ncomp*Q), Q,
50880a9ef05SNatalie Beams                               beta, (float *)dv, P, data->queue);
50980a9ef05SNatalie Beams       }
51080a9ef05SNatalie Beams     } else {
51180a9ef05SNatalie Beams       for(int d=0; d< dim; d++)
51280a9ef05SNatalie Beams         magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
51380a9ef05SNatalie Beams                               Q, nelem*ncomp, P,
51480a9ef05SNatalie Beams                               1.0, (float *)(impl->dgrad + d*P*Q), P,
51580a9ef05SNatalie Beams                               (float *)du, P,
51680a9ef05SNatalie Beams                               0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue);
517868539c2SNatalie Beams     }
518868539c2SNatalie Beams   }
519868539c2SNatalie Beams   break;
520868539c2SNatalie Beams 
521868539c2SNatalie Beams   case CEED_EVAL_WEIGHT: {
522868539c2SNatalie Beams     if (tmode == CEED_TRANSPOSE)
523868539c2SNatalie Beams       // LCOV_EXCL_START
524e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
525868539c2SNatalie Beams                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
526868539c2SNatalie Beams     // LCOV_EXCL_STOP
527868539c2SNatalie Beams 
528868539c2SNatalie Beams     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
529868539c2SNatalie Beams     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
530868539c2SNatalie Beams                                        1 : 0 );
531e0582403Sabdelfattah83     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
532e0582403Sabdelfattah83                            data->queue);
533e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
534868539c2SNatalie Beams   }
535868539c2SNatalie Beams   break;
536868539c2SNatalie Beams 
537868539c2SNatalie Beams   // LCOV_EXCL_START
538868539c2SNatalie Beams   case CEED_EVAL_DIV:
539e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
540868539c2SNatalie Beams   case CEED_EVAL_CURL:
541e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
542868539c2SNatalie Beams   case CEED_EVAL_NONE:
543e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
544868539c2SNatalie Beams                      "CEED_EVAL_NONE does not make sense in this context");
545868539c2SNatalie Beams     // LCOV_EXCL_STOP
546868539c2SNatalie Beams   }
547868539c2SNatalie Beams 
548e0582403Sabdelfattah83   // must sync to ensure completeness
549e0582403Sabdelfattah83   ceed_magma_queue_sync( data->queue );
550e0582403Sabdelfattah83 
551868539c2SNatalie Beams   if (emode!=CEED_EVAL_WEIGHT) {
552e15f9bd0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
553868539c2SNatalie Beams   }
554e15f9bd0SJeremy L Thompson   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
555e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
556868539c2SNatalie Beams }
557868539c2SNatalie Beams 
558868539c2SNatalie Beams #ifdef __cplusplus
559868539c2SNatalie Beams CEED_INTERN "C"
560868539c2SNatalie Beams #endif
5613513a710Sjeremylt int CeedBasisDestroy_Magma(CeedBasis basis) {
5627f5b9731SStan Tomov   int ierr;
5637f5b9731SStan Tomov   CeedBasis_Magma *impl;
564e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
5657f5b9731SStan Tomov 
566e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr);
567e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr);
568e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr);
569e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr);
570f6af633fSnbeams   Ceed ceed;
571f6af633fSnbeams   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
572*e5f091ebSnbeams   #ifdef CEED_MAGMA_USE_HIP
573f6af633fSnbeams   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
574f6af633fSnbeams   #else
575f6af633fSnbeams   ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr);
576f6af633fSnbeams   #endif
5777f5b9731SStan Tomov 
578e15f9bd0SJeremy L Thompson   ierr = CeedFree(&impl); CeedChkBackend(ierr);
5797f5b9731SStan Tomov 
580e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5817f5b9731SStan Tomov }
5827f5b9731SStan Tomov 
5837f5b9731SStan Tomov #ifdef __cplusplus
5847f5b9731SStan Tomov CEED_INTERN "C"
5857f5b9731SStan Tomov #endif
586868539c2SNatalie Beams int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
587868539c2SNatalie Beams   int ierr;
588868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
589e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
590868539c2SNatalie Beams 
591e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dqref); CeedChkBackend(ierr);
592e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dinterp); CeedChkBackend(ierr);
593e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dgrad); CeedChkBackend(ierr);
594e15f9bd0SJeremy L Thompson   ierr = magma_free(impl->dqweight); CeedChkBackend(ierr);
595868539c2SNatalie Beams 
596e15f9bd0SJeremy L Thompson   ierr = CeedFree(&impl); CeedChkBackend(ierr);
597868539c2SNatalie Beams 
598e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
599868539c2SNatalie Beams }
600868539c2SNatalie Beams 
601868539c2SNatalie Beams #ifdef __cplusplus
602868539c2SNatalie Beams CEED_INTERN "C"
603868539c2SNatalie Beams #endif
6043513a710Sjeremylt int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
6053513a710Sjeremylt                                   const CeedScalar *interp1d,
6067f5b9731SStan Tomov                                   const CeedScalar *grad1d,
6077f5b9731SStan Tomov                                   const CeedScalar *qref1d,
6083513a710Sjeremylt                                   const CeedScalar *qweight1d, CeedBasis basis) {
6097f5b9731SStan Tomov   int ierr;
6107f5b9731SStan Tomov   CeedBasis_Magma *impl;
611f6af633fSnbeams   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
6127f5b9731SStan Tomov   Ceed ceed;
613e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
6147f5b9731SStan Tomov 
615c9f8acf2SJeremy L Thompson   // Check for supported parameters
616c9f8acf2SJeremy L Thompson   CeedInt ncomp = 0;
617e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
618e0582403Sabdelfattah83   Ceed_Magma *data;
619e15f9bd0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
620e0582403Sabdelfattah83 
621f6af633fSnbeams   // Compile kernels
622f6af633fSnbeams   char *magma_common_path;
623f6af633fSnbeams   char *interp_path, *grad_path, *weight_path;
624f6af633fSnbeams   char *basis_kernel_source;
625f6af633fSnbeams   ierr = CeedGetJitAbsolutePath(ceed,
626f6af633fSnbeams                                 "ceed/jit-source/magma/magma_common_device.h",
627f6af633fSnbeams                                 &magma_common_path); CeedChkBackend(ierr);
628f6af633fSnbeams   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
629f6af633fSnbeams   ierr = CeedLoadSourceToBuffer(ceed, magma_common_path,
630f6af633fSnbeams                                 &basis_kernel_source);
631f6af633fSnbeams   CeedChkBackend(ierr);
632f6af633fSnbeams   char *interp_name_base = "ceed/jit-source/magma/interp";
633f6af633fSnbeams   CeedInt interp_name_len = strlen(interp_name_base) + 6;
634f6af633fSnbeams   char interp_name[interp_name_len];
635f6af633fSnbeams   snprintf(interp_name, interp_name_len, "%s-%dd.h", interp_name_base, dim);
636f6af633fSnbeams   ierr = CeedGetJitAbsolutePath(ceed, interp_name, &interp_path);
637f6af633fSnbeams   CeedChkBackend(ierr);
638f6af633fSnbeams   ierr = CeedLoadSourceToInitializedBuffer(ceed, interp_path,
639f6af633fSnbeams          &basis_kernel_source);
640f6af633fSnbeams   CeedChkBackend(ierr);
641f6af633fSnbeams   char *grad_name_base = "ceed/jit-source/magma/grad";
642f6af633fSnbeams   CeedInt grad_name_len = strlen(grad_name_base) + 6;
643f6af633fSnbeams   char grad_name[grad_name_len];
644f6af633fSnbeams   snprintf(grad_name, grad_name_len, "%s-%dd.h", grad_name_base, dim);
645f6af633fSnbeams   ierr = CeedGetJitAbsolutePath(ceed, grad_name, &grad_path);
646f6af633fSnbeams   CeedChkBackend(ierr);
647f6af633fSnbeams   ierr = CeedLoadSourceToInitializedBuffer(ceed, grad_path,
648f6af633fSnbeams          &basis_kernel_source);
649f6af633fSnbeams   CeedChkBackend(ierr);
650f6af633fSnbeams   char *weight_name_base = "ceed/jit-source/magma/weight";
651f6af633fSnbeams   CeedInt weight_name_len = strlen(weight_name_base) + 6;
652f6af633fSnbeams   char weight_name[weight_name_len];
653f6af633fSnbeams   snprintf(weight_name, weight_name_len, "%s-%dd.h", weight_name_base, dim);
654f6af633fSnbeams   ierr = CeedGetJitAbsolutePath(ceed, weight_name, &weight_path);
655f6af633fSnbeams   CeedChkBackend(ierr);
656f6af633fSnbeams   ierr = CeedLoadSourceToInitializedBuffer(ceed, weight_path,
657f6af633fSnbeams          &basis_kernel_source);
658f6af633fSnbeams   CeedChkBackend(ierr);
659f6af633fSnbeams   CeedDebug256(ceed, 2,
660f6af633fSnbeams                "----- Loading Basis Kernel Source Complete! -----\n");
661f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
662f6af633fSnbeams   // data
663f6af633fSnbeams   Ceed delegate;
664f6af633fSnbeams   ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr);
665f6af633fSnbeams   ierr = MAGMA_RTC_COMPILE(delegate, basis_kernel_source, &impl->module, 5,
666f6af633fSnbeams                            "DIM", dim,
667f6af633fSnbeams                            "NCOMP", ncomp,
668f6af633fSnbeams                            "P", P1d,
669f6af633fSnbeams                            "Q", Q1d,
670f6af633fSnbeams                            "MAXPQ", CeedIntMax(P1d, Q1d));
671f6af633fSnbeams   CeedChkBackend(ierr);
672f6af633fSnbeams 
673f6af633fSnbeams   // Kernel setup
674f6af633fSnbeams   switch (dim) {
675f6af633fSnbeams   case 1:
676f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_1d_kernel",
677f6af633fSnbeams                                 &impl->magma_interp);
678f6af633fSnbeams     CeedChkBackend(ierr);
679f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_1d_kernel",
680f6af633fSnbeams                                 &impl->magma_interp_tr);
681f6af633fSnbeams     CeedChkBackend(ierr);
682f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_1d_kernel",
683f6af633fSnbeams                                 &impl->magma_grad);
684f6af633fSnbeams     CeedChkBackend(ierr);
685f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_1d_kernel",
686f6af633fSnbeams                                 &impl->magma_grad_tr);
687f6af633fSnbeams     CeedChkBackend(ierr);
688f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_1d_kernel",
689f6af633fSnbeams                                 &impl->magma_weight);
690f6af633fSnbeams     CeedChkBackend(ierr);
691f6af633fSnbeams     break;
692f6af633fSnbeams   case 2:
693f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_2d_kernel",
694f6af633fSnbeams                                 &impl->magma_interp);
695f6af633fSnbeams     CeedChkBackend(ierr);
696f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_2d_kernel",
697f6af633fSnbeams                                 &impl->magma_interp_tr);
698f6af633fSnbeams     CeedChkBackend(ierr);
699f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_2d_kernel",
700f6af633fSnbeams                                 &impl->magma_grad);
701f6af633fSnbeams     CeedChkBackend(ierr);
702f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_2d_kernel",
703f6af633fSnbeams                                 &impl->magma_grad_tr);
704f6af633fSnbeams     CeedChkBackend(ierr);
705f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_2d_kernel",
706f6af633fSnbeams                                 &impl->magma_weight);
707f6af633fSnbeams     CeedChkBackend(ierr);
708f6af633fSnbeams     break;
709f6af633fSnbeams   case 3:
710f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_3d_kernel",
711f6af633fSnbeams                                 &impl->magma_interp);
712f6af633fSnbeams     CeedChkBackend(ierr);
713f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_3d_kernel",
714f6af633fSnbeams                                 &impl->magma_interp_tr);
715f6af633fSnbeams     CeedChkBackend(ierr);
716f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_3d_kernel",
717f6af633fSnbeams                                 &impl->magma_grad);
718f6af633fSnbeams     CeedChkBackend(ierr);
719f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_3d_kernel",
720f6af633fSnbeams                                 &impl->magma_grad_tr);
721f6af633fSnbeams     CeedChkBackend(ierr);
722f6af633fSnbeams     ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_3d_kernel",
723f6af633fSnbeams                                 &impl->magma_weight);
724f6af633fSnbeams     CeedChkBackend(ierr);
725f6af633fSnbeams   }
726f6af633fSnbeams 
7277f5b9731SStan Tomov   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
728e15f9bd0SJeremy L Thompson                                 CeedBasisApply_Magma); CeedChkBackend(ierr);
7297f5b9731SStan Tomov   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
730e15f9bd0SJeremy L Thompson                                 CeedBasisDestroy_Magma); CeedChkBackend(ierr);
7317f5b9731SStan Tomov 
7327f5b9731SStan Tomov   // Copy qref1d to the GPU
7337f5b9731SStan Tomov   ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
734e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
735e0582403Sabdelfattah83   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1,
736e0582403Sabdelfattah83                   data->queue);
7377f5b9731SStan Tomov 
7387f5b9731SStan Tomov   // Copy interp1d to the GPU
7397f5b9731SStan Tomov   ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
740e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
741e0582403Sabdelfattah83   magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1,
742e0582403Sabdelfattah83                   data->queue);
7437f5b9731SStan Tomov 
7447f5b9731SStan Tomov   // Copy grad1d to the GPU
7457f5b9731SStan Tomov   ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
746e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
747e0582403Sabdelfattah83   magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1,
748e0582403Sabdelfattah83                   data->queue);
7497f5b9731SStan Tomov 
7507f5b9731SStan Tomov   // Copy qweight1d to the GPU
7517f5b9731SStan Tomov   ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
752e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
753e0582403Sabdelfattah83   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1,
754e0582403Sabdelfattah83                   data->queue);
7557f5b9731SStan Tomov 
756f6af633fSnbeams   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
757f6af633fSnbeams 
758e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
7597f5b9731SStan Tomov }
7607f5b9731SStan Tomov 
7617f5b9731SStan Tomov #ifdef __cplusplus
7627f5b9731SStan Tomov CEED_INTERN "C"
7637f5b9731SStan Tomov #endif
7643513a710Sjeremylt int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
7653513a710Sjeremylt                             CeedInt nqpts, const CeedScalar *interp,
7663513a710Sjeremylt                             const CeedScalar *grad, const CeedScalar *qref,
7673513a710Sjeremylt                             const CeedScalar *qweight, CeedBasis basis) {
7687f5b9731SStan Tomov   int ierr;
769868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
7707f5b9731SStan Tomov   Ceed ceed;
771e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
7727f5b9731SStan Tomov 
773e0582403Sabdelfattah83   Ceed_Magma *data;
774e15f9bd0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
775e0582403Sabdelfattah83 
77680a9ef05SNatalie Beams   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
777868539c2SNatalie Beams     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
77880a9ef05SNatalie Beams                                   CeedBasisApplyNonTensor_f64_Magma);
77980a9ef05SNatalie Beams     CeedChkBackend(ierr);
78080a9ef05SNatalie Beams   } else {
78180a9ef05SNatalie Beams     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
78280a9ef05SNatalie Beams                                   CeedBasisApplyNonTensor_f32_Magma);
78380a9ef05SNatalie Beams     CeedChkBackend(ierr);
78480a9ef05SNatalie Beams   }
785868539c2SNatalie Beams   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
786e15f9bd0SJeremy L Thompson                                 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr);
787868539c2SNatalie Beams 
788e15f9bd0SJeremy L Thompson   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
789e15f9bd0SJeremy L Thompson   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
790868539c2SNatalie Beams 
791868539c2SNatalie Beams   // Copy qref to the GPU
792868539c2SNatalie Beams   ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
793e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
794e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
795868539c2SNatalie Beams 
796868539c2SNatalie Beams   // Copy interp to the GPU
797868539c2SNatalie Beams   ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
798e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
799e0582403Sabdelfattah83   magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1,
800e0582403Sabdelfattah83                   data->queue);
801868539c2SNatalie Beams 
802868539c2SNatalie Beams   // Copy grad to the GPU
803868539c2SNatalie Beams   ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
804e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
805e0582403Sabdelfattah83   magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1,
806e0582403Sabdelfattah83                   data->queue);
807868539c2SNatalie Beams 
808868539c2SNatalie Beams   // Copy qweight to the GPU
809868539c2SNatalie Beams   ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
810e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
811e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1,
812e0582403Sabdelfattah83                   data->queue);
813868539c2SNatalie Beams 
814e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
8157f5b9731SStan Tomov }
816