13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 8ec3da8bcSJed Brown #include <ceed/backend.h> 9*2b730f8bSJeremy L Thompson #include <ceed/ceed.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 12*2b730f8bSJeremy L Thompson 137f5b9731SStan Tomov #include "ceed-magma.h" 14e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 15f6af633fSnbeams #include "../hip/ceed-hip-common.h" 16f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 17f6af633fSnbeams #else 18f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 19f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 20f6af633fSnbeams #endif 217f5b9731SStan Tomov 227f5b9731SStan Tomov #ifdef __cplusplus 237f5b9731SStan Tomov CEED_INTERN "C" 247f5b9731SStan Tomov #endif 25*2b730f8bSJeremy L Thompson int 26*2b730f8bSJeremy L Thompson CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { 277f5b9731SStan Tomov Ceed ceed; 28*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 29e0582403Sabdelfattah83 CeedInt dim, ncomp, ndof; 30*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 31*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 32*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); 33e0582403Sabdelfattah83 34e0582403Sabdelfattah83 Ceed_Magma *data; 35*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 36e0582403Sabdelfattah83 377f5b9731SStan Tomov const CeedScalar *u; 387f5b9731SStan Tomov CeedScalar *v; 39868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 40*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u)); 417f5b9731SStan Tomov } else if (emode != CEED_EVAL_WEIGHT) { 427f5b9731SStan Tomov // LCOV_EXCL_START 43*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 447f5b9731SStan Tomov // LCOV_EXCL_STOP 457f5b9731SStan Tomov } 46*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v)); 477f5b9731SStan Tomov 487f5b9731SStan Tomov CeedBasis_Magma *impl; 49*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 507f5b9731SStan Tomov 517f5b9731SStan Tomov CeedInt P1d, Q1d; 52*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d)); 53*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d)); 547f5b9731SStan Tomov 55*2b730f8bSJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp); 567f5b9731SStan Tomov 577f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 581f9221feSJeremy L Thompson CeedSize length; 59*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 6080a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 61*2b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)v, length, data->queue); 6280a9ef05SNatalie Beams } else { 63*2b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)v, length, data->queue); 6480a9ef05SNatalie Beams } 65e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 667f5b9731SStan Tomov } 67f6af633fSnbeams 683513a710Sjeremylt switch (emode) { 693513a710Sjeremylt case CEED_EVAL_INTERP: { 707f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 717f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 72*2b730f8bSJeremy L Thompson P = Q1d; 73*2b730f8bSJeremy L Thompson Q = P1d; 747f5b9731SStan Tomov } 757f5b9731SStan Tomov 767f5b9731SStan Tomov // Define element sizes for dofs/quad 777f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 787f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 797f5b9731SStan Tomov 807f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 81868539c2SNatalie Beams // component component 82868539c2SNatalie Beams // elem elem 837f5b9731SStan Tomov // node node 847f5b9731SStan Tomov 857f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 867f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 877f5b9731SStan Tomov 887f5b9731SStan Tomov // Element strides 89868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 907f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 917f5b9731SStan Tomov // Component strides 92868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 937f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 947f5b9731SStan Tomov 957f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 967f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 977f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 987f5b9731SStan Tomov // Element strides 99868539c2SNatalie Beams v_elstride = eldofssize; 1007f5b9731SStan Tomov u_elstride = elquadsize; 1017f5b9731SStan Tomov // Component strides 102868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1037f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1047f5b9731SStan Tomov } 1057f5b9731SStan Tomov 106f6af633fSnbeams CeedInt nthreads = 1; 107f6af633fSnbeams CeedInt ntcol = 1; 108f6af633fSnbeams CeedInt shmem = 0; 109f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 110f6af633fSnbeams 111f6af633fSnbeams switch (dim) { 112f6af633fSnbeams case 1: 113f6af633fSnbeams nthreads = maxPQ; 114f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 115f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); 116f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); 117f6af633fSnbeams break; 118f6af633fSnbeams case 2: 119f6af633fSnbeams nthreads = maxPQ; 120f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 121f6af633fSnbeams shmem += P * Q * sizeof(CeedScalar); // for sT 122*2b730f8bSJeremy L Thompson shmem += ntcol * (P * maxPQ * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ 123f6af633fSnbeams break; 124f6af633fSnbeams case 3: 125f6af633fSnbeams nthreads = maxPQ * maxPQ; 126f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 127f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); // for sT 128*2b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * 129*2b730f8bSJeremy L Thompson (CeedIntMax(P * P * maxPQ, 130f6af633fSnbeams P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) 131f6af633fSnbeams } 132f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 133*2b730f8bSJeremy L Thompson void *args[] = {&impl->dinterp1d, &u, &u_elstride, &u_compstride, &v, &v_elstride, &v_compstride, &nelem}; 134f6af633fSnbeams 135f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 136*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args)); 137f6af633fSnbeams } else { 138*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args)); 139f6af633fSnbeams } 140*2b730f8bSJeremy L Thompson } break; 1413513a710Sjeremylt case CEED_EVAL_GRAD: { 1427f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 1437f5b9731SStan Tomov // In CEED_NOTRANSPOSE mode: 1447f5b9731SStan Tomov // u is (P^dim x nc), column-major layout (nc = ncomp) 1457f5b9731SStan Tomov // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 1467f5b9731SStan Tomov // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 1477f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1487f5b9731SStan Tomov P = Q1d, Q = P1d; 1497f5b9731SStan Tomov } 1507f5b9731SStan Tomov 1517f5b9731SStan Tomov // Define element sizes for dofs/quad 1527f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 1537f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 1547f5b9731SStan Tomov 1557f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1567f5b9731SStan Tomov // dim 157868539c2SNatalie Beams // component component 158868539c2SNatalie Beams // elem elem 1597f5b9731SStan Tomov // node node 1607f5b9731SStan Tomov 1617f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 1627f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 1637f5b9731SStan Tomov 1647f5b9731SStan Tomov // Element strides 165868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 1667f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 1677f5b9731SStan Tomov // Component strides 168868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 1697f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 1707f5b9731SStan Tomov // Dimension strides 1717f5b9731SStan Tomov CeedInt u_dimstride = 0; 1727f5b9731SStan Tomov CeedInt v_dimstride = nelem * elquadsize * ncomp; 1737f5b9731SStan Tomov 1747f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 1757f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1767f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 1777f5b9731SStan Tomov // Element strides 178868539c2SNatalie Beams v_elstride = eldofssize; 1797f5b9731SStan Tomov u_elstride = elquadsize; 1807f5b9731SStan Tomov // Component strides 181868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1827f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1837f5b9731SStan Tomov // Dimension strides 1847f5b9731SStan Tomov v_dimstride = 0; 1857f5b9731SStan Tomov u_dimstride = nelem * elquadsize * ncomp; 1867f5b9731SStan Tomov } 1877f5b9731SStan Tomov 188f6af633fSnbeams CeedInt nthreads = 1; 189f6af633fSnbeams CeedInt ntcol = 1; 190f6af633fSnbeams CeedInt shmem = 0; 191f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 192f6af633fSnbeams 193f6af633fSnbeams switch (dim) { 194f6af633fSnbeams case 1: 195f6af633fSnbeams nthreads = maxPQ; 196f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 197f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); 198f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); 199f6af633fSnbeams break; 200f6af633fSnbeams case 2: 201f6af633fSnbeams nthreads = maxPQ; 202f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 203f6af633fSnbeams shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 204*2b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 205f6af633fSnbeams break; 206f6af633fSnbeams case 3: 207f6af633fSnbeams nthreads = maxPQ * maxPQ; 208f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 209f6af633fSnbeams shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 210*2b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * 211*2b730f8bSJeremy L Thompson CeedIntMax(P * P * P, 212*2b730f8bSJeremy L Thompson (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) 213f6af633fSnbeams } 214f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 215*2b730f8bSJeremy L Thompson void *args[] = {&impl->dinterp1d, &impl->dgrad1d, &u, &u_elstride, &u_compstride, &u_dimstride, &v, 216*2b730f8bSJeremy L Thompson &v_elstride, &v_compstride, &v_dimstride, &nelem}; 217f6af633fSnbeams 218f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 219*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args)); 220f6af633fSnbeams } else { 221*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args)); 222f6af633fSnbeams } 223*2b730f8bSJeremy L Thompson } break; 2243513a710Sjeremylt case CEED_EVAL_WEIGHT: { 2257f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) 2267f5b9731SStan Tomov // LCOV_EXCL_START 227*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 2287f5b9731SStan Tomov // LCOV_EXCL_STOP 2297f5b9731SStan Tomov CeedInt Q = Q1d; 230f6af633fSnbeams CeedInt eldofssize = CeedIntPow(Q, dim); 231f6af633fSnbeams CeedInt nthreads = 1; 232f6af633fSnbeams CeedInt ntcol = 1; 233f6af633fSnbeams CeedInt shmem = 0; 234f6af633fSnbeams 235f6af633fSnbeams switch (dim) { 236f6af633fSnbeams case 1: 237f6af633fSnbeams nthreads = Q; 238f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 239f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 240f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * Q; // for output 241f6af633fSnbeams break; 242f6af633fSnbeams case 2: 243f6af633fSnbeams nthreads = Q; 244f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 245f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 246f6af633fSnbeams break; 247f6af633fSnbeams case 3: 248f6af633fSnbeams nthreads = Q * Q; 249f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 250f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 251f6af633fSnbeams } 252f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 253f6af633fSnbeams void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; 254f6af633fSnbeams 255*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args)); 256*2b730f8bSJeremy L Thompson } break; 2573513a710Sjeremylt // LCOV_EXCL_START 2583513a710Sjeremylt case CEED_EVAL_DIV: 259e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2603513a710Sjeremylt case CEED_EVAL_CURL: 261e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2623513a710Sjeremylt case CEED_EVAL_NONE: 263*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 2643513a710Sjeremylt // LCOV_EXCL_STOP 2653513a710Sjeremylt } 2667f5b9731SStan Tomov 267e0582403Sabdelfattah83 // must sync to ensure completeness 268e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 269e0582403Sabdelfattah83 2707f5b9731SStan Tomov if (emode != CEED_EVAL_WEIGHT) { 271*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &u)); 2727f5b9731SStan Tomov } 273*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &v)); 274e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2757f5b9731SStan Tomov } 2767f5b9731SStan Tomov 2777f5b9731SStan Tomov #ifdef __cplusplus 2787f5b9731SStan Tomov CEED_INTERN "C" 2797f5b9731SStan Tomov #endif 280*2b730f8bSJeremy L Thompson int 281*2b730f8bSJeremy L Thompson CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { 282868539c2SNatalie Beams Ceed ceed; 283*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 284e0582403Sabdelfattah83 285e0582403Sabdelfattah83 Ceed_Magma *data; 286*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 287e0582403Sabdelfattah83 288868539c2SNatalie Beams CeedInt dim, ncomp, ndof, nqpt; 289*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 290*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 291*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); 292*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); 293868539c2SNatalie Beams const CeedScalar *du; 294868539c2SNatalie Beams CeedScalar *dv; 295868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 296*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); 297868539c2SNatalie Beams } else if (emode != CEED_EVAL_WEIGHT) { 298868539c2SNatalie Beams // LCOV_EXCL_START 299*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 300868539c2SNatalie Beams // LCOV_EXCL_STOP 301868539c2SNatalie Beams } 302*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); 303868539c2SNatalie Beams 304868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 305*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 306868539c2SNatalie Beams 307*2b730f8bSJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); 308868539c2SNatalie Beams 309868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) { 3101f9221feSJeremy L Thompson CeedSize length; 311*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 31280a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 313*2b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); 31480a9ef05SNatalie Beams } else { 315*2b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); 31680a9ef05SNatalie Beams } 317e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 318868539c2SNatalie Beams } 31980a9ef05SNatalie Beams 320868539c2SNatalie Beams switch (emode) { 321868539c2SNatalie Beams case CEED_EVAL_INTERP: { 322868539c2SNatalie Beams CeedInt P = ndof, Q = nqpt; 323868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) 324*2b730f8bSJeremy L Thompson magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)impl->dinterp, P, (double *)du, Q, 0.0, (double *)dv, P, 325*2b730f8bSJeremy L Thompson data->queue); 326868539c2SNatalie Beams else 327*2b730f8bSJeremy L Thompson magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)impl->dinterp, P, (double *)du, P, 0.0, (double *)dv, Q, 328*2b730f8bSJeremy L Thompson data->queue); 329*2b730f8bSJeremy L Thompson } break; 330868539c2SNatalie Beams 331868539c2SNatalie Beams case CEED_EVAL_GRAD: { 332868539c2SNatalie Beams CeedInt P = ndof, Q = nqpt; 333868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) { 33480a9ef05SNatalie Beams CeedScalar beta = 0.0; 335868539c2SNatalie Beams for (int d = 0; d < dim; d++) { 336*2b730f8bSJeremy L Thompson if (d > 0) beta = 1.0; 337*2b730f8bSJeremy L Thompson magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)(impl->dgrad + d * P * Q), P, 338*2b730f8bSJeremy L Thompson (double *)(du + d * nelem * ncomp * Q), Q, beta, (double *)dv, P, data->queue); 339868539c2SNatalie Beams } 340868539c2SNatalie Beams } else { 341868539c2SNatalie Beams for (int d = 0; d < dim; d++) 342*2b730f8bSJeremy L Thompson magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)(impl->dgrad + d * P * Q), P, (double *)du, P, 0.0, 343*2b730f8bSJeremy L Thompson (double *)(dv + d * nelem * ncomp * Q), Q, data->queue); 34480a9ef05SNatalie Beams } 345*2b730f8bSJeremy L Thompson } break; 34680a9ef05SNatalie Beams 34780a9ef05SNatalie Beams case CEED_EVAL_WEIGHT: { 34880a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) 34980a9ef05SNatalie Beams // LCOV_EXCL_START 350*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 35180a9ef05SNatalie Beams // LCOV_EXCL_STOP 35280a9ef05SNatalie Beams 35380a9ef05SNatalie Beams int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 354*2b730f8bSJeremy L Thompson int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 355*2b730f8bSJeremy L Thompson magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); 356*2b730f8bSJeremy L Thompson } break; 35780a9ef05SNatalie Beams 35880a9ef05SNatalie Beams // LCOV_EXCL_START 35980a9ef05SNatalie Beams case CEED_EVAL_DIV: 36080a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 36180a9ef05SNatalie Beams case CEED_EVAL_CURL: 36280a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 36380a9ef05SNatalie Beams case CEED_EVAL_NONE: 364*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 36580a9ef05SNatalie Beams // LCOV_EXCL_STOP 36680a9ef05SNatalie Beams } 36780a9ef05SNatalie Beams 36880a9ef05SNatalie Beams // must sync to ensure completeness 36980a9ef05SNatalie Beams ceed_magma_queue_sync(data->queue); 37080a9ef05SNatalie Beams 37180a9ef05SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 372*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); 37380a9ef05SNatalie Beams } 374*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &dv)); 37580a9ef05SNatalie Beams return CEED_ERROR_SUCCESS; 37680a9ef05SNatalie Beams } 37780a9ef05SNatalie Beams 378*2b730f8bSJeremy L Thompson int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { 37980a9ef05SNatalie Beams Ceed ceed; 380*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 38180a9ef05SNatalie Beams 38280a9ef05SNatalie Beams Ceed_Magma *data; 383*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 38480a9ef05SNatalie Beams 38580a9ef05SNatalie Beams CeedInt dim, ncomp, ndof, nqpt; 386*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 387*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 388*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); 389*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); 39080a9ef05SNatalie Beams const CeedScalar *du; 39180a9ef05SNatalie Beams CeedScalar *dv; 39280a9ef05SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 393*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); 39480a9ef05SNatalie Beams } else if (emode != CEED_EVAL_WEIGHT) { 39580a9ef05SNatalie Beams // LCOV_EXCL_START 396*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 39780a9ef05SNatalie Beams // LCOV_EXCL_STOP 39880a9ef05SNatalie Beams } 399*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); 40080a9ef05SNatalie Beams 40180a9ef05SNatalie Beams CeedBasisNonTensor_Magma *impl; 402*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 40380a9ef05SNatalie Beams 404*2b730f8bSJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); 40580a9ef05SNatalie Beams 40680a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) { 4071f9221feSJeremy L Thompson CeedSize length; 408*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 40980a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 410*2b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); 41180a9ef05SNatalie Beams } else { 412*2b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); 41380a9ef05SNatalie Beams } 41480a9ef05SNatalie Beams ceed_magma_queue_sync(data->queue); 41580a9ef05SNatalie Beams } 41680a9ef05SNatalie Beams 41780a9ef05SNatalie Beams switch (emode) { 41880a9ef05SNatalie Beams case CEED_EVAL_INTERP: { 41980a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 42080a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) 421*2b730f8bSJeremy L Thompson magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)impl->dinterp, P, (float *)du, Q, 0.0, (float *)dv, P, 422*2b730f8bSJeremy L Thompson data->queue); 42380a9ef05SNatalie Beams else 424*2b730f8bSJeremy L Thompson magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)impl->dinterp, P, (float *)du, P, 0.0, (float *)dv, Q, 425*2b730f8bSJeremy L Thompson data->queue); 426*2b730f8bSJeremy L Thompson } break; 42780a9ef05SNatalie Beams 42880a9ef05SNatalie Beams case CEED_EVAL_GRAD: { 42980a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 43080a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) { 43180a9ef05SNatalie Beams CeedScalar beta = 0.0; 43280a9ef05SNatalie Beams for (int d = 0; d < dim; d++) { 433*2b730f8bSJeremy L Thompson if (d > 0) beta = 1.0; 434*2b730f8bSJeremy L Thompson magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)(impl->dgrad + d * P * Q), P, 435*2b730f8bSJeremy L Thompson (float *)(du + d * nelem * ncomp * Q), Q, beta, (float *)dv, P, data->queue); 43680a9ef05SNatalie Beams } 43780a9ef05SNatalie Beams } else { 43880a9ef05SNatalie Beams for (int d = 0; d < dim; d++) 439*2b730f8bSJeremy L Thompson magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)(impl->dgrad + d * P * Q), P, (float *)du, P, 0.0, 440*2b730f8bSJeremy L Thompson (float *)(dv + d * nelem * ncomp * Q), Q, data->queue); 441868539c2SNatalie Beams } 442*2b730f8bSJeremy L Thompson } break; 443868539c2SNatalie Beams 444868539c2SNatalie Beams case CEED_EVAL_WEIGHT: { 445868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) 446868539c2SNatalie Beams // LCOV_EXCL_START 447*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 448868539c2SNatalie Beams // LCOV_EXCL_STOP 449868539c2SNatalie Beams 450868539c2SNatalie Beams int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 451*2b730f8bSJeremy L Thompson int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 452*2b730f8bSJeremy L Thompson magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); 453*2b730f8bSJeremy L Thompson } break; 454868539c2SNatalie Beams 455868539c2SNatalie Beams // LCOV_EXCL_START 456868539c2SNatalie Beams case CEED_EVAL_DIV: 457e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 458868539c2SNatalie Beams case CEED_EVAL_CURL: 459e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 460868539c2SNatalie Beams case CEED_EVAL_NONE: 461*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 462868539c2SNatalie Beams // LCOV_EXCL_STOP 463868539c2SNatalie Beams } 464868539c2SNatalie Beams 465e0582403Sabdelfattah83 // must sync to ensure completeness 466e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 467e0582403Sabdelfattah83 468868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 469*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); 470868539c2SNatalie Beams } 471*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &dv)); 472e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 473868539c2SNatalie Beams } 474868539c2SNatalie Beams 475868539c2SNatalie Beams #ifdef __cplusplus 476868539c2SNatalie Beams CEED_INTERN "C" 477868539c2SNatalie Beams #endif 478*2b730f8bSJeremy L Thompson int 479*2b730f8bSJeremy L Thompson CeedBasisDestroy_Magma(CeedBasis basis) { 4807f5b9731SStan Tomov CeedBasis_Magma *impl; 481*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 4827f5b9731SStan Tomov 483*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqref1d)); 484*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dinterp1d)); 485*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dgrad1d)); 486*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqweight1d)); 487f6af633fSnbeams Ceed ceed; 488*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 489e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 490*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 491f6af633fSnbeams #else 492*2b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->module)); 493f6af633fSnbeams #endif 4947f5b9731SStan Tomov 495*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 4967f5b9731SStan Tomov 497e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4987f5b9731SStan Tomov } 4997f5b9731SStan Tomov 5007f5b9731SStan Tomov #ifdef __cplusplus 5017f5b9731SStan Tomov CEED_INTERN "C" 5027f5b9731SStan Tomov #endif 503*2b730f8bSJeremy L Thompson int 504*2b730f8bSJeremy L Thompson CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 505868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 506*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 507868539c2SNatalie Beams 508*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqref)); 509*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dinterp)); 510*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dgrad)); 511*2b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqweight)); 512868539c2SNatalie Beams 513*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 514868539c2SNatalie Beams 515e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 516868539c2SNatalie Beams } 517868539c2SNatalie Beams 518868539c2SNatalie Beams #ifdef __cplusplus 519868539c2SNatalie Beams CEED_INTERN "C" 520868539c2SNatalie Beams #endif 521*2b730f8bSJeremy L Thompson int 522*2b730f8bSJeremy L Thompson CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 523*2b730f8bSJeremy L Thompson const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) { 5247f5b9731SStan Tomov CeedBasis_Magma *impl; 525*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 5267f5b9731SStan Tomov Ceed ceed; 527*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5287f5b9731SStan Tomov 529c9f8acf2SJeremy L Thompson // Check for supported parameters 530c9f8acf2SJeremy L Thompson CeedInt ncomp = 0; 531*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 532e0582403Sabdelfattah83 Ceed_Magma *data; 533*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 534e0582403Sabdelfattah83 535f6af633fSnbeams // Compile kernels 536f6af633fSnbeams char *magma_common_path; 537f6af633fSnbeams char *interp_path, *grad_path, *weight_path; 538f6af633fSnbeams char *basis_kernel_source; 539*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_device.h", &magma_common_path)); 540f6af633fSnbeams CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 541*2b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); 542f6af633fSnbeams char *interp_name_base = "ceed/jit-source/magma/interp"; 543f6af633fSnbeams CeedInt interp_name_len = strlen(interp_name_base) + 6; 544f6af633fSnbeams char interp_name[interp_name_len]; 545*2b730f8bSJeremy L Thompson snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim); 546*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path)); 547*2b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); 548f6af633fSnbeams char *grad_name_base = "ceed/jit-source/magma/grad"; 549f6af633fSnbeams CeedInt grad_name_len = strlen(grad_name_base) + 6; 550f6af633fSnbeams char grad_name[grad_name_len]; 551*2b730f8bSJeremy L Thompson snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim); 552*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path)); 553*2b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); 554f6af633fSnbeams char *weight_name_base = "ceed/jit-source/magma/weight"; 555f6af633fSnbeams CeedInt weight_name_len = strlen(weight_name_base) + 6; 556f6af633fSnbeams char weight_name[weight_name_len]; 557*2b730f8bSJeremy L Thompson snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim); 558*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path)); 559*2b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source)); 560*2b730f8bSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 561f6af633fSnbeams // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip 562f6af633fSnbeams // data 563f6af633fSnbeams Ceed delegate; 564*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetDelegate(ceed, &delegate)); 565*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ", 566*2b730f8bSJeremy L Thompson CeedIntMax(P1d, Q1d))); 567f6af633fSnbeams 568f6af633fSnbeams // Kernel setup 569f6af633fSnbeams switch (dim) { 570f6af633fSnbeams case 1: 571*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp)); 572*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr)); 573*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad)); 574*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr)); 575*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight)); 576f6af633fSnbeams break; 577f6af633fSnbeams case 2: 578*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp)); 579*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr)); 580*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad)); 581*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr)); 582*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight)); 583f6af633fSnbeams break; 584f6af633fSnbeams case 3: 585*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp)); 586*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr)); 587*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad)); 588*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr)); 589*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight)); 590f6af633fSnbeams } 591f6af633fSnbeams 592*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); 593*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); 5947f5b9731SStan Tomov 5957f5b9731SStan Tomov // Copy qref1d to the GPU 596*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0]))); 597*2b730f8bSJeremy L Thompson magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue); 5987f5b9731SStan Tomov 5997f5b9731SStan Tomov // Copy interp1d to the GPU 600*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0]))); 601*2b730f8bSJeremy L Thompson magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue); 6027f5b9731SStan Tomov 6037f5b9731SStan Tomov // Copy grad1d to the GPU 604*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0]))); 605*2b730f8bSJeremy L Thompson magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue); 6067f5b9731SStan Tomov 6077f5b9731SStan Tomov // Copy qweight1d to the GPU 608*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0]))); 609*2b730f8bSJeremy L Thompson magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue); 6107f5b9731SStan Tomov 611*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 612*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 613*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&magma_common_path)); 614*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&interp_path)); 615*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&grad_path)); 616*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&weight_path)); 617*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 618f6af633fSnbeams 619e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6207f5b9731SStan Tomov } 6217f5b9731SStan Tomov 6227f5b9731SStan Tomov #ifdef __cplusplus 6237f5b9731SStan Tomov CEED_INTERN "C" 6247f5b9731SStan Tomov #endif 625*2b730f8bSJeremy L Thompson int 626*2b730f8bSJeremy L Thompson CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad, 627*2b730f8bSJeremy L Thompson const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) { 628868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 6297f5b9731SStan Tomov Ceed ceed; 630*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 6317f5b9731SStan Tomov 632e0582403Sabdelfattah83 Ceed_Magma *data; 633*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 634e0582403Sabdelfattah83 63580a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { 636*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_f64_Magma)); 63780a9ef05SNatalie Beams } else { 638*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_f32_Magma)); 63980a9ef05SNatalie Beams } 640*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 641868539c2SNatalie Beams 642*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 643*2b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 644868539c2SNatalie Beams 645868539c2SNatalie Beams // Copy qref to the GPU 646*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0]))); 647e0582403Sabdelfattah83 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 648868539c2SNatalie Beams 649868539c2SNatalie Beams // Copy interp to the GPU 650*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0]))); 651*2b730f8bSJeremy L Thompson magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue); 652868539c2SNatalie Beams 653868539c2SNatalie Beams // Copy grad to the GPU 654*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0]))); 655*2b730f8bSJeremy L Thompson magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue); 656868539c2SNatalie Beams 657868539c2SNatalie Beams // Copy qweight to the GPU 658*2b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0]))); 659*2b730f8bSJeremy L Thompson magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue); 660868539c2SNatalie Beams 661e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6627f5b9731SStan Tomov } 663