13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 8ec3da8bcSJed Brown #include <ceed/ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 127f5b9731SStan Tomov #include "ceed-magma.h" 13*e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 14f6af633fSnbeams #include "../hip/ceed-hip-common.h" 15f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 16f6af633fSnbeams #else 17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 19f6af633fSnbeams #endif 207f5b9731SStan Tomov 217f5b9731SStan Tomov #ifdef __cplusplus 227f5b9731SStan Tomov CEED_INTERN "C" 237f5b9731SStan Tomov #endif 247f5b9731SStan Tomov int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 257f5b9731SStan Tomov CeedTransposeMode tmode, CeedEvalMode emode, 263513a710Sjeremylt CeedVector U, CeedVector V) { 277f5b9731SStan Tomov int ierr; 287f5b9731SStan Tomov Ceed ceed; 29e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 30e0582403Sabdelfattah83 CeedInt dim, ncomp, ndof; 31e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 32e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 33e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 34e0582403Sabdelfattah83 35e0582403Sabdelfattah83 Ceed_Magma *data; 36e15f9bd0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 37e0582403Sabdelfattah83 387f5b9731SStan Tomov const CeedScalar *u; 397f5b9731SStan Tomov CeedScalar *v; 40868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 41e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr); 427f5b9731SStan Tomov } else if (emode != CEED_EVAL_WEIGHT) { 437f5b9731SStan Tomov // LCOV_EXCL_START 44e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 457f5b9731SStan Tomov "An input vector is required for this CeedEvalMode"); 467f5b9731SStan Tomov // LCOV_EXCL_STOP 477f5b9731SStan Tomov } 489c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr); 497f5b9731SStan Tomov 507f5b9731SStan Tomov CeedBasis_Magma *impl; 51e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 527f5b9731SStan Tomov 537f5b9731SStan Tomov CeedInt P1d, Q1d; 54e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 55e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 567f5b9731SStan Tomov 573f21f6b1SJeremy L Thompson CeedDebug(ceed, "\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d", 587f5b9731SStan Tomov ncomp*CeedIntPow(P1d, dim), ncomp); 597f5b9731SStan Tomov 607f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 611f9221feSJeremy L Thompson CeedSize length; 62e15f9bd0SJeremy L Thompson ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr); 6380a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 6480a9ef05SNatalie Beams magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length, 6580a9ef05SNatalie Beams data->queue); 6680a9ef05SNatalie Beams } else { 6780a9ef05SNatalie Beams magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length, 6880a9ef05SNatalie Beams data->queue); 6980a9ef05SNatalie Beams } 70e0582403Sabdelfattah83 ceed_magma_queue_sync( data->queue ); 717f5b9731SStan Tomov } 72f6af633fSnbeams 733513a710Sjeremylt switch (emode) { 743513a710Sjeremylt case CEED_EVAL_INTERP: { 757f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 767f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 777f5b9731SStan Tomov P = Q1d; Q = P1d; 787f5b9731SStan Tomov } 797f5b9731SStan Tomov 807f5b9731SStan Tomov // Define element sizes for dofs/quad 817f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 827f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 837f5b9731SStan Tomov 847f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 85868539c2SNatalie Beams // component component 86868539c2SNatalie Beams // elem elem 877f5b9731SStan Tomov // node node 887f5b9731SStan Tomov 897f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 907f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 917f5b9731SStan Tomov 927f5b9731SStan Tomov // Element strides 93868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 947f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 957f5b9731SStan Tomov // Component strides 96868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 977f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 987f5b9731SStan Tomov 997f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 1007f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1017f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 1027f5b9731SStan Tomov // Element strides 103868539c2SNatalie Beams v_elstride = eldofssize; 1047f5b9731SStan Tomov u_elstride = elquadsize; 1057f5b9731SStan Tomov // Component strides 106868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1077f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1087f5b9731SStan Tomov } 1097f5b9731SStan Tomov 110f6af633fSnbeams CeedInt nthreads = 1; 111f6af633fSnbeams CeedInt ntcol = 1; 112f6af633fSnbeams CeedInt shmem = 0; 113f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 114f6af633fSnbeams 115f6af633fSnbeams switch (dim) { 116f6af633fSnbeams case 1: 117f6af633fSnbeams nthreads = maxPQ; 118f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 119f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * ( ncomp * (1*P + 1*Q) ); 120f6af633fSnbeams shmem += sizeof(CeedScalar) * (P*Q); 121f6af633fSnbeams break; 122f6af633fSnbeams case 2: 123f6af633fSnbeams nthreads = maxPQ; 124f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 125f6af633fSnbeams shmem += P*Q *sizeof(CeedScalar); // for sT 126f6af633fSnbeams shmem += ntcol * ( P*maxPQ*sizeof( 127f6af633fSnbeams CeedScalar) ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 128f6af633fSnbeams break; 129f6af633fSnbeams case 3: 130f6af633fSnbeams nthreads = maxPQ*maxPQ; 131f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 132f6af633fSnbeams shmem += sizeof(CeedScalar)* (P*Q); // for sT 133f6af633fSnbeams shmem += sizeof(CeedScalar)* ntcol * (CeedIntMax(P*P*maxPQ, 134f6af633fSnbeams P*Q*Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) 135f6af633fSnbeams } 136f6af633fSnbeams CeedInt grid = (nelem + ntcol-1) / ntcol; 137f6af633fSnbeams void *args[] = {&impl->dinterp1d, 138f6af633fSnbeams &u, &u_elstride, &u_compstride, 139f6af633fSnbeams &v, &v_elstride, &v_compstride, 140f6af633fSnbeams &nelem 141f6af633fSnbeams }; 142f6af633fSnbeams 143f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 144f6af633fSnbeams ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_interp_tr, grid, 145f6af633fSnbeams nthreads, ntcol, 1, shmem, 146f6af633fSnbeams args); CeedChkBackend(ierr); 147f6af633fSnbeams } else { 148f6af633fSnbeams ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_interp, grid, 149f6af633fSnbeams nthreads, ntcol, 1, shmem, 150f6af633fSnbeams args); CeedChkBackend(ierr); 151f6af633fSnbeams } 1527f5b9731SStan Tomov } 1533513a710Sjeremylt break; 1543513a710Sjeremylt case CEED_EVAL_GRAD: { 1557f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 1567f5b9731SStan Tomov // In CEED_NOTRANSPOSE mode: 1577f5b9731SStan Tomov // u is (P^dim x nc), column-major layout (nc = ncomp) 1587f5b9731SStan Tomov // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 1597f5b9731SStan Tomov // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 1607f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1617f5b9731SStan Tomov P = Q1d, Q = P1d; 1627f5b9731SStan Tomov } 1637f5b9731SStan Tomov 1647f5b9731SStan Tomov // Define element sizes for dofs/quad 1657f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 1667f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 1677f5b9731SStan Tomov 1687f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1697f5b9731SStan Tomov // dim 170868539c2SNatalie Beams // component component 171868539c2SNatalie Beams // elem elem 1727f5b9731SStan Tomov // node node 1737f5b9731SStan Tomov 1747f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 1757f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 1767f5b9731SStan Tomov 1777f5b9731SStan Tomov // Element strides 178868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 1797f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 1807f5b9731SStan Tomov // Component strides 181868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 1827f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 1837f5b9731SStan Tomov // Dimension strides 1847f5b9731SStan Tomov CeedInt u_dimstride = 0; 1857f5b9731SStan Tomov CeedInt v_dimstride = nelem * elquadsize * ncomp; 1867f5b9731SStan Tomov 1877f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 1887f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1897f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 1907f5b9731SStan Tomov // Element strides 191868539c2SNatalie Beams v_elstride = eldofssize; 1927f5b9731SStan Tomov u_elstride = elquadsize; 1937f5b9731SStan Tomov // Component strides 194868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1957f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1967f5b9731SStan Tomov // Dimension strides 1977f5b9731SStan Tomov v_dimstride = 0; 1987f5b9731SStan Tomov u_dimstride = nelem * elquadsize * ncomp; 1997f5b9731SStan Tomov 2007f5b9731SStan Tomov } 2017f5b9731SStan Tomov 202f6af633fSnbeams CeedInt nthreads = 1; 203f6af633fSnbeams CeedInt ntcol = 1; 204f6af633fSnbeams CeedInt shmem = 0; 205f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 206f6af633fSnbeams 207f6af633fSnbeams switch (dim) { 208f6af633fSnbeams case 1: 209f6af633fSnbeams nthreads = maxPQ; 210f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 211f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1*P + 1*Q)); 212f6af633fSnbeams shmem += sizeof(CeedScalar) * (P*Q); 213f6af633fSnbeams break; 214f6af633fSnbeams case 2: 215f6af633fSnbeams nthreads = maxPQ; 216f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 217f6af633fSnbeams shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad 218f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * 219f6af633fSnbeams (P*maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 220f6af633fSnbeams break; 221f6af633fSnbeams case 3: 222f6af633fSnbeams nthreads = maxPQ * maxPQ; 223f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 224f6af633fSnbeams shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad 225f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * CeedIntMax(P*P*P, 226f6af633fSnbeams (P*P*Q) + 227f6af633fSnbeams (P*Q*Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) 228f6af633fSnbeams } 229f6af633fSnbeams CeedInt grid = (nelem + ntcol-1) / ntcol; 230f6af633fSnbeams void *args[] = {&impl->dinterp1d, &impl->dgrad1d, 231f6af633fSnbeams &u, &u_elstride, &u_compstride, &u_dimstride, 232f6af633fSnbeams &v, &v_elstride, &v_compstride, &v_dimstride, 233f6af633fSnbeams &nelem 234f6af633fSnbeams }; 235f6af633fSnbeams 236f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 237f6af633fSnbeams ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_grad_tr, grid, 238f6af633fSnbeams nthreads, ntcol, 1, shmem, 239f6af633fSnbeams args); CeedChkBackend(ierr); 240f6af633fSnbeams } else { 241f6af633fSnbeams ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_grad, grid, 242f6af633fSnbeams nthreads, ntcol, 1, shmem, 243f6af633fSnbeams args); CeedChkBackend(ierr); 244f6af633fSnbeams } 2457f5b9731SStan Tomov } 2463513a710Sjeremylt break; 2473513a710Sjeremylt case CEED_EVAL_WEIGHT: { 2487f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) 2497f5b9731SStan Tomov // LCOV_EXCL_START 250e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 2517f5b9731SStan Tomov "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 2527f5b9731SStan Tomov // LCOV_EXCL_STOP 2537f5b9731SStan Tomov CeedInt Q = Q1d; 254f6af633fSnbeams CeedInt eldofssize = CeedIntPow(Q, dim); 255f6af633fSnbeams CeedInt nthreads = 1; 256f6af633fSnbeams CeedInt ntcol = 1; 257f6af633fSnbeams CeedInt shmem = 0; 258f6af633fSnbeams 259f6af633fSnbeams switch (dim) { 260f6af633fSnbeams case 1: 261f6af633fSnbeams nthreads = Q; 262f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 263f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 264f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * Q; // for output 265f6af633fSnbeams break; 266f6af633fSnbeams case 2: 267f6af633fSnbeams nthreads = Q; 268f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 269f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 270f6af633fSnbeams break; 271f6af633fSnbeams case 3: 272f6af633fSnbeams nthreads = Q * Q; 273f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 274f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 275f6af633fSnbeams } 276f6af633fSnbeams CeedInt grid = (nelem + ntcol-1) / ntcol; 277f6af633fSnbeams void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; 278f6af633fSnbeams 279f6af633fSnbeams ierr = MAGMA_RTC_RUN_KERNEL_DIM_SH(ceed, impl->magma_weight, grid, 280f6af633fSnbeams nthreads, ntcol, 1, shmem, 281f6af633fSnbeams args); CeedChkBackend(ierr); 2827f5b9731SStan Tomov } 2833513a710Sjeremylt break; 2843513a710Sjeremylt // LCOV_EXCL_START 2853513a710Sjeremylt case CEED_EVAL_DIV: 286e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2873513a710Sjeremylt case CEED_EVAL_CURL: 288e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2893513a710Sjeremylt case CEED_EVAL_NONE: 290e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 2913513a710Sjeremylt "CEED_EVAL_NONE does not make sense in this context"); 2923513a710Sjeremylt // LCOV_EXCL_STOP 2933513a710Sjeremylt } 2947f5b9731SStan Tomov 295e0582403Sabdelfattah83 // must sync to ensure completeness 296e0582403Sabdelfattah83 ceed_magma_queue_sync( data->queue ); 297e0582403Sabdelfattah83 2987f5b9731SStan Tomov if (emode!=CEED_EVAL_WEIGHT) { 299e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr); 3007f5b9731SStan Tomov } 301e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr); 302e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3037f5b9731SStan Tomov } 3047f5b9731SStan Tomov 3057f5b9731SStan Tomov #ifdef __cplusplus 3067f5b9731SStan Tomov CEED_INTERN "C" 3077f5b9731SStan Tomov #endif 30880a9ef05SNatalie Beams int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, 309868539c2SNatalie Beams CeedTransposeMode tmode, CeedEvalMode emode, 310868539c2SNatalie Beams CeedVector U, CeedVector V) { 311868539c2SNatalie Beams int ierr; 312868539c2SNatalie Beams Ceed ceed; 313e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 314e0582403Sabdelfattah83 315e0582403Sabdelfattah83 Ceed_Magma *data; 316e15f9bd0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 317e0582403Sabdelfattah83 318868539c2SNatalie Beams CeedInt dim, ncomp, ndof, nqpt; 319e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 320e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 321e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 322e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 323868539c2SNatalie Beams const CeedScalar *du; 324868539c2SNatalie Beams CeedScalar *dv; 325868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 326e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 327868539c2SNatalie Beams } else if (emode != CEED_EVAL_WEIGHT) { 328868539c2SNatalie Beams // LCOV_EXCL_START 329e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 330868539c2SNatalie Beams "An input vector is required for this CeedEvalMode"); 331868539c2SNatalie Beams // LCOV_EXCL_STOP 332868539c2SNatalie Beams } 3339c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 334868539c2SNatalie Beams 335868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 336e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 337868539c2SNatalie Beams 3383f21f6b1SJeremy L Thompson CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d", 339868539c2SNatalie Beams ncomp*ndof, ncomp); 340868539c2SNatalie Beams 341868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) { 3421f9221feSJeremy L Thompson CeedSize length; 343868539c2SNatalie Beams ierr = CeedVectorGetLength(V, &length); 34480a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 34580a9ef05SNatalie Beams magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, 34680a9ef05SNatalie Beams data->queue); 34780a9ef05SNatalie Beams } else { 34880a9ef05SNatalie Beams magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, 34980a9ef05SNatalie Beams data->queue); 35080a9ef05SNatalie Beams } 351e0582403Sabdelfattah83 ceed_magma_queue_sync( data->queue ); 352868539c2SNatalie Beams } 35380a9ef05SNatalie Beams 354868539c2SNatalie Beams switch (emode) { 355868539c2SNatalie Beams case CEED_EVAL_INTERP: { 356868539c2SNatalie Beams CeedInt P = ndof, Q = nqpt; 357868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) 358e0582403Sabdelfattah83 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 359868539c2SNatalie Beams P, nelem*ncomp, Q, 36080a9ef05SNatalie Beams 1.0, (double *)impl->dinterp, P, 36180a9ef05SNatalie Beams (double *)du, Q, 36280a9ef05SNatalie Beams 0.0, (double *)dv, P, data->queue); 363868539c2SNatalie Beams else 364e0582403Sabdelfattah83 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 365868539c2SNatalie Beams Q, nelem*ncomp, P, 36680a9ef05SNatalie Beams 1.0, (double *)impl->dinterp, P, 36780a9ef05SNatalie Beams (double *)du, P, 36880a9ef05SNatalie Beams 0.0, (double *)dv, Q, data->queue); 369868539c2SNatalie Beams } 370868539c2SNatalie Beams break; 371868539c2SNatalie Beams 372868539c2SNatalie Beams case CEED_EVAL_GRAD: { 373868539c2SNatalie Beams CeedInt P = ndof, Q = nqpt; 374868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) { 37580a9ef05SNatalie Beams CeedScalar beta = 0.0; 376868539c2SNatalie Beams for(int d=0; d<dim; d++) { 377868539c2SNatalie Beams if (d>0) 378868539c2SNatalie Beams beta = 1.0; 379e0582403Sabdelfattah83 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 380868539c2SNatalie Beams P, nelem*ncomp, Q, 38180a9ef05SNatalie Beams 1.0, (double *)(impl->dgrad + d*P*Q), P, 38280a9ef05SNatalie Beams (double *)(du + d*nelem*ncomp*Q), Q, 38380a9ef05SNatalie Beams beta, (double *)dv, P, data->queue); 384868539c2SNatalie Beams } 385868539c2SNatalie Beams } else { 386868539c2SNatalie Beams for(int d=0; d< dim; d++) 387e0582403Sabdelfattah83 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 388868539c2SNatalie Beams Q, nelem*ncomp, P, 38980a9ef05SNatalie Beams 1.0, (double *)(impl->dgrad + d*P*Q), P, 39080a9ef05SNatalie Beams (double *)du, P, 39180a9ef05SNatalie Beams 0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue); 39280a9ef05SNatalie Beams } 39380a9ef05SNatalie Beams } 39480a9ef05SNatalie Beams break; 39580a9ef05SNatalie Beams 39680a9ef05SNatalie Beams case CEED_EVAL_WEIGHT: { 39780a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) 39880a9ef05SNatalie Beams // LCOV_EXCL_START 39980a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, 40080a9ef05SNatalie Beams "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 40180a9ef05SNatalie Beams // LCOV_EXCL_STOP 40280a9ef05SNatalie Beams 40380a9ef05SNatalie Beams int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 40480a9ef05SNatalie Beams int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 40580a9ef05SNatalie Beams 1 : 0 ); 40680a9ef05SNatalie Beams magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 40780a9ef05SNatalie Beams data->queue); 40880a9ef05SNatalie Beams CeedChkBackend(ierr); 40980a9ef05SNatalie Beams } 41080a9ef05SNatalie Beams break; 41180a9ef05SNatalie Beams 41280a9ef05SNatalie Beams // LCOV_EXCL_START 41380a9ef05SNatalie Beams case CEED_EVAL_DIV: 41480a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 41580a9ef05SNatalie Beams case CEED_EVAL_CURL: 41680a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 41780a9ef05SNatalie Beams case CEED_EVAL_NONE: 41880a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, 41980a9ef05SNatalie Beams "CEED_EVAL_NONE does not make sense in this context"); 42080a9ef05SNatalie Beams // LCOV_EXCL_STOP 42180a9ef05SNatalie Beams } 42280a9ef05SNatalie Beams 42380a9ef05SNatalie Beams // must sync to ensure completeness 42480a9ef05SNatalie Beams ceed_magma_queue_sync( data->queue ); 42580a9ef05SNatalie Beams 42680a9ef05SNatalie Beams if (emode!=CEED_EVAL_WEIGHT) { 42780a9ef05SNatalie Beams ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 42880a9ef05SNatalie Beams } 42980a9ef05SNatalie Beams ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 43080a9ef05SNatalie Beams return CEED_ERROR_SUCCESS; 43180a9ef05SNatalie Beams } 43280a9ef05SNatalie Beams 43380a9ef05SNatalie Beams int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, 43480a9ef05SNatalie Beams CeedTransposeMode tmode, CeedEvalMode emode, 43580a9ef05SNatalie Beams CeedVector U, CeedVector V) { 43680a9ef05SNatalie Beams int ierr; 43780a9ef05SNatalie Beams Ceed ceed; 43880a9ef05SNatalie Beams ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 43980a9ef05SNatalie Beams 44080a9ef05SNatalie Beams Ceed_Magma *data; 44180a9ef05SNatalie Beams ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 44280a9ef05SNatalie Beams 44380a9ef05SNatalie Beams CeedInt dim, ncomp, ndof, nqpt; 44480a9ef05SNatalie Beams ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 44580a9ef05SNatalie Beams ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 44680a9ef05SNatalie Beams ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 44780a9ef05SNatalie Beams ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 44880a9ef05SNatalie Beams const CeedScalar *du; 44980a9ef05SNatalie Beams CeedScalar *dv; 45080a9ef05SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 45180a9ef05SNatalie Beams ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 45280a9ef05SNatalie Beams } else if (emode != CEED_EVAL_WEIGHT) { 45380a9ef05SNatalie Beams // LCOV_EXCL_START 45480a9ef05SNatalie Beams return CeedError(ceed, CEED_ERROR_BACKEND, 45580a9ef05SNatalie Beams "An input vector is required for this CeedEvalMode"); 45680a9ef05SNatalie Beams // LCOV_EXCL_STOP 45780a9ef05SNatalie Beams } 4589c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 45980a9ef05SNatalie Beams 46080a9ef05SNatalie Beams CeedBasisNonTensor_Magma *impl; 46180a9ef05SNatalie Beams ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 46280a9ef05SNatalie Beams 4633f21f6b1SJeremy L Thompson CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d", 46480a9ef05SNatalie Beams ncomp*ndof, ncomp); 46580a9ef05SNatalie Beams 46680a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) { 4671f9221feSJeremy L Thompson CeedSize length; 46880a9ef05SNatalie Beams ierr = CeedVectorGetLength(V, &length); 46980a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 47080a9ef05SNatalie Beams magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, 47180a9ef05SNatalie Beams data->queue); 47280a9ef05SNatalie Beams } else { 47380a9ef05SNatalie Beams magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, 47480a9ef05SNatalie Beams data->queue); 47580a9ef05SNatalie Beams } 47680a9ef05SNatalie Beams ceed_magma_queue_sync( data->queue ); 47780a9ef05SNatalie Beams } 47880a9ef05SNatalie Beams 47980a9ef05SNatalie Beams switch (emode) { 48080a9ef05SNatalie Beams case CEED_EVAL_INTERP: { 48180a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 48280a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) 48380a9ef05SNatalie Beams magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 48480a9ef05SNatalie Beams P, nelem*ncomp, Q, 48580a9ef05SNatalie Beams 1.0, (float *)impl->dinterp, P, 48680a9ef05SNatalie Beams (float *)du, Q, 48780a9ef05SNatalie Beams 0.0, (float *)dv, P, data->queue); 48880a9ef05SNatalie Beams else 48980a9ef05SNatalie Beams magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, 49080a9ef05SNatalie Beams Q, nelem*ncomp, P, 49180a9ef05SNatalie Beams 1.0, (float *)impl->dinterp, P, 49280a9ef05SNatalie Beams (float *)du, P, 49380a9ef05SNatalie Beams 0.0, (float *)dv, Q, data->queue); 49480a9ef05SNatalie Beams } 49580a9ef05SNatalie Beams break; 49680a9ef05SNatalie Beams 49780a9ef05SNatalie Beams case CEED_EVAL_GRAD: { 49880a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 49980a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) { 50080a9ef05SNatalie Beams CeedScalar beta = 0.0; 50180a9ef05SNatalie Beams for(int d=0; d<dim; d++) { 50280a9ef05SNatalie Beams if (d>0) 50380a9ef05SNatalie Beams beta = 1.0; 50480a9ef05SNatalie Beams magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 50580a9ef05SNatalie Beams P, nelem*ncomp, Q, 50680a9ef05SNatalie Beams 1.0, (float *)(impl->dgrad + d*P*Q), P, 50780a9ef05SNatalie Beams (float *)(du + d*nelem*ncomp*Q), Q, 50880a9ef05SNatalie Beams beta, (float *)dv, P, data->queue); 50980a9ef05SNatalie Beams } 51080a9ef05SNatalie Beams } else { 51180a9ef05SNatalie Beams for(int d=0; d< dim; d++) 51280a9ef05SNatalie Beams magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, 51380a9ef05SNatalie Beams Q, nelem*ncomp, P, 51480a9ef05SNatalie Beams 1.0, (float *)(impl->dgrad + d*P*Q), P, 51580a9ef05SNatalie Beams (float *)du, P, 51680a9ef05SNatalie Beams 0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue); 517868539c2SNatalie Beams } 518868539c2SNatalie Beams } 519868539c2SNatalie Beams break; 520868539c2SNatalie Beams 521868539c2SNatalie Beams case CEED_EVAL_WEIGHT: { 522868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) 523868539c2SNatalie Beams // LCOV_EXCL_START 524e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 525868539c2SNatalie Beams "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 526868539c2SNatalie Beams // LCOV_EXCL_STOP 527868539c2SNatalie Beams 528868539c2SNatalie Beams int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 529868539c2SNatalie Beams int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 530868539c2SNatalie Beams 1 : 0 ); 531e0582403Sabdelfattah83 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 532e0582403Sabdelfattah83 data->queue); 533e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 534868539c2SNatalie Beams } 535868539c2SNatalie Beams break; 536868539c2SNatalie Beams 537868539c2SNatalie Beams // LCOV_EXCL_START 538868539c2SNatalie Beams case CEED_EVAL_DIV: 539e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 540868539c2SNatalie Beams case CEED_EVAL_CURL: 541e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 542868539c2SNatalie Beams case CEED_EVAL_NONE: 543e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 544868539c2SNatalie Beams "CEED_EVAL_NONE does not make sense in this context"); 545868539c2SNatalie Beams // LCOV_EXCL_STOP 546868539c2SNatalie Beams } 547868539c2SNatalie Beams 548e0582403Sabdelfattah83 // must sync to ensure completeness 549e0582403Sabdelfattah83 ceed_magma_queue_sync( data->queue ); 550e0582403Sabdelfattah83 551868539c2SNatalie Beams if (emode!=CEED_EVAL_WEIGHT) { 552e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 553868539c2SNatalie Beams } 554e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 555e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 556868539c2SNatalie Beams } 557868539c2SNatalie Beams 558868539c2SNatalie Beams #ifdef __cplusplus 559868539c2SNatalie Beams CEED_INTERN "C" 560868539c2SNatalie Beams #endif 5613513a710Sjeremylt int CeedBasisDestroy_Magma(CeedBasis basis) { 5627f5b9731SStan Tomov int ierr; 5637f5b9731SStan Tomov CeedBasis_Magma *impl; 564e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 5657f5b9731SStan Tomov 566e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr); 567e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr); 568e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr); 569e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr); 570f6af633fSnbeams Ceed ceed; 571f6af633fSnbeams ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 572*e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 573f6af633fSnbeams ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 574f6af633fSnbeams #else 575f6af633fSnbeams ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr); 576f6af633fSnbeams #endif 5777f5b9731SStan Tomov 578e15f9bd0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 5797f5b9731SStan Tomov 580e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5817f5b9731SStan Tomov } 5827f5b9731SStan Tomov 5837f5b9731SStan Tomov #ifdef __cplusplus 5847f5b9731SStan Tomov CEED_INTERN "C" 5857f5b9731SStan Tomov #endif 586868539c2SNatalie Beams int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 587868539c2SNatalie Beams int ierr; 588868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 589e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 590868539c2SNatalie Beams 591e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dqref); CeedChkBackend(ierr); 592e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dinterp); CeedChkBackend(ierr); 593e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dgrad); CeedChkBackend(ierr); 594e15f9bd0SJeremy L Thompson ierr = magma_free(impl->dqweight); CeedChkBackend(ierr); 595868539c2SNatalie Beams 596e15f9bd0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 597868539c2SNatalie Beams 598e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 599868539c2SNatalie Beams } 600868539c2SNatalie Beams 601868539c2SNatalie Beams #ifdef __cplusplus 602868539c2SNatalie Beams CEED_INTERN "C" 603868539c2SNatalie Beams #endif 6043513a710Sjeremylt int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, 6053513a710Sjeremylt const CeedScalar *interp1d, 6067f5b9731SStan Tomov const CeedScalar *grad1d, 6077f5b9731SStan Tomov const CeedScalar *qref1d, 6083513a710Sjeremylt const CeedScalar *qweight1d, CeedBasis basis) { 6097f5b9731SStan Tomov int ierr; 6107f5b9731SStan Tomov CeedBasis_Magma *impl; 611f6af633fSnbeams ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 6127f5b9731SStan Tomov Ceed ceed; 613e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 6147f5b9731SStan Tomov 615c9f8acf2SJeremy L Thompson // Check for supported parameters 616c9f8acf2SJeremy L Thompson CeedInt ncomp = 0; 617e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 618e0582403Sabdelfattah83 Ceed_Magma *data; 619e15f9bd0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 620e0582403Sabdelfattah83 621f6af633fSnbeams // Compile kernels 622f6af633fSnbeams char *magma_common_path; 623f6af633fSnbeams char *interp_path, *grad_path, *weight_path; 624f6af633fSnbeams char *basis_kernel_source; 625f6af633fSnbeams ierr = CeedGetJitAbsolutePath(ceed, 626f6af633fSnbeams "ceed/jit-source/magma/magma_common_device.h", 627f6af633fSnbeams &magma_common_path); CeedChkBackend(ierr); 628f6af633fSnbeams CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 629f6af633fSnbeams ierr = CeedLoadSourceToBuffer(ceed, magma_common_path, 630f6af633fSnbeams &basis_kernel_source); 631f6af633fSnbeams CeedChkBackend(ierr); 632f6af633fSnbeams char *interp_name_base = "ceed/jit-source/magma/interp"; 633f6af633fSnbeams CeedInt interp_name_len = strlen(interp_name_base) + 6; 634f6af633fSnbeams char interp_name[interp_name_len]; 635f6af633fSnbeams snprintf(interp_name, interp_name_len, "%s-%dd.h", interp_name_base, dim); 636f6af633fSnbeams ierr = CeedGetJitAbsolutePath(ceed, interp_name, &interp_path); 637f6af633fSnbeams CeedChkBackend(ierr); 638f6af633fSnbeams ierr = CeedLoadSourceToInitializedBuffer(ceed, interp_path, 639f6af633fSnbeams &basis_kernel_source); 640f6af633fSnbeams CeedChkBackend(ierr); 641f6af633fSnbeams char *grad_name_base = "ceed/jit-source/magma/grad"; 642f6af633fSnbeams CeedInt grad_name_len = strlen(grad_name_base) + 6; 643f6af633fSnbeams char grad_name[grad_name_len]; 644f6af633fSnbeams snprintf(grad_name, grad_name_len, "%s-%dd.h", grad_name_base, dim); 645f6af633fSnbeams ierr = CeedGetJitAbsolutePath(ceed, grad_name, &grad_path); 646f6af633fSnbeams CeedChkBackend(ierr); 647f6af633fSnbeams ierr = CeedLoadSourceToInitializedBuffer(ceed, grad_path, 648f6af633fSnbeams &basis_kernel_source); 649f6af633fSnbeams CeedChkBackend(ierr); 650f6af633fSnbeams char *weight_name_base = "ceed/jit-source/magma/weight"; 651f6af633fSnbeams CeedInt weight_name_len = strlen(weight_name_base) + 6; 652f6af633fSnbeams char weight_name[weight_name_len]; 653f6af633fSnbeams snprintf(weight_name, weight_name_len, "%s-%dd.h", weight_name_base, dim); 654f6af633fSnbeams ierr = CeedGetJitAbsolutePath(ceed, weight_name, &weight_path); 655f6af633fSnbeams CeedChkBackend(ierr); 656f6af633fSnbeams ierr = CeedLoadSourceToInitializedBuffer(ceed, weight_path, 657f6af633fSnbeams &basis_kernel_source); 658f6af633fSnbeams CeedChkBackend(ierr); 659f6af633fSnbeams CeedDebug256(ceed, 2, 660f6af633fSnbeams "----- Loading Basis Kernel Source Complete! -----\n"); 661f6af633fSnbeams // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip 662f6af633fSnbeams // data 663f6af633fSnbeams Ceed delegate; 664f6af633fSnbeams ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr); 665f6af633fSnbeams ierr = MAGMA_RTC_COMPILE(delegate, basis_kernel_source, &impl->module, 5, 666f6af633fSnbeams "DIM", dim, 667f6af633fSnbeams "NCOMP", ncomp, 668f6af633fSnbeams "P", P1d, 669f6af633fSnbeams "Q", Q1d, 670f6af633fSnbeams "MAXPQ", CeedIntMax(P1d, Q1d)); 671f6af633fSnbeams CeedChkBackend(ierr); 672f6af633fSnbeams 673f6af633fSnbeams // Kernel setup 674f6af633fSnbeams switch (dim) { 675f6af633fSnbeams case 1: 676f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_1d_kernel", 677f6af633fSnbeams &impl->magma_interp); 678f6af633fSnbeams CeedChkBackend(ierr); 679f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_1d_kernel", 680f6af633fSnbeams &impl->magma_interp_tr); 681f6af633fSnbeams CeedChkBackend(ierr); 682f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_1d_kernel", 683f6af633fSnbeams &impl->magma_grad); 684f6af633fSnbeams CeedChkBackend(ierr); 685f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_1d_kernel", 686f6af633fSnbeams &impl->magma_grad_tr); 687f6af633fSnbeams CeedChkBackend(ierr); 688f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_1d_kernel", 689f6af633fSnbeams &impl->magma_weight); 690f6af633fSnbeams CeedChkBackend(ierr); 691f6af633fSnbeams break; 692f6af633fSnbeams case 2: 693f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_2d_kernel", 694f6af633fSnbeams &impl->magma_interp); 695f6af633fSnbeams CeedChkBackend(ierr); 696f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_2d_kernel", 697f6af633fSnbeams &impl->magma_interp_tr); 698f6af633fSnbeams CeedChkBackend(ierr); 699f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_2d_kernel", 700f6af633fSnbeams &impl->magma_grad); 701f6af633fSnbeams CeedChkBackend(ierr); 702f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_2d_kernel", 703f6af633fSnbeams &impl->magma_grad_tr); 704f6af633fSnbeams CeedChkBackend(ierr); 705f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_2d_kernel", 706f6af633fSnbeams &impl->magma_weight); 707f6af633fSnbeams CeedChkBackend(ierr); 708f6af633fSnbeams break; 709f6af633fSnbeams case 3: 710f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpn_3d_kernel", 711f6af633fSnbeams &impl->magma_interp); 712f6af633fSnbeams CeedChkBackend(ierr); 713f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_interpt_3d_kernel", 714f6af633fSnbeams &impl->magma_interp_tr); 715f6af633fSnbeams CeedChkBackend(ierr); 716f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradn_3d_kernel", 717f6af633fSnbeams &impl->magma_grad); 718f6af633fSnbeams CeedChkBackend(ierr); 719f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_gradt_3d_kernel", 720f6af633fSnbeams &impl->magma_grad_tr); 721f6af633fSnbeams CeedChkBackend(ierr); 722f6af633fSnbeams ierr = MAGMA_RTC_GET_KERNEL(ceed, impl->module, "magma_weight_3d_kernel", 723f6af633fSnbeams &impl->magma_weight); 724f6af633fSnbeams CeedChkBackend(ierr); 725f6af633fSnbeams } 726f6af633fSnbeams 7277f5b9731SStan Tomov ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 728e15f9bd0SJeremy L Thompson CeedBasisApply_Magma); CeedChkBackend(ierr); 7297f5b9731SStan Tomov ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 730e15f9bd0SJeremy L Thompson CeedBasisDestroy_Magma); CeedChkBackend(ierr); 7317f5b9731SStan Tomov 7327f5b9731SStan Tomov // Copy qref1d to the GPU 7337f5b9731SStan Tomov ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 734e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 735e0582403Sabdelfattah83 magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, 736e0582403Sabdelfattah83 data->queue); 7377f5b9731SStan Tomov 7387f5b9731SStan Tomov // Copy interp1d to the GPU 7397f5b9731SStan Tomov ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 740e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 741e0582403Sabdelfattah83 magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, 742e0582403Sabdelfattah83 data->queue); 7437f5b9731SStan Tomov 7447f5b9731SStan Tomov // Copy grad1d to the GPU 7457f5b9731SStan Tomov ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 746e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 747e0582403Sabdelfattah83 magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, 748e0582403Sabdelfattah83 data->queue); 7497f5b9731SStan Tomov 7507f5b9731SStan Tomov // Copy qweight1d to the GPU 7517f5b9731SStan Tomov ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 752e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 753e0582403Sabdelfattah83 magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, 754e0582403Sabdelfattah83 data->queue); 7557f5b9731SStan Tomov 756f6af633fSnbeams ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 757f6af633fSnbeams 758e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7597f5b9731SStan Tomov } 7607f5b9731SStan Tomov 7617f5b9731SStan Tomov #ifdef __cplusplus 7627f5b9731SStan Tomov CEED_INTERN "C" 7637f5b9731SStan Tomov #endif 7643513a710Sjeremylt int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, 7653513a710Sjeremylt CeedInt nqpts, const CeedScalar *interp, 7663513a710Sjeremylt const CeedScalar *grad, const CeedScalar *qref, 7673513a710Sjeremylt const CeedScalar *qweight, CeedBasis basis) { 7687f5b9731SStan Tomov int ierr; 769868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 7707f5b9731SStan Tomov Ceed ceed; 771e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 7727f5b9731SStan Tomov 773e0582403Sabdelfattah83 Ceed_Magma *data; 774e15f9bd0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 775e0582403Sabdelfattah83 77680a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { 777868539c2SNatalie Beams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 77880a9ef05SNatalie Beams CeedBasisApplyNonTensor_f64_Magma); 77980a9ef05SNatalie Beams CeedChkBackend(ierr); 78080a9ef05SNatalie Beams } else { 78180a9ef05SNatalie Beams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 78280a9ef05SNatalie Beams CeedBasisApplyNonTensor_f32_Magma); 78380a9ef05SNatalie Beams CeedChkBackend(ierr); 78480a9ef05SNatalie Beams } 785868539c2SNatalie Beams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 786e15f9bd0SJeremy L Thompson CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr); 787868539c2SNatalie Beams 788e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 789e15f9bd0SJeremy L Thompson ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 790868539c2SNatalie Beams 791868539c2SNatalie Beams // Copy qref to the GPU 792868539c2SNatalie Beams ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); 793e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 794e0582403Sabdelfattah83 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 795868539c2SNatalie Beams 796868539c2SNatalie Beams // Copy interp to the GPU 797868539c2SNatalie Beams ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); 798e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 799e0582403Sabdelfattah83 magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, 800e0582403Sabdelfattah83 data->queue); 801868539c2SNatalie Beams 802868539c2SNatalie Beams // Copy grad to the GPU 803868539c2SNatalie Beams ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); 804e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 805e0582403Sabdelfattah83 magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, 806e0582403Sabdelfattah83 data->queue); 807868539c2SNatalie Beams 808868539c2SNatalie Beams // Copy qweight to the GPU 809868539c2SNatalie Beams ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); 810e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 811e0582403Sabdelfattah83 magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, 812e0582403Sabdelfattah83 data->queue); 813868539c2SNatalie Beams 814e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 8157f5b9731SStan Tomov } 816