13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 849aac155SJeremy L Thompson #include <ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 122b730f8bSJeremy L Thompson 13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 14f6af633fSnbeams #include "../hip/ceed-hip-common.h" 15f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 16f6af633fSnbeams #else 17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 19f6af633fSnbeams #endif 2000fb7a04SSebastian Grimberg #include "ceed-magma-common.h" 2100fb7a04SSebastian Grimberg #include "ceed-magma.h" 227f5b9731SStan Tomov 237f5b9731SStan Tomov #ifdef __cplusplus 247f5b9731SStan Tomov CEED_INTERN "C" 257f5b9731SStan Tomov #endif 262b730f8bSJeremy L Thompson int 27*38293ee6SJeremy L Thompson CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) { 287f5b9731SStan Tomov Ceed ceed; 29e0582403Sabdelfattah83 Ceed_Magma *data; 30*38293ee6SJeremy L Thompson CeedInt dim, num_comp, num_dof, P_1d, Q_1d; 316574a04fSJeremy L Thompson const CeedScalar *du; 326574a04fSJeremy L Thompson CeedScalar *dv; 33*38293ee6SJeremy L Thompson CeedBasis_Magma *impl; 34*38293ee6SJeremy L Thompson 35*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 36*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof)); 39*38293ee6SJeremy L Thompson 40*38293ee6SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 41*38293ee6SJeremy L Thompson 426574a04fSJeremy L Thompson if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); 43*38293ee6SJeremy L Thompson else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 446574a04fSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); 457f5b9731SStan Tomov 462b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 477f5b9731SStan Tomov 48*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 49*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 507f5b9731SStan Tomov 51*38293ee6SJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * CeedIntPow(P_1d, dim), num_comp); 527f5b9731SStan Tomov 53*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 541f9221feSJeremy L Thompson CeedSize length; 55*38293ee6SJeremy L Thompson 562b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 5780a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 586574a04fSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); 5980a9ef05SNatalie Beams } else { 606574a04fSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); 6180a9ef05SNatalie Beams } 62e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 637f5b9731SStan Tomov } 64f6af633fSnbeams 65*38293ee6SJeremy L Thompson switch (e_mode) { 663513a710Sjeremylt case CEED_EVAL_INTERP: { 67*38293ee6SJeremy L Thompson CeedInt P = P_1d, Q = Q_1d; 68*38293ee6SJeremy L Thompson 69*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 70*38293ee6SJeremy L Thompson P = Q_1d; 71*38293ee6SJeremy L Thompson Q = P_1d; 727f5b9731SStan Tomov } 737f5b9731SStan Tomov 747f5b9731SStan Tomov // Define element sizes for dofs/quad 75*38293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 76*38293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 777f5b9731SStan Tomov 787f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 79868539c2SNatalie Beams // component component 80868539c2SNatalie Beams // elem elem 817f5b9731SStan Tomov // node node 827f5b9731SStan Tomov 837f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 846574a04fSJeremy L Thompson // Input (du) is E-vector, output (dv) is Q-vector 857f5b9731SStan Tomov 867f5b9731SStan Tomov // Element strides 87*38293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 88*38293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 897f5b9731SStan Tomov // Component strides 90*38293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 91*38293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 927f5b9731SStan Tomov 937f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 94*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 956574a04fSJeremy L Thompson // Input (du) is Q-vector, output (dv) is E-vector 967f5b9731SStan Tomov // Element strides 97*38293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 98*38293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 997f5b9731SStan Tomov // Component strides 100*38293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 101*38293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 1027f5b9731SStan Tomov } 103*38293ee6SJeremy L Thompson CeedInt num_threads = 1; 104*38293ee6SJeremy L Thompson CeedInt num_t_col = 1; 105*38293ee6SJeremy L Thompson CeedInt shared_mem = 0; 106*38293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 107f6af633fSnbeams 108f6af633fSnbeams switch (dim) { 109f6af633fSnbeams case 1: 110*38293ee6SJeremy L Thompson num_threads = max_P_Q; 111*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 112*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 113*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 114f6af633fSnbeams break; 115f6af633fSnbeams case 2: 116*38293ee6SJeremy L Thompson num_threads = max_P_Q; 117*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 118*38293ee6SJeremy L Thompson shared_mem += P * Q * sizeof(CeedScalar); // for sT 119*38293ee6SJeremy L Thompson shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ 120f6af633fSnbeams break; 121f6af633fSnbeams case 3: 122*38293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 123*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 124*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); // for sT 125*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * 126*38293ee6SJeremy L Thompson (CeedIntMax(P * P * max_P_Q, 127f6af633fSnbeams P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) 128f6af633fSnbeams } 129*38293ee6SJeremy L Thompson CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; 130*38293ee6SJeremy L Thompson void *args[] = {&impl->d_interp_1d, &du, &u_elem_stride, &u_comp_stride, &dv, &v_elem_stride, &v_comp_stride, &num_elem}; 131f6af633fSnbeams 132*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 133*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, num_threads, num_t_col, 1, shared_mem, args)); 134f6af633fSnbeams } else { 135*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, num_threads, num_t_col, 1, shared_mem, args)); 136f6af633fSnbeams } 1372b730f8bSJeremy L Thompson } break; 1383513a710Sjeremylt case CEED_EVAL_GRAD: { 139*38293ee6SJeremy L Thompson CeedInt P = P_1d, Q = Q_1d; 140*38293ee6SJeremy L Thompson 1417f5b9731SStan Tomov // In CEED_NOTRANSPOSE mode: 142*38293ee6SJeremy L Thompson // du is (P^dim x nc), column-major layout (nc = num_comp) 143*38293ee6SJeremy L Thompson // dv is (Q^dim x nc x dim), column-major layout (nc = num_comp) 1446574a04fSJeremy L Thompson // In CEED_TRANSPOSE mode, the sizes of du and dv are switched. 145*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 146*38293ee6SJeremy L Thompson P = Q_1d; 147*38293ee6SJeremy L Thompson Q = P_1d; 1487f5b9731SStan Tomov } 1497f5b9731SStan Tomov 1507f5b9731SStan Tomov // Define element sizes for dofs/quad 151*38293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 152*38293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 1537f5b9731SStan Tomov 1547f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1557f5b9731SStan Tomov // dim 156868539c2SNatalie Beams // component component 157868539c2SNatalie Beams // elem elem 1587f5b9731SStan Tomov // node node 1597f5b9731SStan Tomov 1607f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 1616574a04fSJeremy L Thompson // Input (du) is E-vector, output (dv) is Q-vector 1627f5b9731SStan Tomov 1637f5b9731SStan Tomov // Element strides 164*38293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 165*38293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 1667f5b9731SStan Tomov // Component strides 167*38293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 168*38293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 1697f5b9731SStan Tomov // Dimension strides 170*38293ee6SJeremy L Thompson CeedInt u_dim_stride = 0; 171*38293ee6SJeremy L Thompson CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp; 1727f5b9731SStan Tomov 1737f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 174*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1756574a04fSJeremy L Thompson // Input (du) is Q-vector, output (dv) is E-vector 1767f5b9731SStan Tomov // Element strides 177*38293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 178*38293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 1797f5b9731SStan Tomov // Component strides 180*38293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 181*38293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 1827f5b9731SStan Tomov // Dimension strides 183*38293ee6SJeremy L Thompson v_dim_stride = 0; 184*38293ee6SJeremy L Thompson u_dim_stride = num_elem * elem_qpts_size * num_comp; 1857f5b9731SStan Tomov } 186*38293ee6SJeremy L Thompson CeedInt num_threads = 1; 187*38293ee6SJeremy L Thompson CeedInt num_t_col = 1; 188*38293ee6SJeremy L Thompson CeedInt shared_mem = 0; 189*38293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 190f6af633fSnbeams 191f6af633fSnbeams switch (dim) { 192f6af633fSnbeams case 1: 193*38293ee6SJeremy L Thompson num_threads = max_P_Q; 194*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 195*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 196*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 197f6af633fSnbeams break; 198f6af633fSnbeams case 2: 199*38293ee6SJeremy L Thompson num_threads = max_P_Q; 200*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 201*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 202*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q); // for reforming rU we need PxP, and for the intermediate output we need PxQ 203f6af633fSnbeams break; 204f6af633fSnbeams case 3: 205*38293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 206*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 207*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 208*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * 2092b730f8bSJeremy L Thompson CeedIntMax(P * P * P, 2102b730f8bSJeremy L Thompson (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) 211f6af633fSnbeams } 212*38293ee6SJeremy L Thompson CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; 213*38293ee6SJeremy L Thompson void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &du, &u_elem_stride, &u_comp_stride, &u_dim_stride, &dv, 214*38293ee6SJeremy L Thompson &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; 215f6af633fSnbeams 216*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 217*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, num_threads, num_t_col, 1, shared_mem, args)); 218f6af633fSnbeams } else { 219*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, num_threads, num_t_col, 1, shared_mem, args)); 220f6af633fSnbeams } 2212b730f8bSJeremy L Thompson } break; 2223513a710Sjeremylt case CEED_EVAL_WEIGHT: { 223*38293ee6SJeremy L Thompson CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE"); 224*38293ee6SJeremy L Thompson CeedInt Q = Q_1d; 225*38293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(Q, dim); 226*38293ee6SJeremy L Thompson CeedInt num_threads = 1; 227*38293ee6SJeremy L Thompson CeedInt num_t_col = 1; 228*38293ee6SJeremy L Thompson CeedInt shared_mem = 0; 229f6af633fSnbeams 230f6af633fSnbeams switch (dim) { 231f6af633fSnbeams case 1: 232*38293ee6SJeremy L Thompson num_threads = Q; 233*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 234*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 235*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * Q; // for output 236f6af633fSnbeams break; 237f6af633fSnbeams case 2: 238*38293ee6SJeremy L Thompson num_threads = Q; 239*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 240*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 241f6af633fSnbeams break; 242f6af633fSnbeams case 3: 243*38293ee6SJeremy L Thompson num_threads = Q * Q; 244*38293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 245*38293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 246f6af633fSnbeams } 247*38293ee6SJeremy L Thompson CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; 248*38293ee6SJeremy L Thompson void *args[] = {&impl->d_q_weight_1d, &dv, &elem_dofs_size, &num_elem}; 249f6af633fSnbeams 250*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, num_threads, num_t_col, 1, shared_mem, args)); 2512b730f8bSJeremy L Thompson } break; 2523513a710Sjeremylt // LCOV_EXCL_START 2533513a710Sjeremylt case CEED_EVAL_DIV: 254e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2553513a710Sjeremylt case CEED_EVAL_CURL: 256e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2573513a710Sjeremylt case CEED_EVAL_NONE: 2582b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 2593513a710Sjeremylt // LCOV_EXCL_STOP 2603513a710Sjeremylt } 2617f5b9731SStan Tomov 262e0582403Sabdelfattah83 // must sync to ensure completeness 263e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 264e0582403Sabdelfattah83 265*38293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 2666574a04fSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); 2677f5b9731SStan Tomov } 2686574a04fSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &dv)); 269e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2707f5b9731SStan Tomov } 2717f5b9731SStan Tomov 2727f5b9731SStan Tomov #ifdef __cplusplus 2737f5b9731SStan Tomov CEED_INTERN "C" 2747f5b9731SStan Tomov #endif 2752b730f8bSJeremy L Thompson int 276*38293ee6SJeremy L Thompson CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) { 277868539c2SNatalie Beams Ceed ceed; 278e0582403Sabdelfattah83 Ceed_Magma *data; 279*38293ee6SJeremy L Thompson CeedInt dim, num_comp, num_dof, num_qpts, NB = 1; 280868539c2SNatalie Beams const CeedScalar *du; 281868539c2SNatalie Beams CeedScalar *dv; 282*38293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 283*38293ee6SJeremy L Thompson CeedMagmaFunction *interp, *grad; 284*38293ee6SJeremy L Thompson 285*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 286*38293ee6SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 287*38293ee6SJeremy L Thompson magma_int_t arch = magma_getdevice_arch(); 288*38293ee6SJeremy L Thompson 289*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 290*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 291*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof)); 292*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 293*38293ee6SJeremy L Thompson CeedInt P = num_dof, Q = num_qpts, N = num_elem * num_comp; 294*38293ee6SJeremy L Thompson 2956574a04fSJeremy L Thompson if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); 296*38293ee6SJeremy L Thompson else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 2972b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); 298868539c2SNatalie Beams 2992b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 300868539c2SNatalie Beams 301*38293ee6SJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * num_dof, num_comp); 302868539c2SNatalie Beams 303*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 3041f9221feSJeremy L Thompson CeedSize length; 305*38293ee6SJeremy L Thompson 3062b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 30780a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 3082b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); 30980a9ef05SNatalie Beams } else { 3102b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); 31180a9ef05SNatalie Beams } 312e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 313868539c2SNatalie Beams } 31480a9ef05SNatalie Beams 315*38293ee6SJeremy L Thompson CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; 316023b8a51Sabdelfattah83 CeedInt iN = 0; 317*38293ee6SJeremy L Thompson CeedInt diff = abs(n_array[iN] - N); 318*38293ee6SJeremy L Thompson 319023b8a51Sabdelfattah83 for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 320*38293ee6SJeremy L Thompson CeedInt idiff = abs(n_array[in] - N); 321023b8a51Sabdelfattah83 if (idiff < diff) { 322023b8a51Sabdelfattah83 iN = in; 323023b8a51Sabdelfattah83 diff = idiff; 324868539c2SNatalie Beams } 32580a9ef05SNatalie Beams } 32680a9ef05SNatalie Beams 327*38293ee6SJeremy L Thompson NB = nontensor_rtc_get_nb(arch, 'd', e_mode, t_mode, P, n_array[iN], Q); 328*38293ee6SJeremy L Thompson interp = (t_mode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN]; 329*38293ee6SJeremy L Thompson grad = (t_mode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN]; 33080a9ef05SNatalie Beams 331*38293ee6SJeremy L Thompson switch (e_mode) { 33280a9ef05SNatalie Beams case CEED_EVAL_INTERP: { 333*38293ee6SJeremy L Thompson CeedInt P = num_dof, Q = num_qpts; 334023b8a51Sabdelfattah83 if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 335*38293ee6SJeremy L Thompson CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q; 336*38293ee6SJeremy L Thompson CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P; 337*38293ee6SJeremy L Thompson CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M); 338*38293ee6SJeremy L Thompson CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0; 339*38293ee6SJeremy L Thompson shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar); 340*38293ee6SJeremy L Thompson shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); 341*38293ee6SJeremy L Thompson shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B); 342023b8a51Sabdelfattah83 343*38293ee6SJeremy L Thompson CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col); 344*38293ee6SJeremy L Thompson magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; 345*38293ee6SJeremy L Thompson magma_trans_t trans_B = MagmaNoTrans; 346023b8a51Sabdelfattah83 CeedScalar alpha = 1.0, beta = 0.0; 347023b8a51Sabdelfattah83 348*38293ee6SJeremy L Thompson void *args[] = {&trans_A, &trans_B, &N, &alpha, &impl->d_interp, &P, &du, &K, &beta, &dv, &M}; 349*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, num_t_col, 1, shared_mem, args)); 350023b8a51Sabdelfattah83 } else { 351*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 352*38293ee6SJeremy L Thompson magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_interp, P, du, Q, 0.0, dv, P, data->queue); 353*38293ee6SJeremy L Thompson } else { 354*38293ee6SJeremy L Thompson magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_interp, P, du, P, 0.0, dv, Q, data->queue); 355*38293ee6SJeremy L Thompson } 356023b8a51Sabdelfattah83 } 3572b730f8bSJeremy L Thompson } break; 35880a9ef05SNatalie Beams 35980a9ef05SNatalie Beams case CEED_EVAL_GRAD: { 360*38293ee6SJeremy L Thompson CeedInt P = num_dof, Q = num_qpts; 361023b8a51Sabdelfattah83 if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 362*38293ee6SJeremy L Thompson CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q; 363*38293ee6SJeremy L Thompson CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P; 364*38293ee6SJeremy L Thompson CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M); 365*38293ee6SJeremy L Thompson CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0; 366*38293ee6SJeremy L Thompson shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar); 367*38293ee6SJeremy L Thompson shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); 368*38293ee6SJeremy L Thompson shared_mem = shared_mem_A + shared_mem_B; 369023b8a51Sabdelfattah83 370*38293ee6SJeremy L Thompson CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col); 371*38293ee6SJeremy L Thompson magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; 372*38293ee6SJeremy L Thompson magma_trans_t trans_B = MagmaNoTrans; 373023b8a51Sabdelfattah83 374*38293ee6SJeremy L Thompson void *args[] = {&trans_A, &trans_B, &N, &impl->d_grad, &P, &du, &K, &dv, &M}; 375*38293ee6SJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, num_t_col, 1, shared_mem, args)); 376023b8a51Sabdelfattah83 } else { 377*38293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 37880a9ef05SNatalie Beams CeedScalar beta = 0.0; 37980a9ef05SNatalie Beams for (int d = 0; d < dim; d++) { 3802b730f8bSJeremy L Thompson if (d > 0) beta = 1.0; 381*38293ee6SJeremy L Thompson magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_grad + d * P * Q, P, 382*38293ee6SJeremy L Thompson du + d * num_elem * num_comp * Q, Q, beta, dv, P, data->queue); 38380a9ef05SNatalie Beams } 38480a9ef05SNatalie Beams } else { 38580a9ef05SNatalie Beams for (int d = 0; d < dim; d++) 386*38293ee6SJeremy L Thompson magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_grad + d * P * Q, P, du, P, 0.0, 387*38293ee6SJeremy L Thompson dv + d * num_elem * num_comp * Q, Q, data->queue); 388023b8a51Sabdelfattah83 } 389868539c2SNatalie Beams } 3902b730f8bSJeremy L Thompson } break; 391868539c2SNatalie Beams 392868539c2SNatalie Beams case CEED_EVAL_WEIGHT: { 393*38293ee6SJeremy L Thompson CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE"); 394868539c2SNatalie Beams 395*38293ee6SJeremy L Thompson int elemsPerBlock = 1; // basis->Q_1d < 7 ? optElems[basis->Q_1d] : 1; 396*38293ee6SJeremy L Thompson int grid = num_elem / elemsPerBlock + ((num_elem / elemsPerBlock * elemsPerBlock < num_elem) ? 1 : 0); 397*38293ee6SJeremy L Thompson 398*38293ee6SJeremy L Thompson magma_weight_nontensor(grid, num_qpts, num_elem, num_qpts, impl->d_q_weight, dv, data->queue); 3992b730f8bSJeremy L Thompson } break; 400868539c2SNatalie Beams 401868539c2SNatalie Beams // LCOV_EXCL_START 402868539c2SNatalie Beams case CEED_EVAL_DIV: 403e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 404868539c2SNatalie Beams case CEED_EVAL_CURL: 405e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 406868539c2SNatalie Beams case CEED_EVAL_NONE: 4072b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 408868539c2SNatalie Beams // LCOV_EXCL_STOP 409868539c2SNatalie Beams } 410868539c2SNatalie Beams 411e0582403Sabdelfattah83 // must sync to ensure completeness 412e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 413e0582403Sabdelfattah83 414*38293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 4152b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); 416868539c2SNatalie Beams } 4172b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &dv)); 418e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 419868539c2SNatalie Beams } 420868539c2SNatalie Beams 421868539c2SNatalie Beams #ifdef __cplusplus 422868539c2SNatalie Beams CEED_INTERN "C" 423868539c2SNatalie Beams #endif 4242b730f8bSJeremy L Thompson int 4252b730f8bSJeremy L Thompson CeedBasisDestroy_Magma(CeedBasis basis) { 426f6af633fSnbeams Ceed ceed; 427*38293ee6SJeremy L Thompson CeedBasis_Magma *impl; 428*38293ee6SJeremy L Thompson 429*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 430*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_ref_1d)); 431*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_interp_1d)); 432*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_grad_1d)); 433*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_weight_1d)); 4342b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 435e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 4362b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 437f6af633fSnbeams #else 4382b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->module)); 439f6af633fSnbeams #endif 4402b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 441e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4427f5b9731SStan Tomov } 4437f5b9731SStan Tomov 4447f5b9731SStan Tomov #ifdef __cplusplus 4457f5b9731SStan Tomov CEED_INTERN "C" 4467f5b9731SStan Tomov #endif 4472b730f8bSJeremy L Thompson int 4482b730f8bSJeremy L Thompson CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 449023b8a51Sabdelfattah83 Ceed ceed; 450*38293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 451*38293ee6SJeremy L Thompson 452*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 453*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_ref)); 454*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_interp)); 455*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_grad)); 456*38293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_weight)); 457023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 458023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP 459023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 460023b8a51Sabdelfattah83 CeedCallHip(ceed, hipModuleUnload(impl->module[in])); 461023b8a51Sabdelfattah83 } 462023b8a51Sabdelfattah83 #else 463023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 464023b8a51Sabdelfattah83 CeedCallCuda(ceed, cuModuleUnload(impl->module[in])); 465023b8a51Sabdelfattah83 } 466023b8a51Sabdelfattah83 #endif 4672b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 468e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 469868539c2SNatalie Beams } 470868539c2SNatalie Beams 471868539c2SNatalie Beams #ifdef __cplusplus 472868539c2SNatalie Beams CEED_INTERN "C" 473868539c2SNatalie Beams #endif 4742b730f8bSJeremy L Thompson int 475*38293ee6SJeremy L Thompson CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 476*38293ee6SJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 477*38293ee6SJeremy L Thompson Ceed ceed, ceed_delegate; 478*38293ee6SJeremy L Thompson Ceed_Magma *data; 479*38293ee6SJeremy L Thompson char *magma_common_path, *interp_path, *grad_path, *weight_path, *basis_kernel_source; 480*38293ee6SJeremy L Thompson CeedInt num_comp = 0; 4817f5b9731SStan Tomov CeedBasis_Magma *impl; 482*38293ee6SJeremy L Thompson 4832b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 4842b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4857f5b9731SStan Tomov 486c9f8acf2SJeremy L Thompson // Check for supported parameters 487*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 4882b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 489e0582403Sabdelfattah83 490f6af633fSnbeams // Compile kernels 491023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path)); 49223d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 4932b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); 494023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path)); 495023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source)); 496f6af633fSnbeams char *interp_name_base = "ceed/jit-source/magma/interp"; 497f6af633fSnbeams CeedInt interp_name_len = strlen(interp_name_base) + 6; 498f6af633fSnbeams char interp_name[interp_name_len]; 499*38293ee6SJeremy L Thompson 5002b730f8bSJeremy L Thompson snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim); 5012b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path)); 5022b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); 503f6af633fSnbeams char *grad_name_base = "ceed/jit-source/magma/grad"; 504f6af633fSnbeams CeedInt grad_name_len = strlen(grad_name_base) + 6; 505f6af633fSnbeams char grad_name[grad_name_len]; 506*38293ee6SJeremy L Thompson 5072b730f8bSJeremy L Thompson snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim); 5082b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path)); 5092b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); 510f6af633fSnbeams char *weight_name_base = "ceed/jit-source/magma/weight"; 511f6af633fSnbeams CeedInt weight_name_len = strlen(weight_name_base) + 6; 512f6af633fSnbeams char weight_name[weight_name_len]; 513*38293ee6SJeremy L Thompson 5142b730f8bSJeremy L Thompson snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim); 5152b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path)); 5162b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source)); 51723d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 518f6af633fSnbeams // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip 519f6af633fSnbeams // data 520*38293ee6SJeremy L Thompson CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 521*38293ee6SJeremy L Thompson CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", num_comp, "P", P_1d, "Q", Q_1d, "MAXPQ", 522*38293ee6SJeremy L Thompson CeedIntMax(P_1d, Q_1d))); 523f6af633fSnbeams 524f6af633fSnbeams // Kernel setup 525f6af633fSnbeams switch (dim) { 526f6af633fSnbeams case 1: 5272b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp)); 5282b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr)); 5292b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad)); 5302b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr)); 5312b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight)); 532f6af633fSnbeams break; 533f6af633fSnbeams case 2: 5342b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp)); 5352b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr)); 5362b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad)); 5372b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr)); 5382b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight)); 539f6af633fSnbeams break; 540f6af633fSnbeams case 3: 5412b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp)); 5422b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr)); 5432b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad)); 5442b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr)); 5452b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight)); 546f6af633fSnbeams } 547f6af633fSnbeams 5482b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); 5492b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); 5507f5b9731SStan Tomov 551*38293ee6SJeremy L Thompson // Copy q_ref_1d to the GPU 552*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_ref_1d, Q_1d * sizeof(q_ref_1d[0]))); 553*38293ee6SJeremy L Thompson magma_setvector(Q_1d, sizeof(q_ref_1d[0]), q_ref_1d, 1, impl->d_q_ref_1d, 1, data->queue); 5547f5b9731SStan Tomov 555*38293ee6SJeremy L Thompson // Copy interp_1d to the GPU 556*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0]))); 557*38293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue); 5587f5b9731SStan Tomov 559*38293ee6SJeremy L Thompson // Copy grad_1d to the GPU 560*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0]))); 561*38293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue); 5627f5b9731SStan Tomov 563*38293ee6SJeremy L Thompson // Copy q_weight_1d to the GPU 564*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); 565*38293ee6SJeremy L Thompson magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); 5667f5b9731SStan Tomov 5672b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 5682b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&magma_common_path)); 5692b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&interp_path)); 5702b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&grad_path)); 5712b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&weight_path)); 5722b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 573e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5747f5b9731SStan Tomov } 5757f5b9731SStan Tomov 5767f5b9731SStan Tomov #ifdef __cplusplus 5777f5b9731SStan Tomov CEED_INTERN "C" 5787f5b9731SStan Tomov #endif 5792b730f8bSJeremy L Thompson int 580*38293ee6SJeremy L Thompson CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 581*38293ee6SJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 582*38293ee6SJeremy L Thompson Ceed ceed, ceed_delegate; 583e0582403Sabdelfattah83 Ceed_Magma *data; 584*38293ee6SJeremy L Thompson char *magma_common_path, *interp_path, *grad_path, *basis_kernel_source; 585*38293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 586*38293ee6SJeremy L Thompson 587*38293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5882b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 589023b8a51Sabdelfattah83 magma_int_t arch = magma_getdevice_arch(); 590*38293ee6SJeremy L Thompson 5912b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 592023b8a51Sabdelfattah83 // Compile kernels 593023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path)); 59423d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 595023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); 596023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path)); 597023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source)); 598023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path)); 599023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); 600023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path)); 601023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); 602023b8a51Sabdelfattah83 603023b8a51Sabdelfattah83 // tuning parameters for nb 604023b8a51Sabdelfattah83 CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 605023b8a51Sabdelfattah83 CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 606023b8a51Sabdelfattah83 CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 607023b8a51Sabdelfattah83 CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 608*38293ee6SJeremy L Thompson CeedInt P = num_dof, Q = num_qpts; 609*38293ee6SJeremy L Thompson CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; 610*38293ee6SJeremy L Thompson 611023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 612*38293ee6SJeremy L Thompson nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, n_array[in], Q); 613*38293ee6SJeremy L Thompson nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, n_array[in], Q); 614*38293ee6SJeremy L Thompson nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, n_array[in], Q); 615*38293ee6SJeremy L Thompson nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, n_array[in], Q); 616023b8a51Sabdelfattah83 } 617023b8a51Sabdelfattah83 618023b8a51Sabdelfattah83 // compile 619*38293ee6SJeremy L Thompson CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 620023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 621*38293ee6SJeremy L Thompson CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", 622*38293ee6SJeremy L Thompson nb_interp_n[in], "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in])); 623023b8a51Sabdelfattah83 } 624023b8a51Sabdelfattah83 625023b8a51Sabdelfattah83 // get kernels 626023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 627023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in])); 628023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in])); 629023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in])); 630023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in])); 631023b8a51Sabdelfattah83 } 632023b8a51Sabdelfattah83 633023b8a51Sabdelfattah83 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 634023b8a51Sabdelfattah83 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 635868539c2SNatalie Beams 636*38293ee6SJeremy L Thompson // Copy q_ref to the GPU 637*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_ref, num_qpts * sizeof(q_ref[0]))); 638*38293ee6SJeremy L Thompson magma_setvector(num_qpts, sizeof(q_ref[0]), q_ref, 1, impl->d_q_ref, 1, data->queue); 639868539c2SNatalie Beams 640868539c2SNatalie Beams // Copy interp to the GPU 641*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_dof * sizeof(interp[0]))); 642*38293ee6SJeremy L Thompson magma_setvector(num_qpts * num_dof, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 643868539c2SNatalie Beams 644868539c2SNatalie Beams // Copy grad to the GPU 645*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_dof * dim * sizeof(grad[0]))); 646*38293ee6SJeremy L Thompson magma_setvector(num_qpts * num_dof * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); 647868539c2SNatalie Beams 648*38293ee6SJeremy L Thompson // Copy q_weight to the GPU 649*38293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 650*38293ee6SJeremy L Thompson magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 651868539c2SNatalie Beams 652023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisSetData(basis, impl)); 653023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&magma_common_path)); 654023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&interp_path)); 655023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&grad_path)); 656023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&basis_kernel_source)); 657e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6587f5b9731SStan Tomov } 659