13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 849aac155SJeremy L Thompson #include <ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 122b730f8bSJeremy L Thompson 13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 14f6af633fSnbeams #include "../hip/ceed-hip-common.h" 15f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 16f6af633fSnbeams #else 17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 19f6af633fSnbeams #endif 2000fb7a04SSebastian Grimberg #include "ceed-magma-common.h" 2100fb7a04SSebastian Grimberg #include "ceed-magma.h" 227f5b9731SStan Tomov 23940a72f1SSebastian Grimberg #include "ceed-magma-gemm-nontensor.h" 24940a72f1SSebastian Grimberg #include "ceed-magma-gemm-selector.h" 25940a72f1SSebastian Grimberg 26940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 27940a72f1SSebastian Grimberg // Basis apply - tensor 28940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 29940a72f1SSebastian Grimberg static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { 307f5b9731SStan Tomov Ceed ceed; 31e0582403Sabdelfattah83 Ceed_Magma *data; 32940a72f1SSebastian Grimberg CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q; 33940a72f1SSebastian Grimberg const CeedScalar *d_u; 34940a72f1SSebastian Grimberg CeedScalar *d_v; 3538293ee6SJeremy L Thompson CeedBasis_Magma *impl; 3638293ee6SJeremy L Thompson 3738293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 38940a72f1SSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 39940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 4038293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 4138293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 42940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 4338293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 4438293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 45940a72f1SSebastian Grimberg P = P_1d; 46940a72f1SSebastian Grimberg Q = Q_1d; 4738293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 4838293ee6SJeremy L Thompson P = Q_1d; 4938293ee6SJeremy L Thompson Q = P_1d; 507f5b9731SStan Tomov } 517f5b9731SStan Tomov 52940a72f1SSebastian Grimberg // Read vectors 53940a72f1SSebastian Grimberg if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 54940a72f1SSebastian Grimberg else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 55940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 56940a72f1SSebastian Grimberg 57940a72f1SSebastian Grimberg // Apply basis operation 58940a72f1SSebastian Grimberg switch (e_mode) { 59940a72f1SSebastian Grimberg case CEED_EVAL_INTERP: { 607f5b9731SStan Tomov // Define element sizes for dofs/quad 6138293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 6238293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 637f5b9731SStan Tomov 647f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 65868539c2SNatalie Beams // component component 66868539c2SNatalie Beams // elem elem 677f5b9731SStan Tomov // node node 687f5b9731SStan Tomov 697f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 70940a72f1SSebastian Grimberg // Input (d_u) is E-vector, output (d_v) is Q-vector 717f5b9731SStan Tomov 727f5b9731SStan Tomov // Element strides 7338293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 7438293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 757f5b9731SStan Tomov // Component strides 7638293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 7738293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 7838293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 79940a72f1SSebastian Grimberg // Input (d_u) is Q-vector, output (d_v) is E-vector 807f5b9731SStan Tomov // Element strides 8138293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 8238293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 837f5b9731SStan Tomov // Component strides 8438293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 8538293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 867f5b9731SStan Tomov } 8738293ee6SJeremy L Thompson CeedInt num_threads = 1; 8838293ee6SJeremy L Thompson CeedInt num_t_col = 1; 8938293ee6SJeremy L Thompson CeedInt shared_mem = 0; 9038293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 91f6af633fSnbeams 92f6af633fSnbeams switch (dim) { 93f6af633fSnbeams case 1: 9438293ee6SJeremy L Thompson num_threads = max_P_Q; 9538293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 9638293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 9738293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 98f6af633fSnbeams break; 99f6af633fSnbeams case 2: 10038293ee6SJeremy L Thompson num_threads = max_P_Q; 10138293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 10238293ee6SJeremy L Thompson shared_mem += P * Q * sizeof(CeedScalar); // for sT 103940a72f1SSebastian Grimberg // for reforming rU we need P x P, and for the intermediate output we need P x Q 104940a72f1SSebastian Grimberg shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar)); 105f6af633fSnbeams break; 106f6af633fSnbeams case 3: 10738293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 10838293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 10938293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); // for sT 110940a72f1SSebastian Grimberg // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2) 111940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q)); 112940a72f1SSebastian Grimberg break; 113f6af633fSnbeams } 114940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 115940a72f1SSebastian Grimberg void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem}; 116f6af633fSnbeams 11738293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 118940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); 119f6af633fSnbeams } else { 120940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args)); 121f6af633fSnbeams } 1222b730f8bSJeremy L Thompson } break; 1233513a710Sjeremylt case CEED_EVAL_GRAD: { 1247f5b9731SStan Tomov // Define element sizes for dofs/quad 12538293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 12638293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 1277f5b9731SStan Tomov 128940a72f1SSebastian Grimberg // In CEED_NOTRANSPOSE mode: 129940a72f1SSebastian Grimberg // d_u is (P^dim x nc), column-major layout (nc = num_comp) 130940a72f1SSebastian Grimberg // d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp) 131940a72f1SSebastian Grimberg // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched. 132940a72f1SSebastian Grimberg 1337f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1347f5b9731SStan Tomov // dim 135868539c2SNatalie Beams // component component 136868539c2SNatalie Beams // elem elem 1377f5b9731SStan Tomov // node node 1387f5b9731SStan Tomov 1397f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 140940a72f1SSebastian Grimberg // Input (d_u) is E-vector, output (d_v) is Q-vector 1417f5b9731SStan Tomov 1427f5b9731SStan Tomov // Element strides 14338293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 14438293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 1457f5b9731SStan Tomov // Component strides 14638293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 14738293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 1487f5b9731SStan Tomov // Dimension strides 14938293ee6SJeremy L Thompson CeedInt u_dim_stride = 0; 15038293ee6SJeremy L Thompson CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp; 15138293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 152940a72f1SSebastian Grimberg // Input (d_u) is Q-vector, output (d_v) is E-vector 1537f5b9731SStan Tomov // Element strides 15438293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 15538293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 1567f5b9731SStan Tomov // Component strides 15738293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 15838293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 1597f5b9731SStan Tomov // Dimension strides 16038293ee6SJeremy L Thompson v_dim_stride = 0; 16138293ee6SJeremy L Thompson u_dim_stride = num_elem * elem_qpts_size * num_comp; 1627f5b9731SStan Tomov } 16338293ee6SJeremy L Thompson CeedInt num_threads = 1; 16438293ee6SJeremy L Thompson CeedInt num_t_col = 1; 16538293ee6SJeremy L Thompson CeedInt shared_mem = 0; 16638293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 167f6af633fSnbeams 168f6af633fSnbeams switch (dim) { 169f6af633fSnbeams case 1: 17038293ee6SJeremy L Thompson num_threads = max_P_Q; 17138293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 17238293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 17338293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 174f6af633fSnbeams break; 175f6af633fSnbeams case 2: 17638293ee6SJeremy L Thompson num_threads = max_P_Q; 17738293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 17838293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 179940a72f1SSebastian Grimberg // for reforming rU we need P x P, and for the intermediate output we need P x Q 180940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q); 181f6af633fSnbeams break; 182f6af633fSnbeams case 3: 18338293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 18438293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 18538293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 186940a72f1SSebastian Grimberg // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2) 187940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q)); 188940a72f1SSebastian Grimberg break; 189f6af633fSnbeams } 190940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 191940a72f1SSebastian Grimberg void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u, &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v, 19238293ee6SJeremy L Thompson &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; 193f6af633fSnbeams 19438293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 195940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); 196f6af633fSnbeams } else { 197940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args)); 198f6af633fSnbeams } 1992b730f8bSJeremy L Thompson } break; 2003513a710Sjeremylt case CEED_EVAL_WEIGHT: { 201940a72f1SSebastian Grimberg CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 20238293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(Q, dim); 20338293ee6SJeremy L Thompson CeedInt num_threads = 1; 20438293ee6SJeremy L Thompson CeedInt num_t_col = 1; 20538293ee6SJeremy L Thompson CeedInt shared_mem = 0; 206f6af633fSnbeams 207f6af633fSnbeams switch (dim) { 208f6af633fSnbeams case 1: 20938293ee6SJeremy L Thompson num_threads = Q; 21038293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 21138293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 21238293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * Q; // for output 213f6af633fSnbeams break; 214f6af633fSnbeams case 2: 21538293ee6SJeremy L Thompson num_threads = Q; 21638293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 21738293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 218f6af633fSnbeams break; 219f6af633fSnbeams case 3: 22038293ee6SJeremy L Thompson num_threads = Q * Q; 22138293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 22238293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 223940a72f1SSebastian Grimberg break; 224f6af633fSnbeams } 225940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 226940a72f1SSebastian Grimberg void *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem}; 227f6af633fSnbeams 228940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args)); 2292b730f8bSJeremy L Thompson } break; 2303513a710Sjeremylt // LCOV_EXCL_START 2313513a710Sjeremylt case CEED_EVAL_DIV: 232e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2333513a710Sjeremylt case CEED_EVAL_CURL: 234e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2353513a710Sjeremylt case CEED_EVAL_NONE: 2362b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 2373513a710Sjeremylt // LCOV_EXCL_STOP 2383513a710Sjeremylt } 2397f5b9731SStan Tomov 240940a72f1SSebastian Grimberg // Must sync to ensure completeness 241e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 242e0582403Sabdelfattah83 243940a72f1SSebastian Grimberg // Restore vectors 24438293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 245940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 2467f5b9731SStan Tomov } 247940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 248e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2497f5b9731SStan Tomov } 2507f5b9731SStan Tomov 251940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 252940a72f1SSebastian Grimberg // Basis apply - non-tensor 253940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 254940a72f1SSebastian Grimberg static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, 255940a72f1SSebastian Grimberg CeedVector v) { 256868539c2SNatalie Beams Ceed ceed; 257e0582403Sabdelfattah83 Ceed_Magma *data; 258*7251047cSSebastian Grimberg CeedInt num_comp, num_nodes, num_qpts, P, Q, N; 259*7251047cSSebastian Grimberg const CeedScalar *d_u; 260940a72f1SSebastian Grimberg CeedScalar *d_v; 26138293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 26238293ee6SJeremy L Thompson 26338293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 26438293ee6SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 265940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 26638293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 267940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 26838293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 269940a72f1SSebastian Grimberg P = num_nodes; 270940a72f1SSebastian Grimberg Q = num_qpts; 271940a72f1SSebastian Grimberg N = num_elem * num_comp; 27238293ee6SJeremy L Thompson 273940a72f1SSebastian Grimberg // Read vectors 274940a72f1SSebastian Grimberg if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 27538293ee6SJeremy L Thompson else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 276940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 277868539c2SNatalie Beams 278*7251047cSSebastian Grimberg // Compile kernels for N as needed 279*7251047cSSebastian Grimberg CeedInt iN = 0; 280*7251047cSSebastian Grimberg if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q && (e_mode != CEED_EVAL_WEIGHT || !impl->Weight)) { 281940a72f1SSebastian Grimberg CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; 282*7251047cSSebastian Grimberg CeedInt diff = abs(n_array[iN] - N), idiff; 28338293ee6SJeremy L Thompson 284023b8a51Sabdelfattah83 for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 285940a72f1SSebastian Grimberg idiff = abs(n_array[in] - N); 286023b8a51Sabdelfattah83 if (idiff < diff) { 287023b8a51Sabdelfattah83 iN = in; 288023b8a51Sabdelfattah83 diff = idiff; 289868539c2SNatalie Beams } 29080a9ef05SNatalie Beams } 29180a9ef05SNatalie Beams 292940a72f1SSebastian Grimberg if (!impl->NB_interp[iN]) { 2939d15e85bSSebastian Grimberg CeedFESpace fe_space; 2949d15e85bSSebastian Grimberg CeedInt q_comp_interp, q_comp_deriv; 295940a72f1SSebastian Grimberg Ceed ceed_delegate; 296*7251047cSSebastian Grimberg char *basis_kernel_path, *weight_kernel_path, *basis_kernel_source; 297940a72f1SSebastian Grimberg magma_int_t arch = magma_getdevice_arch(); 29880a9ef05SNatalie Beams 299940a72f1SSebastian Grimberg // Tuning parameters for NB 3009d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); 3019d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 3029d15e85bSSebastian Grimberg switch (fe_space) { 3039d15e85bSSebastian Grimberg case CEED_FE_SPACE_H1: 3049d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); 3059d15e85bSSebastian Grimberg break; 3069d15e85bSSebastian Grimberg case CEED_FE_SPACE_HDIV: 3079d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); 3089d15e85bSSebastian Grimberg break; 3099d15e85bSSebastian Grimberg case CEED_FE_SPACE_HCURL: 3109d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); 3119d15e85bSSebastian Grimberg break; 3129d15e85bSSebastian Grimberg } 3139d15e85bSSebastian Grimberg impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); 3149d15e85bSSebastian Grimberg impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); 3159d15e85bSSebastian Grimberg impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); 3169d15e85bSSebastian Grimberg impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); 317023b8a51Sabdelfattah83 318940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 319940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 320023b8a51Sabdelfattah83 321940a72f1SSebastian Grimberg // Compile kernels 3229d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); 323940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 3249d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 325*7251047cSSebastian Grimberg if (!impl->Weight) { 326*7251047cSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 327*7251047cSSebastian Grimberg CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 328*7251047cSSebastian Grimberg } 329940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 330*7251047cSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, 3319d15e85bSSebastian Grimberg "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], 3329d15e85bSSebastian Grimberg "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", 3339d15e85bSSebastian Grimberg impl->NB_deriv_t[iN])); 334*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); 335*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); 336*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); 337*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); 338*7251047cSSebastian Grimberg if (!impl->Weight) { 339*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); 340*7251047cSSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 341*7251047cSSebastian Grimberg } 3429d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 343940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 344940a72f1SSebastian Grimberg } 345*7251047cSSebastian Grimberg } 346*7251047cSSebastian Grimberg 347*7251047cSSebastian Grimberg // Apply basis operation 348*7251047cSSebastian Grimberg if (e_mode != CEED_EVAL_WEIGHT) { 349*7251047cSSebastian Grimberg const CeedScalar *d_b = NULL; 350*7251047cSSebastian Grimberg CeedInt q_comp, NB, M, K; 351*7251047cSSebastian Grimberg CeedMagmaFunction Kernel; 352*7251047cSSebastian Grimberg 353*7251047cSSebastian Grimberg switch (e_mode) { 354*7251047cSSebastian Grimberg case CEED_EVAL_INTERP: 355*7251047cSSebastian Grimberg d_b = impl->d_interp; 356*7251047cSSebastian Grimberg break; 357*7251047cSSebastian Grimberg case CEED_EVAL_GRAD: 358*7251047cSSebastian Grimberg d_b = impl->d_grad; 359*7251047cSSebastian Grimberg break; 360*7251047cSSebastian Grimberg case CEED_EVAL_DIV: 361*7251047cSSebastian Grimberg d_b = impl->d_div; 362*7251047cSSebastian Grimberg break; 363*7251047cSSebastian Grimberg case CEED_EVAL_CURL: 364*7251047cSSebastian Grimberg d_b = impl->d_curl; 365*7251047cSSebastian Grimberg break; 366*7251047cSSebastian Grimberg // LCOV_EXCL_START 367*7251047cSSebastian Grimberg case CEED_EVAL_WEIGHT: 368*7251047cSSebastian Grimberg return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT does not make sense in this context"); 369*7251047cSSebastian Grimberg case CEED_EVAL_NONE: 370*7251047cSSebastian Grimberg return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 371*7251047cSSebastian Grimberg // LCOV_EXCL_STOP 372*7251047cSSebastian Grimberg } 373*7251047cSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); 374*7251047cSSebastian Grimberg M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P; 375*7251047cSSebastian Grimberg 376*7251047cSSebastian Grimberg if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 3779d15e85bSSebastian Grimberg if (e_mode == CEED_EVAL_INTERP) { 3789d15e85bSSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 3799d15e85bSSebastian Grimberg Kernel = impl->InterpTranspose[iN]; 3809d15e85bSSebastian Grimberg NB = impl->NB_interp_t[iN]; 3819d15e85bSSebastian Grimberg } else { 3829d15e85bSSebastian Grimberg Kernel = impl->Interp[iN]; 3839d15e85bSSebastian Grimberg NB = impl->NB_interp[iN]; 3849d15e85bSSebastian Grimberg } 3859d15e85bSSebastian Grimberg } else { 3869d15e85bSSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 3879d15e85bSSebastian Grimberg Kernel = impl->DerivTranspose[iN]; 3889d15e85bSSebastian Grimberg NB = impl->NB_deriv_t[iN]; 3899d15e85bSSebastian Grimberg } else { 3909d15e85bSSebastian Grimberg Kernel = impl->Deriv[iN]; 3919d15e85bSSebastian Grimberg NB = impl->NB_deriv[iN]; 3929d15e85bSSebastian Grimberg } 3939d15e85bSSebastian Grimberg } 394940a72f1SSebastian Grimberg CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D); 3959d15e85bSSebastian Grimberg CeedInt grid = CeedDivUpInt(N, num_t_col * NB); 396833aa127SSebastian Grimberg CeedInt shared_mem_A = P * Q * sizeof(CeedScalar); 397940a72f1SSebastian Grimberg CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar); 398833aa127SSebastian Grimberg CeedInt shared_mem = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B); 3999d15e85bSSebastian Grimberg void *args[] = {&N, &d_b, &d_u, &d_v}; 400940a72f1SSebastian Grimberg 4019d15e85bSSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args)); 4029d15e85bSSebastian Grimberg } else { 4039d15e85bSSebastian Grimberg for (CeedInt d = 0; d < q_comp; d++) { 40438293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 405940a72f1SSebastian Grimberg const CeedScalar beta = (d > 0) ? 1.0 : 0.0; 4069d15e85bSSebastian Grimberg magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue); 407940a72f1SSebastian Grimberg } else { 4089d15e85bSSebastian Grimberg magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue); 409940a72f1SSebastian Grimberg } 410940a72f1SSebastian Grimberg } 411940a72f1SSebastian Grimberg } 412940a72f1SSebastian Grimberg } else { 413940a72f1SSebastian Grimberg CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 414940a72f1SSebastian Grimberg CeedInt num_t_col = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D); 415940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 416940a72f1SSebastian Grimberg CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar); 4179d15e85bSSebastian Grimberg void *args[] = {&num_elem, &impl->d_q_weight, &d_v}; 418868539c2SNatalie Beams 419940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args)); 420940a72f1SSebastian Grimberg } 421940a72f1SSebastian Grimberg 422940a72f1SSebastian Grimberg // Must sync to ensure completeness 423e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 424e0582403Sabdelfattah83 425940a72f1SSebastian Grimberg // Restore vectors 42638293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 427940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 428868539c2SNatalie Beams } 429940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 430e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 431868539c2SNatalie Beams } 432868539c2SNatalie Beams 433940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 434940a72f1SSebastian Grimberg // Destroy tensor basis 435940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 436940a72f1SSebastian Grimberg static int CeedBasisDestroy_Magma(CeedBasis basis) { 437f6af633fSnbeams Ceed ceed; 43838293ee6SJeremy L Thompson CeedBasis_Magma *impl; 43938293ee6SJeremy L Thompson 4402b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 441940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 442e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 4432b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 444f6af633fSnbeams #else 4452b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->module)); 446f6af633fSnbeams #endif 447940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_interp_1d)); 448940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_grad_1d)); 449940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_q_weight_1d)); 4502b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 451e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4527f5b9731SStan Tomov } 4537f5b9731SStan Tomov 454940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 455940a72f1SSebastian Grimberg // Destroy non-tensor basis 456940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 457940a72f1SSebastian Grimberg static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 458023b8a51Sabdelfattah83 Ceed ceed; 45938293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 46038293ee6SJeremy L Thompson 461940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 46238293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 463940a72f1SSebastian Grimberg for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 464*7251047cSSebastian Grimberg if (impl->module[in]) { 465940a72f1SSebastian Grimberg #ifdef CEED_MAGMA_USE_HIP 466*7251047cSSebastian Grimberg CeedCallHip(ceed, hipModuleUnload(impl->module[in])); 467940a72f1SSebastian Grimberg #else 468*7251047cSSebastian Grimberg CeedCallCuda(ceed, cuModuleUnload(impl->module[in])); 469940a72f1SSebastian Grimberg #endif 470940a72f1SSebastian Grimberg } 471940a72f1SSebastian Grimberg } 47238293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_interp)); 47338293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_grad)); 4749d15e85bSSebastian Grimberg CeedCallBackend(magma_free(impl->d_div)); 4759d15e85bSSebastian Grimberg CeedCallBackend(magma_free(impl->d_curl)); 47638293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_weight)); 4772b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 478e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 479868539c2SNatalie Beams } 480868539c2SNatalie Beams 481940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 482940a72f1SSebastian Grimberg // Create tensor 483940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 484940a72f1SSebastian Grimberg int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 48538293ee6SJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 48638293ee6SJeremy L Thompson Ceed ceed, ceed_delegate; 48738293ee6SJeremy L Thompson Ceed_Magma *data; 488940a72f1SSebastian Grimberg char *interp_kernel_path, *grad_kernel_path, *weight_kernel_path, *basis_kernel_source; 489940a72f1SSebastian Grimberg CeedInt num_comp; 4907f5b9731SStan Tomov CeedBasis_Magma *impl; 49138293ee6SJeremy L Thompson 4922b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4932b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 494940a72f1SSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 495e0582403Sabdelfattah83 496940a72f1SSebastian Grimberg // Copy basis data to GPU 497940a72f1SSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); 498940a72f1SSebastian Grimberg magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); 49938293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0]))); 50038293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue); 50138293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0]))); 50238293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue); 5037f5b9731SStan Tomov 504940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 505940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 506940a72f1SSebastian Grimberg 507940a72f1SSebastian Grimberg // Compile kernels 508940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 509940a72f1SSebastian Grimberg { 510940a72f1SSebastian Grimberg char *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp"; 511940a72f1SSebastian Grimberg CeedInt interp_kernel_name_len = strlen(interp_kernel_name_base) + 6; 512940a72f1SSebastian Grimberg char interp_kernel_name[interp_kernel_name_len]; 513940a72f1SSebastian Grimberg 514940a72f1SSebastian Grimberg snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim); 515940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path)); 516940a72f1SSebastian Grimberg } 517940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 518940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source)); 519940a72f1SSebastian Grimberg { 520940a72f1SSebastian Grimberg char *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad"; 521940a72f1SSebastian Grimberg CeedInt grad_kernel_name_len = strlen(grad_kernel_name_base) + 6; 522940a72f1SSebastian Grimberg char grad_kernel_name[grad_kernel_name_len]; 523940a72f1SSebastian Grimberg 524940a72f1SSebastian Grimberg snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim); 525940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path)); 526940a72f1SSebastian Grimberg } 527940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source)); 528940a72f1SSebastian Grimberg { 529940a72f1SSebastian Grimberg char *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight"; 530940a72f1SSebastian Grimberg CeedInt weight_kernel_name_len = strlen(weight_kernel_name_base) + 6; 531940a72f1SSebastian Grimberg char weight_kernel_name[weight_kernel_name_len]; 532940a72f1SSebastian Grimberg 533940a72f1SSebastian Grimberg snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim); 534940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path)); 535940a72f1SSebastian Grimberg } 536940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 537940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 538940a72f1SSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P", 539940a72f1SSebastian Grimberg P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d))); 540940a72f1SSebastian Grimberg switch (dim) { 541940a72f1SSebastian Grimberg case 1: 542940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp)); 543940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose)); 544940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad)); 545940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose)); 546940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight)); 547940a72f1SSebastian Grimberg break; 548940a72f1SSebastian Grimberg case 2: 549940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp)); 550940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose)); 551940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad)); 552940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose)); 553940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight)); 554940a72f1SSebastian Grimberg break; 555940a72f1SSebastian Grimberg case 3: 556940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp)); 557940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose)); 558940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad)); 559940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose)); 560940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight)); 561940a72f1SSebastian Grimberg break; 562940a72f1SSebastian Grimberg } 563940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&interp_kernel_path)); 564940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&grad_kernel_path)); 565940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 566940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 5677f5b9731SStan Tomov 5682b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 569940a72f1SSebastian Grimberg 570940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); 571940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); 572e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5737f5b9731SStan Tomov } 5747f5b9731SStan Tomov 575940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 576940a72f1SSebastian Grimberg // Create non-tensor H^1 577940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 578940a72f1SSebastian Grimberg int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 57938293ee6SJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 580*7251047cSSebastian Grimberg Ceed ceed; 581e0582403Sabdelfattah83 Ceed_Magma *data; 58238293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 58338293ee6SJeremy L Thompson 58438293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5852b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 5862b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 587023b8a51Sabdelfattah83 588940a72f1SSebastian Grimberg // Copy basis data to GPU 58938293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 59038293ee6SJeremy L Thompson magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 5919d15e85bSSebastian Grimberg if (interp) { 5929d15e85bSSebastian Grimberg CeedInt q_comp_interp; 5939d15e85bSSebastian Grimberg 5949d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 5959d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 5969d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 5979d15e85bSSebastian Grimberg } 5989d15e85bSSebastian Grimberg if (grad) { 5999d15e85bSSebastian Grimberg CeedInt q_comp_grad; 6009d15e85bSSebastian Grimberg 6019d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 6029d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0]))); 6039d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); 6049d15e85bSSebastian Grimberg } 6059d15e85bSSebastian Grimberg 606*7251047cSSebastian Grimberg // Compile the weight kernel if it won't be compiled later on 607*7251047cSSebastian Grimberg if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 608*7251047cSSebastian Grimberg Ceed ceed_delegate; 609*7251047cSSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 610*7251047cSSebastian Grimberg 6119d15e85bSSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 6129d15e85bSSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 6139d15e85bSSebastian Grimberg 6149d15e85bSSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 6159d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 6169d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 6179d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 6189d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 619*7251047cSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); 620*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); 6219d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 6229d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 623*7251047cSSebastian Grimberg } 6249d15e85bSSebastian Grimberg 6259d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, impl)); 6269d15e85bSSebastian Grimberg 6279d15e85bSSebastian Grimberg // Register backend functions 6289d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 6299d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 6309d15e85bSSebastian Grimberg return CEED_ERROR_SUCCESS; 6319d15e85bSSebastian Grimberg } 6329d15e85bSSebastian Grimberg 6339d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 6349d15e85bSSebastian Grimberg // Create non-tensor H(div) 6359d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 6369d15e85bSSebastian Grimberg int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 6379d15e85bSSebastian Grimberg const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 638*7251047cSSebastian Grimberg Ceed ceed; 6399d15e85bSSebastian Grimberg Ceed_Magma *data; 6409d15e85bSSebastian Grimberg CeedBasisNonTensor_Magma *impl; 6419d15e85bSSebastian Grimberg 6429d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 6439d15e85bSSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 6449d15e85bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 6459d15e85bSSebastian Grimberg 6469d15e85bSSebastian Grimberg // Copy basis data to GPU 6479d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 6489d15e85bSSebastian Grimberg magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 6499d15e85bSSebastian Grimberg if (interp) { 6509d15e85bSSebastian Grimberg CeedInt q_comp_interp; 6519d15e85bSSebastian Grimberg 6529d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 6539d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 6549d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 6559d15e85bSSebastian Grimberg } 6569d15e85bSSebastian Grimberg if (div) { 6579d15e85bSSebastian Grimberg CeedInt q_comp_div; 6589d15e85bSSebastian Grimberg 6599d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 6609d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0]))); 6619d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue); 6629d15e85bSSebastian Grimberg } 6639d15e85bSSebastian Grimberg 664*7251047cSSebastian Grimberg // Compile the weight kernel if it won't be compiled later on 665*7251047cSSebastian Grimberg if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 666*7251047cSSebastian Grimberg Ceed ceed_delegate; 667*7251047cSSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 668*7251047cSSebastian Grimberg 6699d15e85bSSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 6709d15e85bSSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 6719d15e85bSSebastian Grimberg 6729d15e85bSSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 6739d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 6749d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 6759d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 6769d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 677*7251047cSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); 678*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); 6799d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 6809d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 681*7251047cSSebastian Grimberg } 6829d15e85bSSebastian Grimberg 6839d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, impl)); 6849d15e85bSSebastian Grimberg 6859d15e85bSSebastian Grimberg // Register backend functions 6869d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 6879d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 6889d15e85bSSebastian Grimberg return CEED_ERROR_SUCCESS; 6899d15e85bSSebastian Grimberg } 6909d15e85bSSebastian Grimberg 6919d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 6929d15e85bSSebastian Grimberg // Create non-tensor H(curl) 6939d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 6949d15e85bSSebastian Grimberg int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 6959d15e85bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 696*7251047cSSebastian Grimberg Ceed ceed; 6979d15e85bSSebastian Grimberg Ceed_Magma *data; 6989d15e85bSSebastian Grimberg CeedBasisNonTensor_Magma *impl; 6999d15e85bSSebastian Grimberg 7009d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 7019d15e85bSSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 7029d15e85bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 7039d15e85bSSebastian Grimberg 7049d15e85bSSebastian Grimberg // Copy basis data to GPU 7059d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 7069d15e85bSSebastian Grimberg magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 7079d15e85bSSebastian Grimberg if (interp) { 7089d15e85bSSebastian Grimberg CeedInt q_comp_interp; 7099d15e85bSSebastian Grimberg 7109d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 7119d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 7129d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 7139d15e85bSSebastian Grimberg } 7149d15e85bSSebastian Grimberg if (curl) { 7159d15e85bSSebastian Grimberg CeedInt q_comp_curl; 7169d15e85bSSebastian Grimberg 7179d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 7189d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_qpts * num_nodes * q_comp_curl * sizeof(curl[0]))); 7199d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue); 7209d15e85bSSebastian Grimberg } 721940a72f1SSebastian Grimberg 722*7251047cSSebastian Grimberg // Compile the weight kernel if it won't be compiled later on 723*7251047cSSebastian Grimberg if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 724*7251047cSSebastian Grimberg Ceed ceed_delegate; 725*7251047cSSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 726*7251047cSSebastian Grimberg 727940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 728940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 729940a72f1SSebastian Grimberg 730940a72f1SSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 731940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 732940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 733940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 734940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 735*7251047cSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); 736*7251047cSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); 737940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 738940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 739*7251047cSSebastian Grimberg } 740868539c2SNatalie Beams 741023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisSetData(basis, impl)); 742940a72f1SSebastian Grimberg 743940a72f1SSebastian Grimberg // Register backend functions 744940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 745940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 746e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7477f5b9731SStan Tomov } 748940a72f1SSebastian Grimberg 749940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 750