13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 849aac155SJeremy L Thompson #include <ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 122b730f8bSJeremy L Thompson 13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 14f6af633fSnbeams #include "../hip/ceed-hip-common.h" 15f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 16f6af633fSnbeams #else 17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 19f6af633fSnbeams #endif 2000fb7a04SSebastian Grimberg #include "ceed-magma-common.h" 2100fb7a04SSebastian Grimberg #include "ceed-magma.h" 227f5b9731SStan Tomov 23940a72f1SSebastian Grimberg #include "ceed-magma-gemm-nontensor.h" 24940a72f1SSebastian Grimberg #include "ceed-magma-gemm-selector.h" 25940a72f1SSebastian Grimberg 26940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 27940a72f1SSebastian Grimberg // Basis apply - tensor 28940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 29940a72f1SSebastian Grimberg static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { 307f5b9731SStan Tomov Ceed ceed; 31e0582403Sabdelfattah83 Ceed_Magma *data; 32940a72f1SSebastian Grimberg CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q; 33940a72f1SSebastian Grimberg const CeedScalar *d_u; 34940a72f1SSebastian Grimberg CeedScalar *d_v; 3538293ee6SJeremy L Thompson CeedBasis_Magma *impl; 3638293ee6SJeremy L Thompson 3738293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 38940a72f1SSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 39940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 4038293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 4138293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 42940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 4338293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 4438293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 45940a72f1SSebastian Grimberg P = P_1d; 46940a72f1SSebastian Grimberg Q = Q_1d; 4738293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 4838293ee6SJeremy L Thompson P = Q_1d; 4938293ee6SJeremy L Thompson Q = P_1d; 507f5b9731SStan Tomov } 517f5b9731SStan Tomov 52940a72f1SSebastian Grimberg // Read vectors 53940a72f1SSebastian Grimberg if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 54940a72f1SSebastian Grimberg else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 55940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 56940a72f1SSebastian Grimberg 57940a72f1SSebastian Grimberg // Clear v for transpose operation 58940a72f1SSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 59940a72f1SSebastian Grimberg CeedSize length; 60940a72f1SSebastian Grimberg 61940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetLength(v, &length)); 62940a72f1SSebastian Grimberg if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 63940a72f1SSebastian Grimberg magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue); 64940a72f1SSebastian Grimberg } else { 65940a72f1SSebastian Grimberg magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue); 66940a72f1SSebastian Grimberg } 67940a72f1SSebastian Grimberg ceed_magma_queue_sync(data->queue); 68940a72f1SSebastian Grimberg } 69940a72f1SSebastian Grimberg 70940a72f1SSebastian Grimberg // Apply basis operation 71940a72f1SSebastian Grimberg switch (e_mode) { 72940a72f1SSebastian Grimberg case CEED_EVAL_INTERP: { 737f5b9731SStan Tomov // Define element sizes for dofs/quad 7438293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 7538293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 767f5b9731SStan Tomov 777f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 78868539c2SNatalie Beams // component component 79868539c2SNatalie Beams // elem elem 807f5b9731SStan Tomov // node node 817f5b9731SStan Tomov 827f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 83940a72f1SSebastian Grimberg // Input (d_u) is E-vector, output (d_v) is Q-vector 847f5b9731SStan Tomov 857f5b9731SStan Tomov // Element strides 8638293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 8738293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 887f5b9731SStan Tomov // Component strides 8938293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 9038293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 9138293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 92940a72f1SSebastian Grimberg // Input (d_u) is Q-vector, output (d_v) is E-vector 937f5b9731SStan Tomov // Element strides 9438293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 9538293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 967f5b9731SStan Tomov // Component strides 9738293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 9838293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 997f5b9731SStan Tomov } 10038293ee6SJeremy L Thompson CeedInt num_threads = 1; 10138293ee6SJeremy L Thompson CeedInt num_t_col = 1; 10238293ee6SJeremy L Thompson CeedInt shared_mem = 0; 10338293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 104f6af633fSnbeams 105f6af633fSnbeams switch (dim) { 106f6af633fSnbeams case 1: 10738293ee6SJeremy L Thompson num_threads = max_P_Q; 10838293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 10938293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 11038293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 111f6af633fSnbeams break; 112f6af633fSnbeams case 2: 11338293ee6SJeremy L Thompson num_threads = max_P_Q; 11438293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 11538293ee6SJeremy L Thompson shared_mem += P * Q * sizeof(CeedScalar); // for sT 116940a72f1SSebastian Grimberg // for reforming rU we need P x P, and for the intermediate output we need P x Q 117940a72f1SSebastian Grimberg shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar)); 118f6af633fSnbeams break; 119f6af633fSnbeams case 3: 12038293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 12138293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 12238293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); // for sT 123940a72f1SSebastian Grimberg // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2) 124940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q)); 125940a72f1SSebastian Grimberg break; 126f6af633fSnbeams } 127940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 128940a72f1SSebastian Grimberg void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem}; 129f6af633fSnbeams 13038293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 131940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); 132f6af633fSnbeams } else { 133940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args)); 134f6af633fSnbeams } 1352b730f8bSJeremy L Thompson } break; 1363513a710Sjeremylt case CEED_EVAL_GRAD: { 1377f5b9731SStan Tomov // Define element sizes for dofs/quad 13838293ee6SJeremy L Thompson CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); 13938293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); 1407f5b9731SStan Tomov 141940a72f1SSebastian Grimberg // In CEED_NOTRANSPOSE mode: 142940a72f1SSebastian Grimberg // d_u is (P^dim x nc), column-major layout (nc = num_comp) 143940a72f1SSebastian Grimberg // d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp) 144940a72f1SSebastian Grimberg // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched. 145940a72f1SSebastian Grimberg 1467f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1477f5b9731SStan Tomov // dim 148868539c2SNatalie Beams // component component 149868539c2SNatalie Beams // elem elem 1507f5b9731SStan Tomov // node node 1517f5b9731SStan Tomov 1527f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 153940a72f1SSebastian Grimberg // Input (d_u) is E-vector, output (d_v) is Q-vector 1547f5b9731SStan Tomov 1557f5b9731SStan Tomov // Element strides 15638293ee6SJeremy L Thompson CeedInt u_elem_stride = elem_dofs_size; 15738293ee6SJeremy L Thompson CeedInt v_elem_stride = elem_qpts_size; 1587f5b9731SStan Tomov // Component strides 15938293ee6SJeremy L Thompson CeedInt u_comp_stride = num_elem * elem_dofs_size; 16038293ee6SJeremy L Thompson CeedInt v_comp_stride = num_elem * elem_qpts_size; 1617f5b9731SStan Tomov // Dimension strides 16238293ee6SJeremy L Thompson CeedInt u_dim_stride = 0; 16338293ee6SJeremy L Thompson CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp; 16438293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 165940a72f1SSebastian Grimberg // Input (d_u) is Q-vector, output (d_v) is E-vector 1667f5b9731SStan Tomov // Element strides 16738293ee6SJeremy L Thompson v_elem_stride = elem_dofs_size; 16838293ee6SJeremy L Thompson u_elem_stride = elem_qpts_size; 1697f5b9731SStan Tomov // Component strides 17038293ee6SJeremy L Thompson v_comp_stride = num_elem * elem_dofs_size; 17138293ee6SJeremy L Thompson u_comp_stride = num_elem * elem_qpts_size; 1727f5b9731SStan Tomov // Dimension strides 17338293ee6SJeremy L Thompson v_dim_stride = 0; 17438293ee6SJeremy L Thompson u_dim_stride = num_elem * elem_qpts_size * num_comp; 1757f5b9731SStan Tomov } 17638293ee6SJeremy L Thompson CeedInt num_threads = 1; 17738293ee6SJeremy L Thompson CeedInt num_t_col = 1; 17838293ee6SJeremy L Thompson CeedInt shared_mem = 0; 17938293ee6SJeremy L Thompson CeedInt max_P_Q = CeedIntMax(P, Q); 180f6af633fSnbeams 181f6af633fSnbeams switch (dim) { 182f6af633fSnbeams case 1: 18338293ee6SJeremy L Thompson num_threads = max_P_Q; 18438293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 18538293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); 18638293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * (P * Q); 187f6af633fSnbeams break; 188f6af633fSnbeams case 2: 18938293ee6SJeremy L Thompson num_threads = max_P_Q; 19038293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 19138293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 192940a72f1SSebastian Grimberg // for reforming rU we need P x P, and for the intermediate output we need P x Q 193940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q); 194f6af633fSnbeams break; 195f6af633fSnbeams case 3: 19638293ee6SJeremy L Thompson num_threads = max_P_Q * max_P_Q; 19738293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 19838293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 199940a72f1SSebastian Grimberg // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2) 200940a72f1SSebastian Grimberg shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q)); 201940a72f1SSebastian Grimberg break; 202f6af633fSnbeams } 203940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 204940a72f1SSebastian Grimberg void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u, &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v, 20538293ee6SJeremy L Thompson &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; 206f6af633fSnbeams 20738293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 208940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); 209f6af633fSnbeams } else { 210940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args)); 211f6af633fSnbeams } 2122b730f8bSJeremy L Thompson } break; 2133513a710Sjeremylt case CEED_EVAL_WEIGHT: { 214940a72f1SSebastian Grimberg CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 21538293ee6SJeremy L Thompson CeedInt elem_dofs_size = CeedIntPow(Q, dim); 21638293ee6SJeremy L Thompson CeedInt num_threads = 1; 21738293ee6SJeremy L Thompson CeedInt num_t_col = 1; 21838293ee6SJeremy L Thompson CeedInt shared_mem = 0; 219f6af633fSnbeams 220f6af633fSnbeams switch (dim) { 221f6af633fSnbeams case 1: 22238293ee6SJeremy L Thompson num_threads = Q; 22338293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); 22438293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 22538293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * num_t_col * Q; // for output 226f6af633fSnbeams break; 227f6af633fSnbeams case 2: 22838293ee6SJeremy L Thompson num_threads = Q; 22938293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); 23038293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 231f6af633fSnbeams break; 232f6af633fSnbeams case 3: 23338293ee6SJeremy L Thompson num_threads = Q * Q; 23438293ee6SJeremy L Thompson num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); 23538293ee6SJeremy L Thompson shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d 236940a72f1SSebastian Grimberg break; 237f6af633fSnbeams } 238940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 239940a72f1SSebastian Grimberg void *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem}; 240f6af633fSnbeams 241940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args)); 2422b730f8bSJeremy L Thompson } break; 2433513a710Sjeremylt // LCOV_EXCL_START 2443513a710Sjeremylt case CEED_EVAL_DIV: 245e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2463513a710Sjeremylt case CEED_EVAL_CURL: 247e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2483513a710Sjeremylt case CEED_EVAL_NONE: 2492b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 2503513a710Sjeremylt // LCOV_EXCL_STOP 2513513a710Sjeremylt } 2527f5b9731SStan Tomov 253940a72f1SSebastian Grimberg // Must sync to ensure completeness 254e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 255e0582403Sabdelfattah83 256940a72f1SSebastian Grimberg // Restore vectors 25738293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 258940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 2597f5b9731SStan Tomov } 260940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 261e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2627f5b9731SStan Tomov } 2637f5b9731SStan Tomov 264940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 265940a72f1SSebastian Grimberg // Basis apply - non-tensor 266940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 267940a72f1SSebastian Grimberg static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, 268940a72f1SSebastian Grimberg CeedVector v) { 269868539c2SNatalie Beams Ceed ceed; 270e0582403Sabdelfattah83 Ceed_Magma *data; 271*9d15e85bSSebastian Grimberg CeedInt num_comp, q_comp, num_nodes, num_qpts, P, Q, N; 272940a72f1SSebastian Grimberg const CeedScalar *d_u; 273940a72f1SSebastian Grimberg CeedScalar *d_v; 27438293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 27538293ee6SJeremy L Thompson 27638293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 27738293ee6SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 278940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 27938293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 280*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); 281940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 28238293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 283940a72f1SSebastian Grimberg P = num_nodes; 284940a72f1SSebastian Grimberg Q = num_qpts; 285940a72f1SSebastian Grimberg N = num_elem * num_comp; 28638293ee6SJeremy L Thompson 287940a72f1SSebastian Grimberg // Read vectors 288940a72f1SSebastian Grimberg if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 28938293ee6SJeremy L Thompson else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 290940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 291868539c2SNatalie Beams 292940a72f1SSebastian Grimberg // Clear v for transpose operation 29338293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 2941f9221feSJeremy L Thompson CeedSize length; 29538293ee6SJeremy L Thompson 296940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorGetLength(v, &length)); 29780a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 298940a72f1SSebastian Grimberg magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue); 29980a9ef05SNatalie Beams } else { 300940a72f1SSebastian Grimberg magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue); 30180a9ef05SNatalie Beams } 302e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 303868539c2SNatalie Beams } 30480a9ef05SNatalie Beams 305940a72f1SSebastian Grimberg // Apply basis operation 306940a72f1SSebastian Grimberg if (e_mode != CEED_EVAL_WEIGHT) { 307*9d15e85bSSebastian Grimberg const CeedScalar *d_b = NULL; 308*9d15e85bSSebastian Grimberg switch (e_mode) { 309*9d15e85bSSebastian Grimberg case CEED_EVAL_INTERP: 310*9d15e85bSSebastian Grimberg d_b = impl->d_interp; 311*9d15e85bSSebastian Grimberg break; 312*9d15e85bSSebastian Grimberg case CEED_EVAL_GRAD: 313*9d15e85bSSebastian Grimberg d_b = impl->d_grad; 314*9d15e85bSSebastian Grimberg break; 315*9d15e85bSSebastian Grimberg case CEED_EVAL_DIV: 316*9d15e85bSSebastian Grimberg d_b = impl->d_div; 317*9d15e85bSSebastian Grimberg break; 318*9d15e85bSSebastian Grimberg case CEED_EVAL_CURL: 319*9d15e85bSSebastian Grimberg d_b = impl->d_curl; 320*9d15e85bSSebastian Grimberg break; 321*9d15e85bSSebastian Grimberg // LCOV_EXCL_START 322*9d15e85bSSebastian Grimberg case CEED_EVAL_WEIGHT: 323*9d15e85bSSebastian Grimberg return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT does not make sense in this context"); 324*9d15e85bSSebastian Grimberg case CEED_EVAL_NONE: 325*9d15e85bSSebastian Grimberg return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 326*9d15e85bSSebastian Grimberg // LCOV_EXCL_STOP 327*9d15e85bSSebastian Grimberg } 328*9d15e85bSSebastian Grimberg 329*9d15e85bSSebastian Grimberg // Apply basis operation 330*9d15e85bSSebastian Grimberg if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 331940a72f1SSebastian Grimberg CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; 332940a72f1SSebastian Grimberg CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff; 333940a72f1SSebastian Grimberg CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P; 33438293ee6SJeremy L Thompson 335023b8a51Sabdelfattah83 for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 336940a72f1SSebastian Grimberg idiff = abs(n_array[in] - N); 337023b8a51Sabdelfattah83 if (idiff < diff) { 338023b8a51Sabdelfattah83 iN = in; 339023b8a51Sabdelfattah83 diff = idiff; 340868539c2SNatalie Beams } 34180a9ef05SNatalie Beams } 34280a9ef05SNatalie Beams 343940a72f1SSebastian Grimberg // Compile kernels for N as needed 344940a72f1SSebastian Grimberg if (!impl->NB_interp[iN]) { 345*9d15e85bSSebastian Grimberg CeedFESpace fe_space; 346*9d15e85bSSebastian Grimberg CeedInt q_comp_interp, q_comp_deriv; 347940a72f1SSebastian Grimberg Ceed ceed_delegate; 348*9d15e85bSSebastian Grimberg char *basis_kernel_path, *basis_kernel_source; 349940a72f1SSebastian Grimberg magma_int_t arch = magma_getdevice_arch(); 35080a9ef05SNatalie Beams 351940a72f1SSebastian Grimberg // Tuning parameters for NB 352*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); 353*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 354*9d15e85bSSebastian Grimberg switch (fe_space) { 355*9d15e85bSSebastian Grimberg case CEED_FE_SPACE_H1: 356*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); 357*9d15e85bSSebastian Grimberg break; 358*9d15e85bSSebastian Grimberg case CEED_FE_SPACE_HDIV: 359*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); 360*9d15e85bSSebastian Grimberg break; 361*9d15e85bSSebastian Grimberg case CEED_FE_SPACE_HCURL: 362*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); 363*9d15e85bSSebastian Grimberg break; 364*9d15e85bSSebastian Grimberg } 365*9d15e85bSSebastian Grimberg impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); 366*9d15e85bSSebastian Grimberg impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); 367*9d15e85bSSebastian Grimberg impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); 368*9d15e85bSSebastian Grimberg impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); 369023b8a51Sabdelfattah83 370940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 371940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 372023b8a51Sabdelfattah83 373940a72f1SSebastian Grimberg // Compile kernels 374*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); 375940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 376*9d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 377940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 378*9d15e85bSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, 379*9d15e85bSSebastian Grimberg "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], 380*9d15e85bSSebastian Grimberg "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", 381*9d15e85bSSebastian Grimberg impl->NB_deriv_t[iN])); 382940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); 383940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); 384*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); 385*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); 386*9d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 387940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 388940a72f1SSebastian Grimberg } 389*9d15e85bSSebastian Grimberg CeedMagmaFunction Kernel; 390*9d15e85bSSebastian Grimberg CeedInt NB; 391*9d15e85bSSebastian Grimberg if (e_mode == CEED_EVAL_INTERP) { 392*9d15e85bSSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 393*9d15e85bSSebastian Grimberg Kernel = impl->InterpTranspose[iN]; 394*9d15e85bSSebastian Grimberg NB = impl->NB_interp_t[iN]; 395*9d15e85bSSebastian Grimberg } else { 396*9d15e85bSSebastian Grimberg Kernel = impl->Interp[iN]; 397*9d15e85bSSebastian Grimberg NB = impl->NB_interp[iN]; 398*9d15e85bSSebastian Grimberg } 399*9d15e85bSSebastian Grimberg } else { 400*9d15e85bSSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 401*9d15e85bSSebastian Grimberg Kernel = impl->DerivTranspose[iN]; 402*9d15e85bSSebastian Grimberg NB = impl->NB_deriv_t[iN]; 403*9d15e85bSSebastian Grimberg } else { 404*9d15e85bSSebastian Grimberg Kernel = impl->Deriv[iN]; 405*9d15e85bSSebastian Grimberg NB = impl->NB_deriv[iN]; 406*9d15e85bSSebastian Grimberg } 407*9d15e85bSSebastian Grimberg } 408940a72f1SSebastian Grimberg CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D); 409*9d15e85bSSebastian Grimberg CeedInt grid = CeedDivUpInt(N, num_t_col * NB); 410*9d15e85bSSebastian Grimberg CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : P * Q * sizeof(CeedScalar); 411940a72f1SSebastian Grimberg CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar); 412*9d15e85bSSebastian Grimberg CeedInt shared_mem = (t_mode == CEED_TRANSPOSE || q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B); 413*9d15e85bSSebastian Grimberg void *args[] = {&N, &d_b, &d_u, &d_v}; 414940a72f1SSebastian Grimberg 415*9d15e85bSSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args)); 416*9d15e85bSSebastian Grimberg } else { 417*9d15e85bSSebastian Grimberg for (CeedInt d = 0; d < q_comp; d++) { 41838293ee6SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 419940a72f1SSebastian Grimberg const CeedScalar beta = (d > 0) ? 1.0 : 0.0; 420*9d15e85bSSebastian Grimberg magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue); 421940a72f1SSebastian Grimberg } else { 422*9d15e85bSSebastian Grimberg magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue); 423940a72f1SSebastian Grimberg } 424940a72f1SSebastian Grimberg } 425940a72f1SSebastian Grimberg } 426940a72f1SSebastian Grimberg } else { 427940a72f1SSebastian Grimberg CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 428940a72f1SSebastian Grimberg CeedInt num_t_col = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D); 429940a72f1SSebastian Grimberg CeedInt grid = CeedDivUpInt(num_elem, num_t_col); 430940a72f1SSebastian Grimberg CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar); 431*9d15e85bSSebastian Grimberg void *args[] = {&num_elem, &impl->d_q_weight, &d_v}; 432868539c2SNatalie Beams 433940a72f1SSebastian Grimberg CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args)); 434940a72f1SSebastian Grimberg } 435940a72f1SSebastian Grimberg 436940a72f1SSebastian Grimberg // Must sync to ensure completeness 437e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 438e0582403Sabdelfattah83 439940a72f1SSebastian Grimberg // Restore vectors 44038293ee6SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 441940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 442868539c2SNatalie Beams } 443940a72f1SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 444e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 445868539c2SNatalie Beams } 446868539c2SNatalie Beams 447940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 448940a72f1SSebastian Grimberg // Destroy tensor basis 449940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 450940a72f1SSebastian Grimberg static int CeedBasisDestroy_Magma(CeedBasis basis) { 451f6af633fSnbeams Ceed ceed; 45238293ee6SJeremy L Thompson CeedBasis_Magma *impl; 45338293ee6SJeremy L Thompson 4542b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 455940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetData(basis, &impl)); 456e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 4572b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 458f6af633fSnbeams #else 4592b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->module)); 460f6af633fSnbeams #endif 461940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_interp_1d)); 462940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_grad_1d)); 463940a72f1SSebastian Grimberg CeedCallBackend(magma_free(impl->d_q_weight_1d)); 4642b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 465e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4667f5b9731SStan Tomov } 4677f5b9731SStan Tomov 468940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 469940a72f1SSebastian Grimberg // Destroy non-tensor basis 470940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 471940a72f1SSebastian Grimberg static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 472023b8a51Sabdelfattah83 Ceed ceed; 47338293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 47438293ee6SJeremy L Thompson 475940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 47638293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 477940a72f1SSebastian Grimberg #ifdef CEED_MAGMA_USE_HIP 478940a72f1SSebastian Grimberg CeedCallHip(ceed, hipModuleUnload(impl->module_weight)); 479940a72f1SSebastian Grimberg #else 480940a72f1SSebastian Grimberg CeedCallCuda(ceed, cuModuleUnload(impl->module_weight)); 481940a72f1SSebastian Grimberg #endif 482940a72f1SSebastian Grimberg for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 483940a72f1SSebastian Grimberg if (impl->module_interp[in]) { 484940a72f1SSebastian Grimberg #ifdef CEED_MAGMA_USE_HIP 485940a72f1SSebastian Grimberg CeedCallHip(ceed, hipModuleUnload(impl->module_interp[in])); 486940a72f1SSebastian Grimberg #else 487940a72f1SSebastian Grimberg CeedCallCuda(ceed, cuModuleUnload(impl->module_interp[in])); 488940a72f1SSebastian Grimberg #endif 489940a72f1SSebastian Grimberg } 490940a72f1SSebastian Grimberg } 49138293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_interp)); 49238293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_grad)); 493*9d15e85bSSebastian Grimberg CeedCallBackend(magma_free(impl->d_div)); 494*9d15e85bSSebastian Grimberg CeedCallBackend(magma_free(impl->d_curl)); 49538293ee6SJeremy L Thompson CeedCallBackend(magma_free(impl->d_q_weight)); 4962b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 497e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 498868539c2SNatalie Beams } 499868539c2SNatalie Beams 500940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 501940a72f1SSebastian Grimberg // Create tensor 502940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 503940a72f1SSebastian Grimberg int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 50438293ee6SJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 50538293ee6SJeremy L Thompson Ceed ceed, ceed_delegate; 50638293ee6SJeremy L Thompson Ceed_Magma *data; 507940a72f1SSebastian Grimberg char *interp_kernel_path, *grad_kernel_path, *weight_kernel_path, *basis_kernel_source; 508940a72f1SSebastian Grimberg CeedInt num_comp; 5097f5b9731SStan Tomov CeedBasis_Magma *impl; 51038293ee6SJeremy L Thompson 5112b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5122b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 513940a72f1SSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 514e0582403Sabdelfattah83 515940a72f1SSebastian Grimberg // Copy basis data to GPU 516940a72f1SSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); 517940a72f1SSebastian Grimberg magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); 51838293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0]))); 51938293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue); 52038293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0]))); 52138293ee6SJeremy L Thompson magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue); 5227f5b9731SStan Tomov 523940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 524940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 525940a72f1SSebastian Grimberg 526940a72f1SSebastian Grimberg // Compile kernels 527940a72f1SSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 528940a72f1SSebastian Grimberg { 529940a72f1SSebastian Grimberg char *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp"; 530940a72f1SSebastian Grimberg CeedInt interp_kernel_name_len = strlen(interp_kernel_name_base) + 6; 531940a72f1SSebastian Grimberg char interp_kernel_name[interp_kernel_name_len]; 532940a72f1SSebastian Grimberg 533940a72f1SSebastian Grimberg snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim); 534940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path)); 535940a72f1SSebastian Grimberg } 536940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 537940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source)); 538940a72f1SSebastian Grimberg { 539940a72f1SSebastian Grimberg char *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad"; 540940a72f1SSebastian Grimberg CeedInt grad_kernel_name_len = strlen(grad_kernel_name_base) + 6; 541940a72f1SSebastian Grimberg char grad_kernel_name[grad_kernel_name_len]; 542940a72f1SSebastian Grimberg 543940a72f1SSebastian Grimberg snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim); 544940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path)); 545940a72f1SSebastian Grimberg } 546940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source)); 547940a72f1SSebastian Grimberg { 548940a72f1SSebastian Grimberg char *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight"; 549940a72f1SSebastian Grimberg CeedInt weight_kernel_name_len = strlen(weight_kernel_name_base) + 6; 550940a72f1SSebastian Grimberg char weight_kernel_name[weight_kernel_name_len]; 551940a72f1SSebastian Grimberg 552940a72f1SSebastian Grimberg snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim); 553940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path)); 554940a72f1SSebastian Grimberg } 555940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 556940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 557940a72f1SSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P", 558940a72f1SSebastian Grimberg P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d))); 559940a72f1SSebastian Grimberg switch (dim) { 560940a72f1SSebastian Grimberg case 1: 561940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp)); 562940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose)); 563940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad)); 564940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose)); 565940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight)); 566940a72f1SSebastian Grimberg break; 567940a72f1SSebastian Grimberg case 2: 568940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp)); 569940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose)); 570940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad)); 571940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose)); 572940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight)); 573940a72f1SSebastian Grimberg break; 574940a72f1SSebastian Grimberg case 3: 575940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp)); 576940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose)); 577940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad)); 578940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose)); 579940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight)); 580940a72f1SSebastian Grimberg break; 581940a72f1SSebastian Grimberg } 582940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&interp_kernel_path)); 583940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&grad_kernel_path)); 584940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 585940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 5867f5b9731SStan Tomov 5872b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 588940a72f1SSebastian Grimberg 589940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); 590940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); 591e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5927f5b9731SStan Tomov } 5937f5b9731SStan Tomov 594940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 595940a72f1SSebastian Grimberg // Create non-tensor H^1 596940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 597940a72f1SSebastian Grimberg int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 59838293ee6SJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 59938293ee6SJeremy L Thompson Ceed ceed, ceed_delegate; 600e0582403Sabdelfattah83 Ceed_Magma *data; 601940a72f1SSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 60238293ee6SJeremy L Thompson CeedBasisNonTensor_Magma *impl; 60338293ee6SJeremy L Thompson 60438293ee6SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 6052b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 6062b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 607023b8a51Sabdelfattah83 608940a72f1SSebastian Grimberg // Copy basis data to GPU 60938293ee6SJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 61038293ee6SJeremy L Thompson magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 611*9d15e85bSSebastian Grimberg if (interp) { 612*9d15e85bSSebastian Grimberg CeedInt q_comp_interp; 613*9d15e85bSSebastian Grimberg 614*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 615*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 616*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 617*9d15e85bSSebastian Grimberg } 618*9d15e85bSSebastian Grimberg if (grad) { 619*9d15e85bSSebastian Grimberg CeedInt q_comp_grad; 620*9d15e85bSSebastian Grimberg 621*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 622*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0]))); 623*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); 624*9d15e85bSSebastian Grimberg } 625*9d15e85bSSebastian Grimberg 626*9d15e85bSSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 627*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 628*9d15e85bSSebastian Grimberg 629*9d15e85bSSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 630*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 631*9d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 632*9d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 633*9d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 634*9d15e85bSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); 635*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); 636*9d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 637*9d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 638*9d15e85bSSebastian Grimberg 639*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, impl)); 640*9d15e85bSSebastian Grimberg 641*9d15e85bSSebastian Grimberg // Register backend functions 642*9d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 643*9d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 644*9d15e85bSSebastian Grimberg return CEED_ERROR_SUCCESS; 645*9d15e85bSSebastian Grimberg } 646*9d15e85bSSebastian Grimberg 647*9d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 648*9d15e85bSSebastian Grimberg // Create non-tensor H(div) 649*9d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 650*9d15e85bSSebastian Grimberg int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 651*9d15e85bSSebastian Grimberg const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 652*9d15e85bSSebastian Grimberg Ceed ceed, ceed_delegate; 653*9d15e85bSSebastian Grimberg Ceed_Magma *data; 654*9d15e85bSSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 655*9d15e85bSSebastian Grimberg CeedBasisNonTensor_Magma *impl; 656*9d15e85bSSebastian Grimberg 657*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 658*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 659*9d15e85bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 660*9d15e85bSSebastian Grimberg 661*9d15e85bSSebastian Grimberg // Copy basis data to GPU 662*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 663*9d15e85bSSebastian Grimberg magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 664*9d15e85bSSebastian Grimberg if (interp) { 665*9d15e85bSSebastian Grimberg CeedInt q_comp_interp; 666*9d15e85bSSebastian Grimberg 667*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 668*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 669*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 670*9d15e85bSSebastian Grimberg } 671*9d15e85bSSebastian Grimberg if (div) { 672*9d15e85bSSebastian Grimberg CeedInt q_comp_div; 673*9d15e85bSSebastian Grimberg 674*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 675*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0]))); 676*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue); 677*9d15e85bSSebastian Grimberg } 678*9d15e85bSSebastian Grimberg 679*9d15e85bSSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 680*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 681*9d15e85bSSebastian Grimberg 682*9d15e85bSSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 683*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 684*9d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 685*9d15e85bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 686*9d15e85bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 687*9d15e85bSSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); 688*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); 689*9d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 690*9d15e85bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 691*9d15e85bSSebastian Grimberg 692*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, impl)); 693*9d15e85bSSebastian Grimberg 694*9d15e85bSSebastian Grimberg // Register backend functions 695*9d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 696*9d15e85bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 697*9d15e85bSSebastian Grimberg return CEED_ERROR_SUCCESS; 698*9d15e85bSSebastian Grimberg } 699*9d15e85bSSebastian Grimberg 700*9d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 701*9d15e85bSSebastian Grimberg // Create non-tensor H(curl) 702*9d15e85bSSebastian Grimberg //------------------------------------------------------------------------------ 703*9d15e85bSSebastian Grimberg int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 704*9d15e85bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 705*9d15e85bSSebastian Grimberg Ceed ceed, ceed_delegate; 706*9d15e85bSSebastian Grimberg Ceed_Magma *data; 707*9d15e85bSSebastian Grimberg char *weight_kernel_path, *basis_kernel_source; 708*9d15e85bSSebastian Grimberg CeedBasisNonTensor_Magma *impl; 709*9d15e85bSSebastian Grimberg 710*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 711*9d15e85bSSebastian Grimberg CeedCallBackend(CeedGetData(ceed, &data)); 712*9d15e85bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 713*9d15e85bSSebastian Grimberg 714*9d15e85bSSebastian Grimberg // Copy basis data to GPU 715*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); 716*9d15e85bSSebastian Grimberg magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); 717*9d15e85bSSebastian Grimberg if (interp) { 718*9d15e85bSSebastian Grimberg CeedInt q_comp_interp; 719*9d15e85bSSebastian Grimberg 720*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 721*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0]))); 722*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); 723*9d15e85bSSebastian Grimberg } 724*9d15e85bSSebastian Grimberg if (curl) { 725*9d15e85bSSebastian Grimberg CeedInt q_comp_curl; 726*9d15e85bSSebastian Grimberg 727*9d15e85bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 728*9d15e85bSSebastian Grimberg CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_qpts * num_nodes * q_comp_curl * sizeof(curl[0]))); 729*9d15e85bSSebastian Grimberg magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue); 730*9d15e85bSSebastian Grimberg } 731940a72f1SSebastian Grimberg 732940a72f1SSebastian Grimberg // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data 733940a72f1SSebastian Grimberg CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); 734940a72f1SSebastian Grimberg 735940a72f1SSebastian Grimberg // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) 736940a72f1SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); 737940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 738940a72f1SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); 739940a72f1SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 740940a72f1SSebastian Grimberg CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); 741940a72f1SSebastian Grimberg CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); 742940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&weight_kernel_path)); 743940a72f1SSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 744868539c2SNatalie Beams 745023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisSetData(basis, impl)); 746940a72f1SSebastian Grimberg 747940a72f1SSebastian Grimberg // Register backend functions 748940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 749940a72f1SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 750e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7517f5b9731SStan Tomov } 752940a72f1SSebastian Grimberg 753940a72f1SSebastian Grimberg //------------------------------------------------------------------------------ 754