13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 34444f328STzanio // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 54444f328STzanio // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 74444f328STzanio 890104f39SStan Tomov // magma functions specific to ceed 9972b3d9dSNatalie Beams #ifndef _ceed_magma_h 103d576824SJeremy L Thompson #define _ceed_magma_h 1190104f39SStan Tomov 12ec3da8bcSJed Brown #include <ceed/backend.h> 13*2b730f8bSJeremy L Thompson #include <ceed/ceed.h> 14e0582403Sabdelfattah83 #include <magma_v2.h> 15e0582403Sabdelfattah83 16f6af633fSnbeams #define MAGMA_MAXTHREADS_1D 128 17f6af633fSnbeams #define MAGMA_MAXTHREADS_2D 128 18f6af633fSnbeams #define MAGMA_MAXTHREADS_3D 64 19f6af633fSnbeams // Define macro for determining number of threads in y-direction 20f6af633fSnbeams // for basis kernels 21f6af633fSnbeams #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) 22f6af633fSnbeams // Define macro for computing the total threads in a block 23f6af633fSnbeams // for use with __launch_bounds__() 24f6af633fSnbeams #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) 25f6af633fSnbeams 26e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 27c42f38b1Snbeams typedef hipModule_t CeedMagmaModule; 28c42f38b1Snbeams typedef hipFunction_t CeedMagmaFunction; 29c42f38b1Snbeams #define CeedCompileMagma CeedCompileHip 30c42f38b1Snbeams #define CeedGetKernelMagma CeedGetKernelHip 31c42f38b1Snbeams #define CeedRunKernelMagma CeedRunKernelHip 32c42f38b1Snbeams #define CeedRunKernelDimMagma CeedRunKernelDimHip 33c42f38b1Snbeams #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip 34f6af633fSnbeams #else 35c42f38b1Snbeams typedef CUmodule CeedMagmaModule; 36c42f38b1Snbeams typedef CUfunction CeedMagmaFunction; 37c42f38b1Snbeams #define CeedCompileMagma CeedCompileCuda 38c42f38b1Snbeams #define CeedGetKernelMagma CeedGetKernelCuda 39c42f38b1Snbeams #define CeedRunKernelMagma CeedRunKernelCuda 40c42f38b1Snbeams #define CeedRunKernelDimMagma CeedRunKernelDimCuda 41c42f38b1Snbeams #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda 42f6af633fSnbeams #endif 43f6af633fSnbeams 44*2b730f8bSJeremy L Thompson typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t; 45e0582403Sabdelfattah83 46e0582403Sabdelfattah83 typedef struct { 47e0582403Sabdelfattah83 magma_kernel_mode_t basis_kernel_mode; 48e0582403Sabdelfattah83 magma_device_t device; 49e0582403Sabdelfattah83 magma_queue_t queue; 50e0582403Sabdelfattah83 } Ceed_Magma; 515a9ca9adSVeselin Dobrev 527f5b9731SStan Tomov typedef struct { 53c42f38b1Snbeams CeedMagmaModule module; 54c42f38b1Snbeams CeedMagmaFunction magma_interp; 55c42f38b1Snbeams CeedMagmaFunction magma_interp_tr; 56c42f38b1Snbeams CeedMagmaFunction magma_grad; 57c42f38b1Snbeams CeedMagmaFunction magma_grad_tr; 58c42f38b1Snbeams CeedMagmaFunction magma_weight; 597f5b9731SStan Tomov CeedScalar *dqref1d; 607f5b9731SStan Tomov CeedScalar *dinterp1d; 617f5b9731SStan Tomov CeedScalar *dgrad1d; 627f5b9731SStan Tomov CeedScalar *dqweight1d; 637f5b9731SStan Tomov } CeedBasis_Magma; 647f5b9731SStan Tomov 657f5b9731SStan Tomov typedef struct { 66868539c2SNatalie Beams CeedScalar *dqref; 67868539c2SNatalie Beams CeedScalar *dinterp; 68868539c2SNatalie Beams CeedScalar *dgrad; 69868539c2SNatalie Beams CeedScalar *dqweight; 70868539c2SNatalie Beams } CeedBasisNonTensor_Magma; 71868539c2SNatalie Beams 72c8b3a627SJed Brown typedef enum { 73c8b3a627SJed Brown OWNED_NONE = 0, 74c8b3a627SJed Brown OWNED_UNPINNED, 75c8b3a627SJed Brown OWNED_PINNED, 76c8b3a627SJed Brown } OwnershipMode; 77c8b3a627SJed Brown 78868539c2SNatalie Beams typedef struct { 79c42f38b1Snbeams CeedMagmaModule module; 80c42f38b1Snbeams CeedMagmaFunction StridedTranspose; 81c42f38b1Snbeams CeedMagmaFunction StridedNoTranspose; 82c42f38b1Snbeams CeedMagmaFunction OffsetTranspose; 83c42f38b1Snbeams CeedMagmaFunction OffsetNoTranspose; 84d655899aSNatalie Beams CeedInt *offsets; 85d655899aSNatalie Beams CeedInt *doffsets; 86c8b3a627SJed Brown OwnershipMode own_; 87868539c2SNatalie Beams int down_; // cover a case where we own Device memory 88868539c2SNatalie Beams } CeedElemRestriction_Magma; 89868539c2SNatalie Beams 90868539c2SNatalie Beams typedef struct { 917f5b9731SStan Tomov const CeedScalar **inputs; 927f5b9731SStan Tomov CeedScalar **outputs; 937f5b9731SStan Tomov bool setupdone; 947f5b9731SStan Tomov } CeedQFunction_Magma; 957f5b9731SStan Tomov 9690104f39SStan Tomov #define USE_MAGMA_BATCH 9797ee337cSStan Tomov #define USE_MAGMA_BATCH2 987f5b9731SStan Tomov #define USE_MAGMA_BATCH3 997f5b9731SStan Tomov #define USE_MAGMA_BATCH4 10090104f39SStan Tomov 101*2b730f8bSJeremy L Thompson CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, 102e0582403Sabdelfattah83 magma_queue_t queue); 103e0582403Sabdelfattah83 104*2b730f8bSJeremy L Thompson CEED_INTERN int magma_dgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, 105*2b730f8bSJeremy L Thompson const double *dA, magma_int_t ldda, const double *dB, magma_int_t lddb, double beta, double *dC, 106*2b730f8bSJeremy L Thompson magma_int_t lddc, magma_queue_t queue); 107*2b730f8bSJeremy L Thompson 108*2b730f8bSJeremy L Thompson CEED_INTERN int magma_sgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, 109*2b730f8bSJeremy L Thompson const float *dA, magma_int_t ldda, const float *dB, magma_int_t lddb, float beta, float *dC, magma_int_t lddc, 11080a9ef05SNatalie Beams magma_queue_t queue); 11180a9ef05SNatalie Beams 112*2b730f8bSJeremy L Thompson CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); 1132dc3fb5fSabdelfattah83 114*2b730f8bSJeremy L Thompson CEED_INTERN magma_int_t magma_isdevptr(const void *A); 1157f5b9731SStan Tomov 116*2b730f8bSJeremy L Thompson CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 117*2b730f8bSJeremy L Thompson const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); 1187f5b9731SStan Tomov 119*2b730f8bSJeremy L Thompson CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, 120*2b730f8bSJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); 121868539c2SNatalie Beams 122*2b730f8bSJeremy L Thompson CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r); 123868539c2SNatalie Beams 124*2b730f8bSJeremy L Thompson CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets, 125868539c2SNatalie Beams const CeedElemRestriction res); 126a8c028e3SNatalie Beams 1272dc3fb5fSabdelfattah83 CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op); 1287f5b9731SStan Tomov 1297f5b9731SStan Tomov // comment the line below to use the default magma_is_devptr function 1307f5b9731SStan Tomov #define magma_is_devptr magma_isdevptr 1317f5b9731SStan Tomov 132e0582403Sabdelfattah83 // if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync 133e0582403Sabdelfattah83 // should do nothing 134e0582403Sabdelfattah83 #define ceed_magma_queue_sync(...) 135e0582403Sabdelfattah83 1367f5b9731SStan Tomov // batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value> 1377f5b9731SStan Tomov #ifndef MAGMA_BATCH_STRIDE 1387f5b9731SStan Tomov #define MAGMA_BATCH_STRIDE (1000) 1397f5b9731SStan Tomov #endif 140e0582403Sabdelfattah83 1413d576824SJeremy L Thompson #endif // _ceed_magma_h 142