13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 34444f328STzanio // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 54444f328STzanio // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 74444f328STzanio 890104f39SStan Tomov // magma functions specific to ceed 9972b3d9dSNatalie Beams #ifndef _ceed_magma_h 103d576824SJeremy L Thompson #define _ceed_magma_h 1190104f39SStan Tomov 1249aac155SJeremy L Thompson #include <ceed.h> 13ec3da8bcSJed Brown #include <ceed/backend.h> 14e0582403Sabdelfattah83 #include <magma_v2.h> 15e0582403Sabdelfattah83 16f6af633fSnbeams #define MAGMA_MAXTHREADS_1D 128 17f6af633fSnbeams #define MAGMA_MAXTHREADS_2D 128 18f6af633fSnbeams #define MAGMA_MAXTHREADS_3D 64 19023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_MAXTHREADS (128) 20023b8a51Sabdelfattah83 21f6af633fSnbeams // Define macro for determining number of threads in y-direction 22f6af633fSnbeams // for basis kernels 23f6af633fSnbeams #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) 24023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N)))) 25023b8a51Sabdelfattah83 #define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B)) 26023b8a51Sabdelfattah83 27023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40) 28023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40) 29023b8a51Sabdelfattah83 30f6af633fSnbeams // Define macro for computing the total threads in a block 31f6af633fSnbeams // for use with __launch_bounds__() 32f6af633fSnbeams #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) 33f6af633fSnbeams 34023b8a51Sabdelfattah83 // Define macro for non-tensor kernel instances 35023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_KERNEL_INSTANCES (5) 36023b8a51Sabdelfattah83 #define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000 37023b8a51Sabdelfattah83 38e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 39c42f38b1Snbeams typedef hipModule_t CeedMagmaModule; 40c42f38b1Snbeams typedef hipFunction_t CeedMagmaFunction; 41*eb7e6cafSJeremy L Thompson #define CeedCompileMagma CeedCompile_Hip 42*eb7e6cafSJeremy L Thompson #define CeedGetKernelMagma CeedGetKernel_Hip 43*eb7e6cafSJeremy L Thompson #define CeedRunKernelMagma CeedRunKernel_Hip 44*eb7e6cafSJeremy L Thompson #define CeedRunKernelDimMagma CeedRunKernelDim_Hip 45*eb7e6cafSJeremy L Thompson #define CeedRunKernelDimSharedMagma CeedRunKernelDimShared_Hip 46f6af633fSnbeams #else 47c42f38b1Snbeams typedef CUmodule CeedMagmaModule; 48c42f38b1Snbeams typedef CUfunction CeedMagmaFunction; 49*eb7e6cafSJeremy L Thompson #define CeedCompileMagma CeedCompile_Cuda 50*eb7e6cafSJeremy L Thompson #define CeedGetKernelMagma CeedGetKernel_Cuda 51*eb7e6cafSJeremy L Thompson #define CeedRunKernelMagma CeedRunKernel_Cuda 52*eb7e6cafSJeremy L Thompson #define CeedRunKernelDimMagma CeedRunKernelDim_Cuda 53*eb7e6cafSJeremy L Thompson #define CeedRunKernelDimSharedMagma CeedRunKernelDimShared_Cuda 54f6af633fSnbeams #endif 55f6af633fSnbeams 562b730f8bSJeremy L Thompson typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t; 57e0582403Sabdelfattah83 58e0582403Sabdelfattah83 typedef struct { 59e0582403Sabdelfattah83 magma_kernel_mode_t basis_kernel_mode; 60e0582403Sabdelfattah83 magma_device_t device; 61e0582403Sabdelfattah83 magma_queue_t queue; 62e0582403Sabdelfattah83 } Ceed_Magma; 635a9ca9adSVeselin Dobrev 647f5b9731SStan Tomov typedef struct { 65c42f38b1Snbeams CeedMagmaModule module; 66c42f38b1Snbeams CeedMagmaFunction magma_interp; 67c42f38b1Snbeams CeedMagmaFunction magma_interp_tr; 68c42f38b1Snbeams CeedMagmaFunction magma_grad; 69c42f38b1Snbeams CeedMagmaFunction magma_grad_tr; 70c42f38b1Snbeams CeedMagmaFunction magma_weight; 717f5b9731SStan Tomov CeedScalar *dqref1d; 727f5b9731SStan Tomov CeedScalar *dinterp1d; 737f5b9731SStan Tomov CeedScalar *dgrad1d; 747f5b9731SStan Tomov CeedScalar *dqweight1d; 757f5b9731SStan Tomov } CeedBasis_Magma; 767f5b9731SStan Tomov 777f5b9731SStan Tomov typedef struct { 78023b8a51Sabdelfattah83 CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 79023b8a51Sabdelfattah83 CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 80023b8a51Sabdelfattah83 CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 81023b8a51Sabdelfattah83 CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 82023b8a51Sabdelfattah83 CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 83868539c2SNatalie Beams CeedScalar *dqref; 84868539c2SNatalie Beams CeedScalar *dinterp; 85868539c2SNatalie Beams CeedScalar *dgrad; 86868539c2SNatalie Beams CeedScalar *dqweight; 87868539c2SNatalie Beams } CeedBasisNonTensor_Magma; 88868539c2SNatalie Beams 89c8b3a627SJed Brown typedef enum { 90c8b3a627SJed Brown OWNED_NONE = 0, 91c8b3a627SJed Brown OWNED_UNPINNED, 92c8b3a627SJed Brown OWNED_PINNED, 93c8b3a627SJed Brown } OwnershipMode; 94c8b3a627SJed Brown 95868539c2SNatalie Beams typedef struct { 96c42f38b1Snbeams CeedMagmaModule module; 97c42f38b1Snbeams CeedMagmaFunction StridedTranspose; 98c42f38b1Snbeams CeedMagmaFunction StridedNoTranspose; 99c42f38b1Snbeams CeedMagmaFunction OffsetTranspose; 100c42f38b1Snbeams CeedMagmaFunction OffsetNoTranspose; 101d655899aSNatalie Beams CeedInt *offsets; 102d655899aSNatalie Beams CeedInt *doffsets; 103c8b3a627SJed Brown OwnershipMode own_; 104868539c2SNatalie Beams int down_; // cover a case where we own Device memory 105868539c2SNatalie Beams } CeedElemRestriction_Magma; 106868539c2SNatalie Beams 107868539c2SNatalie Beams typedef struct { 1087f5b9731SStan Tomov const CeedScalar **inputs; 1097f5b9731SStan Tomov CeedScalar **outputs; 1107f5b9731SStan Tomov bool setupdone; 1117f5b9731SStan Tomov } CeedQFunction_Magma; 1127f5b9731SStan Tomov 11390104f39SStan Tomov #define USE_MAGMA_BATCH 11497ee337cSStan Tomov #define USE_MAGMA_BATCH2 1157f5b9731SStan Tomov #define USE_MAGMA_BATCH3 1167f5b9731SStan Tomov #define USE_MAGMA_BATCH4 11790104f39SStan Tomov 1182b730f8bSJeremy L Thompson CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, 119e0582403Sabdelfattah83 magma_queue_t queue); 120e0582403Sabdelfattah83 121023b8a51Sabdelfattah83 CEED_INTERN int magma_gemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha, 122023b8a51Sabdelfattah83 const CeedScalar *dA, magma_int_t ldda, const CeedScalar *dB, magma_int_t lddb, CeedScalar beta, CeedScalar *dC, 1232b730f8bSJeremy L Thompson magma_int_t lddc, magma_queue_t queue); 1242b730f8bSJeremy L Thompson 1252b730f8bSJeremy L Thompson CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); 1262dc3fb5fSabdelfattah83 127023b8a51Sabdelfattah83 CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode emode, CeedTransposeMode tmode, int P_, int N, int Q_); 128023b8a51Sabdelfattah83 1292b730f8bSJeremy L Thompson CEED_INTERN magma_int_t magma_isdevptr(const void *A); 1307f5b9731SStan Tomov 1312b730f8bSJeremy L Thompson CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 1322b730f8bSJeremy L Thompson const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); 1337f5b9731SStan Tomov 1342b730f8bSJeremy L Thompson CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, 1352b730f8bSJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); 136868539c2SNatalie Beams 1372b730f8bSJeremy L Thompson CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r); 138868539c2SNatalie Beams 1397f5b9731SStan Tomov // comment the line below to use the default magma_is_devptr function 1407f5b9731SStan Tomov #define magma_is_devptr magma_isdevptr 1417f5b9731SStan Tomov 142e0582403Sabdelfattah83 // if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync 143e0582403Sabdelfattah83 // should do nothing 144e0582403Sabdelfattah83 #define ceed_magma_queue_sync(...) 145e0582403Sabdelfattah83 1467f5b9731SStan Tomov // batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value> 1477f5b9731SStan Tomov #ifndef MAGMA_BATCH_STRIDE 1487f5b9731SStan Tomov #define MAGMA_BATCH_STRIDE (1000) 1497f5b9731SStan Tomov #endif 150e0582403Sabdelfattah83 1513d576824SJeremy L Thompson #endif // _ceed_magma_h 152