1*7f5b9731SStan Tomov // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*7f5b9731SStan Tomov // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*7f5b9731SStan Tomov // All Rights reserved. See files LICENSE and NOTICE for details. 4*7f5b9731SStan Tomov // 5*7f5b9731SStan Tomov // This file is part of CEED, a collection of benchmarks, miniapps, software 6*7f5b9731SStan Tomov // libraries and APIs for efficient high-order finite element and spectral 7*7f5b9731SStan Tomov // element discretizations for exascale applications. For more information and 8*7f5b9731SStan Tomov // source code availability see http://github.com/ceed. 9*7f5b9731SStan Tomov // 10*7f5b9731SStan Tomov // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*7f5b9731SStan Tomov // a collaborative effort of two U.S. Department of Energy organizations (Office 12*7f5b9731SStan Tomov // of Science and the National Nuclear Security Administration) responsible for 13*7f5b9731SStan Tomov // the planning and preparation of a capable exascale ecosystem, including 14*7f5b9731SStan Tomov // software, applications, hardware, advanced system engineering and early 15*7f5b9731SStan Tomov // testbed platforms, in support of the nation's exascale computing imperative. 16*7f5b9731SStan Tomov 17*7f5b9731SStan Tomov #include "ceed-magma.h" 18*7f5b9731SStan Tomov 19*7f5b9731SStan Tomov #ifdef __cplusplus 20*7f5b9731SStan Tomov CEED_INTERN "C" 21*7f5b9731SStan Tomov #endif 22*7f5b9731SStan Tomov int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 23*7f5b9731SStan Tomov CeedTransposeMode tmode, CeedEvalMode emode, 24*7f5b9731SStan Tomov CeedVector U, CeedVector V) 25*7f5b9731SStan Tomov { 26*7f5b9731SStan Tomov int ierr; 27*7f5b9731SStan Tomov Ceed ceed; 28*7f5b9731SStan Tomov ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 29*7f5b9731SStan Tomov CeedInt dim, ncomp, ndof, nqpt; 30*7f5b9731SStan Tomov ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 31*7f5b9731SStan Tomov ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr); 32*7f5b9731SStan Tomov ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChk(ierr); 33*7f5b9731SStan Tomov ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChk(ierr); 34*7f5b9731SStan Tomov const CeedScalar *u; 35*7f5b9731SStan Tomov CeedScalar *v; 36*7f5b9731SStan Tomov if (U) { 37*7f5b9731SStan Tomov ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChk(ierr); 38*7f5b9731SStan Tomov } else if (emode != CEED_EVAL_WEIGHT) { 39*7f5b9731SStan Tomov // LCOV_EXCL_START 40*7f5b9731SStan Tomov return CeedError(ceed, 1, 41*7f5b9731SStan Tomov "An input vector is required for this CeedEvalMode"); 42*7f5b9731SStan Tomov // LCOV_EXCL_STOP 43*7f5b9731SStan Tomov } 44*7f5b9731SStan Tomov ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChk(ierr); 45*7f5b9731SStan Tomov 46*7f5b9731SStan Tomov CeedBasis_Magma *impl; 47*7f5b9731SStan Tomov ierr = CeedBasisGetData(basis, (void*)&impl); CeedChk(ierr); 48*7f5b9731SStan Tomov 49*7f5b9731SStan Tomov CeedInt P1d, Q1d; 50*7f5b9731SStan Tomov ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 51*7f5b9731SStan Tomov ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 52*7f5b9731SStan Tomov 53*7f5b9731SStan Tomov CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d", 54*7f5b9731SStan Tomov ncomp*CeedIntPow(P1d, dim), ncomp); 55*7f5b9731SStan Tomov 56*7f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 57*7f5b9731SStan Tomov CeedInt length; 58*7f5b9731SStan Tomov ierr = CeedVectorGetLength(V, &length); 59*7f5b9731SStan Tomov magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length ); 60*7f5b9731SStan Tomov } 61*7f5b9731SStan Tomov if (emode & CEED_EVAL_INTERP) { 62*7f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 63*7f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 64*7f5b9731SStan Tomov P = Q1d; Q = P1d; 65*7f5b9731SStan Tomov } 66*7f5b9731SStan Tomov 67*7f5b9731SStan Tomov // Define element sizes for dofs/quad 68*7f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 69*7f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 70*7f5b9731SStan Tomov 71*7f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 72*7f5b9731SStan Tomov // elem component 73*7f5b9731SStan Tomov // component elem 74*7f5b9731SStan Tomov // node node 75*7f5b9731SStan Tomov 76*7f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 77*7f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 78*7f5b9731SStan Tomov 79*7f5b9731SStan Tomov // Element strides 80*7f5b9731SStan Tomov CeedInt u_elstride = ncomp * eldofssize; 81*7f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 82*7f5b9731SStan Tomov // Component strides 83*7f5b9731SStan Tomov CeedInt u_compstride = eldofssize; 84*7f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 85*7f5b9731SStan Tomov 86*7f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 87*7f5b9731SStan Tomov if(tmode == CEED_TRANSPOSE) { 88*7f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 89*7f5b9731SStan Tomov // Element strides 90*7f5b9731SStan Tomov v_elstride = ncomp * eldofssize; 91*7f5b9731SStan Tomov u_elstride = elquadsize; 92*7f5b9731SStan Tomov // Component strides 93*7f5b9731SStan Tomov v_compstride = eldofssize; 94*7f5b9731SStan Tomov u_compstride = nelem * elquadsize; 95*7f5b9731SStan Tomov } 96*7f5b9731SStan Tomov 97*7f5b9731SStan Tomov // Loop through components and apply batch over elements 98*7f5b9731SStan Tomov for (CeedInt comp_ctr = 0; comp_ctr < ncomp; comp_ctr++){ 99*7f5b9731SStan Tomov magmablas_dbasis_apply_batched_eval_interp(P, Q, dim, ncomp, 100*7f5b9731SStan Tomov impl->dinterp1d, tmode, 101*7f5b9731SStan Tomov u + u_compstride * comp_ctr, u_elstride, 102*7f5b9731SStan Tomov v + v_compstride * comp_ctr, v_elstride, 103*7f5b9731SStan Tomov nelem); 104*7f5b9731SStan Tomov } 105*7f5b9731SStan Tomov 106*7f5b9731SStan Tomov } 107*7f5b9731SStan Tomov if (emode & CEED_EVAL_GRAD) { 108*7f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 109*7f5b9731SStan Tomov // In CEED_NOTRANSPOSE mode: 110*7f5b9731SStan Tomov // u is (P^dim x nc), column-major layout (nc = ncomp) 111*7f5b9731SStan Tomov // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 112*7f5b9731SStan Tomov // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 113*7f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 114*7f5b9731SStan Tomov P = Q1d, Q = P1d; 115*7f5b9731SStan Tomov } 116*7f5b9731SStan Tomov 117*7f5b9731SStan Tomov // Define element sizes for dofs/quad 118*7f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 119*7f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 120*7f5b9731SStan Tomov 121*7f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 122*7f5b9731SStan Tomov // dim 123*7f5b9731SStan Tomov // elem component 124*7f5b9731SStan Tomov // component elem 125*7f5b9731SStan Tomov // node node 126*7f5b9731SStan Tomov 127*7f5b9731SStan Tomov 128*7f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 129*7f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 130*7f5b9731SStan Tomov 131*7f5b9731SStan Tomov // Element strides 132*7f5b9731SStan Tomov CeedInt u_elstride = ncomp * eldofssize; 133*7f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 134*7f5b9731SStan Tomov // Component strides 135*7f5b9731SStan Tomov CeedInt u_compstride = eldofssize; 136*7f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 137*7f5b9731SStan Tomov // Dimension strides 138*7f5b9731SStan Tomov CeedInt u_dimstride = 0; 139*7f5b9731SStan Tomov CeedInt v_dimstride = nelem * elquadsize * ncomp; 140*7f5b9731SStan Tomov 141*7f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 142*7f5b9731SStan Tomov if(tmode == CEED_TRANSPOSE) { 143*7f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 144*7f5b9731SStan Tomov // Element strides 145*7f5b9731SStan Tomov v_elstride = ncomp * eldofssize; 146*7f5b9731SStan Tomov u_elstride = elquadsize; 147*7f5b9731SStan Tomov // Component strides 148*7f5b9731SStan Tomov v_compstride = eldofssize; 149*7f5b9731SStan Tomov u_compstride = nelem * elquadsize; 150*7f5b9731SStan Tomov // Dimension strides 151*7f5b9731SStan Tomov v_dimstride = 0; 152*7f5b9731SStan Tomov u_dimstride = nelem * elquadsize * ncomp; 153*7f5b9731SStan Tomov 154*7f5b9731SStan Tomov } 155*7f5b9731SStan Tomov 156*7f5b9731SStan Tomov // Loop through grad dimensions and components, batch call over elements 157*7f5b9731SStan Tomov for(CeedInt dim_ctr = 0; dim_ctr < dim; dim_ctr++){ 158*7f5b9731SStan Tomov for (CeedInt comp_ctr = 0; comp_ctr < ncomp; comp_ctr++){ 159*7f5b9731SStan Tomov magmablas_dbasis_apply_batched_eval_grad(P, Q, dim, ncomp, nqpt, 160*7f5b9731SStan Tomov impl->dinterp1d, impl->dgrad1d, tmode, 161*7f5b9731SStan Tomov u + dim_ctr * u_dimstride + u_compstride * comp_ctr, 162*7f5b9731SStan Tomov u_elstride, 163*7f5b9731SStan Tomov v + dim_ctr * v_dimstride + v_compstride * comp_ctr, 164*7f5b9731SStan Tomov v_elstride, 165*7f5b9731SStan Tomov nelem, dim_ctr); 166*7f5b9731SStan Tomov } 167*7f5b9731SStan Tomov } 168*7f5b9731SStan Tomov } 169*7f5b9731SStan Tomov if (emode & CEED_EVAL_WEIGHT) { 170*7f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) 171*7f5b9731SStan Tomov // LCOV_EXCL_START 172*7f5b9731SStan Tomov return CeedError(ceed, 1, 173*7f5b9731SStan Tomov "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 174*7f5b9731SStan Tomov // LCOV_EXCL_STOP 175*7f5b9731SStan Tomov CeedInt Q = Q1d; 176*7f5b9731SStan Tomov int eldofssize = CeedIntPow(Q, dim); 177*7f5b9731SStan Tomov magmablas_dbasis_apply_batched_eval_weight(Q, dim, impl->dqweight1d, 178*7f5b9731SStan Tomov v, eldofssize, 179*7f5b9731SStan Tomov nelem); 180*7f5b9731SStan Tomov } 181*7f5b9731SStan Tomov 182*7f5b9731SStan Tomov if(emode!=CEED_EVAL_WEIGHT) { 183*7f5b9731SStan Tomov ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr); 184*7f5b9731SStan Tomov } 185*7f5b9731SStan Tomov ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr); 186*7f5b9731SStan Tomov return 0; 187*7f5b9731SStan Tomov } 188*7f5b9731SStan Tomov 189*7f5b9731SStan Tomov #ifdef __cplusplus 190*7f5b9731SStan Tomov CEED_INTERN "C" 191*7f5b9731SStan Tomov #endif 192*7f5b9731SStan Tomov int CeedBasisDestroy_Magma(CeedBasis basis) 193*7f5b9731SStan Tomov { 194*7f5b9731SStan Tomov int ierr; 195*7f5b9731SStan Tomov CeedBasis_Magma *impl; 196*7f5b9731SStan Tomov ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr); 197*7f5b9731SStan Tomov 198*7f5b9731SStan Tomov ierr = magma_free(impl->dqref1d); CeedChk(ierr); 199*7f5b9731SStan Tomov ierr = magma_free(impl->dinterp1d); CeedChk(ierr); 200*7f5b9731SStan Tomov ierr = magma_free(impl->dgrad1d); CeedChk(ierr); 201*7f5b9731SStan Tomov ierr = magma_free(impl->dqweight1d); CeedChk(ierr); 202*7f5b9731SStan Tomov 203*7f5b9731SStan Tomov ierr = CeedFree(&impl); CeedChk(ierr); 204*7f5b9731SStan Tomov 205*7f5b9731SStan Tomov return 0; 206*7f5b9731SStan Tomov } 207*7f5b9731SStan Tomov 208*7f5b9731SStan Tomov #ifdef __cplusplus 209*7f5b9731SStan Tomov CEED_INTERN "C" 210*7f5b9731SStan Tomov #endif 211*7f5b9731SStan Tomov int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, 212*7f5b9731SStan Tomov CeedInt Q1d, const CeedScalar *interp1d, 213*7f5b9731SStan Tomov const CeedScalar *grad1d, 214*7f5b9731SStan Tomov const CeedScalar *qref1d, 215*7f5b9731SStan Tomov const CeedScalar *qweight1d, 216*7f5b9731SStan Tomov CeedBasis basis) 217*7f5b9731SStan Tomov { 218*7f5b9731SStan Tomov int ierr; 219*7f5b9731SStan Tomov CeedBasis_Magma *impl; 220*7f5b9731SStan Tomov Ceed ceed; 221*7f5b9731SStan Tomov ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 222*7f5b9731SStan Tomov 223*7f5b9731SStan Tomov ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 224*7f5b9731SStan Tomov CeedBasisApply_Magma); CeedChk(ierr); 225*7f5b9731SStan Tomov ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 226*7f5b9731SStan Tomov CeedBasisDestroy_Magma); CeedChk(ierr); 227*7f5b9731SStan Tomov 228*7f5b9731SStan Tomov ierr = CeedCalloc(1,&impl); CeedChk(ierr); 229*7f5b9731SStan Tomov ierr = CeedBasisSetData(basis, (void *)&impl); CeedChk(ierr); 230*7f5b9731SStan Tomov 231*7f5b9731SStan Tomov // Copy qref1d to the GPU 232*7f5b9731SStan Tomov ierr = magma_malloc((void**)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 233*7f5b9731SStan Tomov CeedChk(ierr); 234*7f5b9731SStan Tomov magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1); 235*7f5b9731SStan Tomov 236*7f5b9731SStan Tomov // Copy interp1d to the GPU 237*7f5b9731SStan Tomov ierr = magma_malloc((void**)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 238*7f5b9731SStan Tomov CeedChk(ierr); 239*7f5b9731SStan Tomov magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1); 240*7f5b9731SStan Tomov 241*7f5b9731SStan Tomov // Copy grad1d to the GPU 242*7f5b9731SStan Tomov ierr = magma_malloc((void**)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 243*7f5b9731SStan Tomov CeedChk(ierr); 244*7f5b9731SStan Tomov magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1); 245*7f5b9731SStan Tomov 246*7f5b9731SStan Tomov // Copy qweight1d to the GPU 247*7f5b9731SStan Tomov ierr = magma_malloc((void**)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 248*7f5b9731SStan Tomov CeedChk(ierr); 249*7f5b9731SStan Tomov magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1); 250*7f5b9731SStan Tomov 251*7f5b9731SStan Tomov return 0; 252*7f5b9731SStan Tomov } 253*7f5b9731SStan Tomov 254*7f5b9731SStan Tomov #ifdef __cplusplus 255*7f5b9731SStan Tomov CEED_INTERN "C" 256*7f5b9731SStan Tomov #endif 257*7f5b9731SStan Tomov int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, 258*7f5b9731SStan Tomov CeedInt ndof, CeedInt nqpts, 259*7f5b9731SStan Tomov const CeedScalar *interp, 260*7f5b9731SStan Tomov const CeedScalar *grad, 261*7f5b9731SStan Tomov const CeedScalar *qref, 262*7f5b9731SStan Tomov const CeedScalar *qweight, 263*7f5b9731SStan Tomov CeedBasis basis) 264*7f5b9731SStan Tomov { 265*7f5b9731SStan Tomov int ierr; 266*7f5b9731SStan Tomov Ceed ceed; 267*7f5b9731SStan Tomov ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 268*7f5b9731SStan Tomov 269*7f5b9731SStan Tomov return CeedError(ceed, 1, "Backend does not implement non-tensor bases"); 270*7f5b9731SStan Tomov } 271