// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. // All Rights reserved. See files LICENSE and NOTICE for details. // // This file is part of CEED, a collection of benchmarks, miniapps, software // libraries and APIs for efficient high-order finite element and spectral // element discretizations for exascale applications. For more information and // source code availability see http://github.com/ceed. // // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, // a collaborative effort of two U.S. Department of Energy organizations (Office // of Science and the National Nuclear Security Administration) responsible for // the planning and preparation of a capable exascale ecosystem, including // software, applications, hardware, advanced system engineering and early // testbed platforms, in support of the nation's exascale computing imperative. #include #include "ceed-cuda-gen.h" #include #include #include "../cuda/ceed-cuda.h" #include "../cuda-reg/ceed-cuda-reg.h" #include "../cuda-shared/ceed-cuda-shared.h" static const char *atomicAdd = QUOTE( __device__ double atomicAdd(double *address, double val) { unsigned long long int *address_as_ull = (unsigned long long int *)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN // (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } ); static const char *deviceFunctions = QUOTE( typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; typedef struct { CeedInt tidx; CeedInt tidy; CeedInt tidz; CeedInt tid; CeedScalar* slice; } BackendData; template inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { for (CeedInt i=data.tid; i inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx] = *U; __syncthreads(); *V = 0.0; for (CeedInt i = 0; i < P1d; ++i) { *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction } __syncthreads(); } template inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx] = *U; __syncthreads(); *V = 0.0; for (CeedInt i = 0; i < Q1d; ++i) { *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction } __syncthreads(); } template inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp=0; comp(data, r_U+comp, c_B, r_V+comp); } } template inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp=0; comp(data, r_U+comp, c_B, r_V+comp); } } template inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp=0; comp(data, r_U+comp, c_G, r_V+comp); } } template inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp=0; comp(data, r_U+comp, c_G, r_V+comp); } } //**** // 2D template inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx+data.tidy*Q1d] = *U; __syncthreads(); *V = 0.0; for (CeedInt i = 0; i < P1d; ++i) { *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction } __syncthreads(); } template inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx+data.tidy*Q1d] = *U; __syncthreads(); *V = 0.0; for (CeedInt i = 0; i < P1d; ++i) { *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction } __syncthreads(); } template inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx+data.tidy*Q1d] = *U; __syncthreads(); *V = 0.0; if (data.tidy inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx+data.tidy*Q1d] = *U; __syncthreads(); *V = 0.0; if (data.tidx inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { data.slice[data.tidx+data.tidy*Q1d] = *U; __syncthreads(); if (data.tidx inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp=0; comp(data, r_U+comp, c_B, r_t); ContractY2d(data, r_t, c_B, r_V+comp); } } template inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp=0; comp(data, r_U+comp, c_B, r_t); ContractXTranspose2d(data, r_t, c_B, r_V+comp); } } template inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp=0; comp(data, r_U+comp, c_G, r_t); ContractY2d(data, r_t, c_B, r_V+comp+0*NCOMP); ContractX2d(data, r_U+comp, c_B, r_t); ContractY2d(data, r_t, c_G, r_V+comp+1*NCOMP); } } template inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp=0; comp(data, r_U+comp+0*NCOMP, c_B, r_t); ContractXTranspose2d(data, r_t, c_G, r_V+comp); ContractYTranspose2d(data, r_U+comp+1*NCOMP, c_G, r_t); ContractXTransposeAdd2d(data, r_t, c_B, r_V+comp); } } //**** // 3D template inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) { if (data.tidx inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; for (CeedInt comp = 0; comp < NCOMP; ++comp) { r_u[comp] = d_u[ind + nquads * comp]; } } template inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; for (CeedInt comp = 0; comp < NCOMP; ++comp) { r_u[comp] = d_u[ind * NCOMP + comp]; } } template inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) { const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; const CeedInt ind = node * strides[0] + elem * strides[2]; for (CeedInt comp = 0; comp < NCOMP; ++comp) { r_u[comp] = d_u[ind + comp * strides[1]]; } } template inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) { if (data.tidx inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P1d]; for (CeedInt i = 0; i < P1d; ++i) { r_B[i] = B[i + data.tidx*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); V[k] = 0.0; for (CeedInt i = 0; i < P1d; ++i) { V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction } __syncthreads(); } } template inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P1d]; for (CeedInt i = 0; i < P1d; ++i) { r_B[i] = B[i + data.tidy*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); V[k] = 0.0; for (CeedInt i = 0; i < P1d; ++i) { V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction } __syncthreads(); } } template inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < Q1d; ++k) { V[k] = 0.0; for (CeedInt i = 0; i < P1d; ++i) { V[k] += B[i + k*P1d] * U[i];//contract z direction } } } template inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < Q1d; ++k) { V[k] = 0.0; if (k inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q1d]; for (CeedInt i = 0; i < Q1d; ++i) { r_B[i] = B[data.tidy + i*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); V[k] = 0.0; if (data.tidy inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q1d]; for (CeedInt i = 0; i < Q1d; ++i) { r_B[i] = B[data.tidy + i*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); if (data.tidy inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q1d]; for (CeedInt i = 0; i < Q1d; ++i) { r_B[i] = B[data.tidx + i*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); V[k] = 0.0; if (data.tidx inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q1d]; for (CeedInt i = 0; i < Q1d; ++i) { r_B[i] = B[data.tidx + i*P1d]; } for (CeedInt k = 0; k < P1d; ++k) { data.slice[data.tidx+data.tidy*Q1d] = U[k]; __syncthreads(); if (data.tidx inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[Q1d]; CeedScalar r_t2[Q1d]; for (CeedInt comp=0; comp(data, r_U+comp*P1d, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_V+comp*Q1d); } } template inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[Q1d]; CeedScalar r_t2[Q1d]; for (CeedInt comp=0; comp(data, r_U+comp*Q1d, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); ContractTransposeX3d(data, r_t2, c_B, r_V+comp*P1d); } } template inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[Q1d]; CeedScalar r_t2[Q1d]; for (CeedInt comp=0; comp(data, r_U+comp*P1d, c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); ContractX3d(data, r_U+comp*P1d, c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); ContractZ3d(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); ContractX3d(data, r_U+comp*P1d, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); } } template inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[Q1d]; CeedScalar r_t2[Q1d]; for (CeedInt comp=0; comp(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); ContractTransposeX3d(data, r_t2, c_G, r_V+comp*P1d); ContractTransposeZ3d(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); ContractTransposeAddX3d(data, r_t2, c_B, r_V+comp*P1d); ContractTransposeZ3d(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); ContractTransposeAddX3d(data, r_t2, c_B, r_V+comp*P1d); } } template inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d]; __syncthreads(); // X derivative r_V[comp+0*NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) { r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) } // Y derivative r_V[comp+1*NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) { r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) } // Z derivative r_V[comp+2*NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) { r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative) } __syncthreads(); } } template inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { // X derivative data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) { r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) } __syncthreads(); // Y derivative data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) { r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) } __syncthreads(); // Z derivative for (CeedInt i = 0; i < Q1d; ++i) { r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative) } } } template inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { *w = qweight1d[data.tidx]; } template inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { *w = qweight1d[data.tidx]*qweight1d[data.tidy]; } template inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; for (CeedInt z = 0; z < Q1d; ++z) { w[z] = pw*qweight1d[z]; } } ); extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { using std::ostringstream; using std::string; int ierr; bool setupdone; ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); if (setupdone) return 0; Ceed ceed; ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); CeedOperator_Cuda_gen *data; ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); CeedQFunction qf; CeedQFunction_Cuda_gen *qf_data; ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); CeedChk(ierr); CeedOperatorField *opinputfields, *opoutputfields; ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); CeedChk(ierr); CeedQFunctionField *qfinputfields, *qfoutputfields; ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); CeedChk(ierr); CeedEvalMode emode; CeedInterlaceMode imode; CeedBasis basis; CeedBasis_Cuda_shared *basis_data; CeedElemRestriction Erestrict; CeedElemRestriction_Cuda_reg *restr_data; ostringstream code; string devFunctions(deviceFunctions); // Add atomicAdd function for old NVidia architectures struct cudaDeviceProp prop; Ceed delegate; CeedGetDelegate(ceed, &delegate); Ceed_Cuda *ceed_data; ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); if (prop.major<6){ code << atomicAdd; } code << devFunctions; string qFunction(qf_data->qFunctionSource); code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; code << "\n#define CeedPragmaSIMD\n"; code << qFunction; // Setup code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFieldsInt strides, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; // Input Evecs and Restriction for (CeedInt i = 0; i < numinputfields; i++) { ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); CeedChk(ierr); if (emode == CEED_EVAL_WEIGHT) { // Skip } else { code << "const CeedScalar* d_u" <dim = dim; data->Q1d = Q1d; for (CeedInt i = 0; i < numoutputfields; i++) { code << "CeedScalar* d_v"<1?"*Q1d":"")<<";\n"; code << "\n// Input field constants and basis data\n"; //Initialize constants, and matrices B and G for (CeedInt i = 0; i < numinputfields; i++) { code << "// -- Input field "<B.in[i] = basis_data->d_interp1d; code << "__shared__ double s_B_in_"<(data, B.in["<B.in[i] = basis_data->d_interp1d; code << "__shared__ double s_B_in_"<(data, B.in["<d_collograd1d) { data->G.in[i] = basis_data->d_collograd1d; code << "__shared__ double s_G_in_"<(data, G.in["<G.in[i] = basis_data->d_grad1d; code << "__shared__ double s_G_in_"<(data, G.in["<B.out[i] = basis_data->d_interp1d; code << " __shared__ double s_B_out_"<(data, B.out["<B.out[i] = basis_data->d_interp1d; code << "__shared__ double s_B_out_"<(data, B.out["<d_collograd1d) { data->G.out[i] = basis_data->d_collograd1d; code << "__shared__ double s_G_out_"<(data, G.out["<G.out[i] = basis_data->d_grad1d; code << "__shared__ double s_G_out_"<(data, G.out["<d_collograd1d)) { code << " CeedScalar r_u"<indices.in[i] = restr_data->d_ind; if (data->indices.in[i]) { ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); code << " // InterlaceMode: "<(data, nnodes_in_"<strides.in[i], 3 * sizeof(CeedInt)); CeedChk_Cu(ceed, (CUresult)ierr); ierr = cudaMemcpy(data->strides.in[i], strides, 3 * sizeof(CeedInt), cudaMemcpyHostToDevice); CeedChk_Cu(ceed, (CUresult)ierr); code << " // Strides: {"<(data, elem, strides.in["<d_collograd1d) { code << " CeedScalar* r_t"<(data, r_u"<d_collograd1d) { code << " CeedScalar r_t"<(data, r_u"<(data, r_u"<W = basis_data->d_qweight1d; code << " weight"<(data, W, r_t"<d_collograd1d) { //Accumulator for gradient slices code << " CeedScalar r_tt"<d_collograd1d) { code << "\n // Note: Collocated Gradient\n"; code << "#pragma unroll\n"; code << "for (CeedInt q=0; qindices.in[i] = restr_data->d_ind; if (data->indices.in[i]) { ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); code << " // InterlaceMode: "<(data, nquads_in_"<strides.in[i], 3 * sizeof(CeedInt)); CeedChk_Cu(ceed, (CUresult)ierr); ierr = cudaMemcpy(data->strides.in[i], strides, 3 * sizeof(CeedInt), cudaMemcpyHostToDevice); CeedChk_Cu(ceed, (CUresult)ierr); code << " // Strides: {"<(data, elem, q, strides.in["<(data, q, r_t"<qFunctionName); code << " "<d_collograd1d) { code << "1 "; }else{ code << "Q1d "; } code << ", in, out);\n"; if (basis_data->d_collograd1d) { code << "\n // Note: Collocated Gradient\n"; for (CeedInt i = 0; i < numoutputfields; i++) { code << " // -- Output field "<(data, q, r_qq"<(data, r_tt"<d_collograd1d) { code << " interpTranspose"<(data, r_tt"<(data, r_tt"<indices.out[i] = restr_data->d_ind; if (data->indices.out[i]) { ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); code << " // InterlaceMode: "<(data, nnodes_out_"<strides.out[i], 3 * sizeof(CeedInt)); CeedChk_Cu(ceed, (CUresult)ierr); ierr = cudaMemcpy(data->strides.out[i], strides, 3 * sizeof(CeedInt), cudaMemcpyHostToDevice); CeedChk_Cu(ceed, (CUresult)ierr); code << " // Strides: {"<(data, elem, strides.out["<module, 0); CeedChk(ierr); ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); CeedChk(ierr); ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); return 0; }