1*20cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 2*20cf1dd8SToby Isaac 3*20cf1dd8SToby Isaac #ifdef PETSC_HAVE_OPENCL 4*20cf1dd8SToby Isaac 5*20cf1dd8SToby Isaac PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 6*20cf1dd8SToby Isaac { 7*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 8*20cf1dd8SToby Isaac PetscErrorCode ierr; 9*20cf1dd8SToby Isaac 10*20cf1dd8SToby Isaac PetscFunctionBegin; 11*20cf1dd8SToby Isaac ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr); 12*20cf1dd8SToby Isaac ocl->queue_id = 0; 13*20cf1dd8SToby Isaac ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr); 14*20cf1dd8SToby Isaac ocl->ctx_id = 0; 15*20cf1dd8SToby Isaac ierr = PetscFree(ocl);CHKERRQ(ierr); 16*20cf1dd8SToby Isaac PetscFunctionReturn(0); 17*20cf1dd8SToby Isaac } 18*20cf1dd8SToby Isaac 19*20cf1dd8SToby Isaac #define STRING_ERROR_CHECK(MSG) do {CHKERRQ(ierr); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);} while(0) 20*20cf1dd8SToby Isaac enum {LAPLACIAN = 0, ELASTICITY = 1}; 21*20cf1dd8SToby Isaac 22*20cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 23*20cf1dd8SToby Isaac /* dim Number of spatial dimensions: 2 */ 24*20cf1dd8SToby Isaac /* N_b Number of basis functions: generated */ 25*20cf1dd8SToby Isaac /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 26*20cf1dd8SToby Isaac /* N_q Number of quadrature points: generated */ 27*20cf1dd8SToby Isaac /* N_{bs} Number of block cells LCM(N_b, N_q) */ 28*20cf1dd8SToby Isaac /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 29*20cf1dd8SToby Isaac /* N_{bl} Number of concurrent blocks generated */ 30*20cf1dd8SToby Isaac /* N_t Number of threads: N_{bl} * N_{bs} */ 31*20cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 32*20cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 33*20cf1dd8SToby Isaac /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 34*20cf1dd8SToby Isaac /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 35*20cf1dd8SToby Isaac /* N_{cb} Number of serial cell batches: input */ 36*20cf1dd8SToby Isaac /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 37*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 38*20cf1dd8SToby Isaac { 39*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 40*20cf1dd8SToby Isaac PetscQuadrature q; 41*20cf1dd8SToby Isaac char *string_tail = *string_buffer; 42*20cf1dd8SToby Isaac char *end_of_buffer = *string_buffer + buffer_length; 43*20cf1dd8SToby Isaac char float_str[] = "float", double_str[] = "double"; 44*20cf1dd8SToby Isaac char *numeric_str = &(float_str[0]); 45*20cf1dd8SToby Isaac PetscInt op = ocl->op; 46*20cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE; 47*20cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE; 48*20cf1dd8SToby Isaac PetscBool useFieldAux = useAux; 49*20cf1dd8SToby Isaac PetscBool useFieldDerAux= PETSC_FALSE; 50*20cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE; 51*20cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE; 52*20cf1dd8SToby Isaac const PetscReal *points, *weights; 53*20cf1dd8SToby Isaac PetscReal *basis, *basisDer; 54*20cf1dd8SToby Isaac PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 55*20cf1dd8SToby Isaac size_t count; 56*20cf1dd8SToby Isaac PetscErrorCode ierr; 57*20cf1dd8SToby Isaac 58*20cf1dd8SToby Isaac PetscFunctionBegin; 59*20cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 60*20cf1dd8SToby Isaac ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 61*20cf1dd8SToby Isaac ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr); 62*20cf1dd8SToby Isaac ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 63*20cf1dd8SToby Isaac ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 64*20cf1dd8SToby Isaac if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 65*20cf1dd8SToby Isaac N_t = N_b * N_c * N_q * N_bl; 66*20cf1dd8SToby Isaac /* Enable device extension for double precision */ 67*20cf1dd8SToby Isaac if (ocl->realType == PETSC_DOUBLE) { 68*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 69*20cf1dd8SToby Isaac "#if defined(cl_khr_fp64)\n" 70*20cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 71*20cf1dd8SToby Isaac "#elif defined(cl_amd_fp64)\n" 72*20cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 73*20cf1dd8SToby Isaac "#endif\n", 74*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 75*20cf1dd8SToby Isaac numeric_str = &(double_str[0]); 76*20cf1dd8SToby Isaac } 77*20cf1dd8SToby Isaac /* Kernel API */ 78*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 79*20cf1dd8SToby Isaac "\n" 80*20cf1dd8SToby Isaac "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 81*20cf1dd8SToby Isaac "{\n", 82*20cf1dd8SToby Isaac &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short"); 83*20cf1dd8SToby Isaac /* Quadrature */ 84*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 85*20cf1dd8SToby Isaac " /* Quadrature points\n" 86*20cf1dd8SToby Isaac " - (x1,y1,x2,y2,...) */\n" 87*20cf1dd8SToby Isaac " const %s points[%d] = {\n", 88*20cf1dd8SToby Isaac &count, numeric_str, N_q*dim);STRING_ERROR_CHECK("Message to short"); 89*20cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 90*20cf1dd8SToby Isaac for (d = 0; d < dim; ++d) { 91*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);STRING_ERROR_CHECK("Message to short"); 92*20cf1dd8SToby Isaac } 93*20cf1dd8SToby Isaac } 94*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 95*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 96*20cf1dd8SToby Isaac " /* Quadrature weights\n" 97*20cf1dd8SToby Isaac " - (v1,v2,...) */\n" 98*20cf1dd8SToby Isaac " const %s weights[%d] = {\n", 99*20cf1dd8SToby Isaac &count, numeric_str, N_q);STRING_ERROR_CHECK("Message to short"); 100*20cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 101*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);STRING_ERROR_CHECK("Message to short"); 102*20cf1dd8SToby Isaac } 103*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 104*20cf1dd8SToby Isaac /* Basis Functions */ 105*20cf1dd8SToby Isaac ierr = PetscFEGetDefaultTabulation(fem, &basis, &basisDer, NULL);CHKERRQ(ierr); 106*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 107*20cf1dd8SToby Isaac " /* Nodal basis function evaluations\n" 108*20cf1dd8SToby Isaac " - basis component is fastest varying, the basis function, then point */\n" 109*20cf1dd8SToby Isaac " const %s Basis[%d] = {\n", 110*20cf1dd8SToby Isaac &count, numeric_str, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 111*20cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 112*20cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) { 113*20cf1dd8SToby Isaac for (c = 0; c < N_c; ++c) { 114*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, basis[(p*N_b + b)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 115*20cf1dd8SToby Isaac } 116*20cf1dd8SToby Isaac } 117*20cf1dd8SToby Isaac } 118*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 119*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 120*20cf1dd8SToby Isaac "\n" 121*20cf1dd8SToby Isaac " /* Nodal basis function derivative evaluations,\n" 122*20cf1dd8SToby Isaac " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 123*20cf1dd8SToby Isaac " const %s%d BasisDerivatives[%d] = {\n", 124*20cf1dd8SToby Isaac &count, numeric_str, dim, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 125*20cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 126*20cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) { 127*20cf1dd8SToby Isaac for (c = 0; c < N_c; ++c) { 128*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 129*20cf1dd8SToby Isaac for (d = 0; d < dim; ++d) { 130*20cf1dd8SToby Isaac if (d > 0) { 131*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, basisDer[((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 132*20cf1dd8SToby Isaac } else { 133*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, basisDer[((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 134*20cf1dd8SToby Isaac } 135*20cf1dd8SToby Isaac } 136*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);STRING_ERROR_CHECK("Message to short"); 137*20cf1dd8SToby Isaac } 138*20cf1dd8SToby Isaac } 139*20cf1dd8SToby Isaac } 140*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 141*20cf1dd8SToby Isaac /* Sizes */ 142*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 143*20cf1dd8SToby Isaac " const int dim = %d; // The spatial dimension\n" 144*20cf1dd8SToby Isaac " const int N_bl = %d; // The number of concurrent blocks\n" 145*20cf1dd8SToby Isaac " const int N_b = %d; // The number of basis functions\n" 146*20cf1dd8SToby Isaac " const int N_comp = %d; // The number of basis function components\n" 147*20cf1dd8SToby Isaac " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 148*20cf1dd8SToby Isaac " const int N_q = %d; // The number of quadrature points\n" 149*20cf1dd8SToby Isaac " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 150*20cf1dd8SToby Isaac " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 151*20cf1dd8SToby Isaac " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 152*20cf1dd8SToby Isaac " const int N_sbc = N_bst / (N_q * N_comp);\n" 153*20cf1dd8SToby Isaac " const int N_sqc = N_bst / N_bt;\n" 154*20cf1dd8SToby Isaac " /*const int N_c = N_cb * N_bc;*/\n" 155*20cf1dd8SToby Isaac "\n" 156*20cf1dd8SToby Isaac " /* Calculated indices */\n" 157*20cf1dd8SToby Isaac " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 158*20cf1dd8SToby Isaac " const int tidx = get_local_id(0);\n" 159*20cf1dd8SToby Isaac " const int blidx = tidx / N_bst; // Block number for this thread\n" 160*20cf1dd8SToby Isaac " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 161*20cf1dd8SToby Isaac " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 162*20cf1dd8SToby Isaac " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 163*20cf1dd8SToby Isaac " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 164*20cf1dd8SToby Isaac " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 165*20cf1dd8SToby Isaac " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 166*20cf1dd8SToby Isaac " const int Goffset = gidx*N_cb*N_bc;\n", 167*20cf1dd8SToby Isaac &count, dim, N_bl, N_b, N_c, N_q);STRING_ERROR_CHECK("Message to short"); 168*20cf1dd8SToby Isaac /* Local memory */ 169*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 170*20cf1dd8SToby Isaac "\n" 171*20cf1dd8SToby Isaac " /* Quadrature data */\n" 172*20cf1dd8SToby Isaac " %s w; // $w_q$, Quadrature weight at $x_q$\n" 173*20cf1dd8SToby Isaac " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 174*20cf1dd8SToby Isaac " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 175*20cf1dd8SToby Isaac " /* Geometric data */\n" 176*20cf1dd8SToby Isaac " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 177*20cf1dd8SToby Isaac " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 178*20cf1dd8SToby Isaac &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t, 179*20cf1dd8SToby Isaac numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 180*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 181*20cf1dd8SToby Isaac " /* FEM data */\n" 182*20cf1dd8SToby Isaac " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 183*20cf1dd8SToby Isaac &count, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 184*20cf1dd8SToby Isaac if (useAux) { 185*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 186*20cf1dd8SToby Isaac " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", 187*20cf1dd8SToby Isaac &count, numeric_str, N_t);STRING_ERROR_CHECK("Message to short"); 188*20cf1dd8SToby Isaac } 189*20cf1dd8SToby Isaac if (useF0) { 190*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 191*20cf1dd8SToby Isaac " /* Intermediate calculations */\n" 192*20cf1dd8SToby Isaac " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 193*20cf1dd8SToby Isaac &count, numeric_str, N_t*N_q);STRING_ERROR_CHECK("Message to short"); 194*20cf1dd8SToby Isaac } 195*20cf1dd8SToby Isaac if (useF1) { 196*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 197*20cf1dd8SToby Isaac " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 198*20cf1dd8SToby Isaac &count, numeric_str, dim, N_t*N_q);STRING_ERROR_CHECK("Message to short"); 199*20cf1dd8SToby Isaac } 200*20cf1dd8SToby Isaac /* TODO: If using elasticity, put in mu/lambda coefficients */ 201*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 202*20cf1dd8SToby Isaac " /* Output data */\n" 203*20cf1dd8SToby Isaac " %s e_i; // Coefficient $e_i$ of the residual\n\n", 204*20cf1dd8SToby Isaac &count, numeric_str);STRING_ERROR_CHECK("Message to short"); 205*20cf1dd8SToby Isaac /* One-time loads */ 206*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 207*20cf1dd8SToby Isaac " /* These should be generated inline */\n" 208*20cf1dd8SToby Isaac " /* Load quadrature weights */\n" 209*20cf1dd8SToby Isaac " w = weights[qidx];\n" 210*20cf1dd8SToby Isaac " /* Load basis tabulation \\phi_i for this cell */\n" 211*20cf1dd8SToby Isaac " if (tidx < N_bt*N_q) {\n" 212*20cf1dd8SToby Isaac " phi_i[tidx] = Basis[tidx];\n" 213*20cf1dd8SToby Isaac " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 214*20cf1dd8SToby Isaac " }\n\n", 215*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 216*20cf1dd8SToby Isaac /* Batch loads */ 217*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 218*20cf1dd8SToby Isaac " for (int batch = 0; batch < N_cb; ++batch) {\n" 219*20cf1dd8SToby Isaac " /* Load geometry */\n" 220*20cf1dd8SToby Isaac " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 221*20cf1dd8SToby Isaac " for (int n = 0; n < dim*dim; ++n) {\n" 222*20cf1dd8SToby Isaac " const int offset = n*N_t;\n" 223*20cf1dd8SToby Isaac " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 224*20cf1dd8SToby Isaac " }\n" 225*20cf1dd8SToby Isaac " /* Load coefficients u_i for this cell */\n" 226*20cf1dd8SToby Isaac " for (int n = 0; n < N_bt; ++n) {\n" 227*20cf1dd8SToby Isaac " const int offset = n*N_t;\n" 228*20cf1dd8SToby Isaac " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 229*20cf1dd8SToby Isaac " }\n", 230*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 231*20cf1dd8SToby Isaac if (useAux) { 232*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 233*20cf1dd8SToby Isaac " /* Load coefficients a_i for this cell */\n" 234*20cf1dd8SToby Isaac " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 235*20cf1dd8SToby Isaac " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 236*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 237*20cf1dd8SToby Isaac } 238*20cf1dd8SToby Isaac /* Quadrature phase */ 239*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 240*20cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n" 241*20cf1dd8SToby Isaac "\n" 242*20cf1dd8SToby Isaac " /* Map coefficients to values at quadrature points */\n" 243*20cf1dd8SToby Isaac " for (int c = 0; c < N_sqc; ++c) {\n" 244*20cf1dd8SToby Isaac " const int cell = c*N_bl*N_b + blqidx;\n" 245*20cf1dd8SToby Isaac " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 246*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 247*20cf1dd8SToby Isaac if (useField) { 248*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 249*20cf1dd8SToby Isaac " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", 250*20cf1dd8SToby Isaac &count, numeric_str, N_c);STRING_ERROR_CHECK("Message to short"); 251*20cf1dd8SToby Isaac } 252*20cf1dd8SToby Isaac if (useFieldDer) { 253*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 254*20cf1dd8SToby Isaac " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", 255*20cf1dd8SToby Isaac &count, numeric_str, dim, N_c);STRING_ERROR_CHECK("Message to short"); 256*20cf1dd8SToby Isaac } 257*20cf1dd8SToby Isaac if (useFieldAux) { 258*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 259*20cf1dd8SToby Isaac " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", 260*20cf1dd8SToby Isaac &count, numeric_str, 1);STRING_ERROR_CHECK("Message to short"); 261*20cf1dd8SToby Isaac } 262*20cf1dd8SToby Isaac if (useFieldDerAux) { 263*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 264*20cf1dd8SToby Isaac " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", 265*20cf1dd8SToby Isaac &count, numeric_str, dim, 1);STRING_ERROR_CHECK("Message to short"); 266*20cf1dd8SToby Isaac } 267*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 268*20cf1dd8SToby Isaac "\n" 269*20cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n", 270*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 271*20cf1dd8SToby Isaac if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 272*20cf1dd8SToby Isaac if (useFieldDer) { 273*20cf1dd8SToby Isaac switch (dim) { 274*20cf1dd8SToby Isaac case 1: 275*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 276*20cf1dd8SToby Isaac case 2: 277*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 278*20cf1dd8SToby Isaac case 3: 279*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 280*20cf1dd8SToby Isaac } 281*20cf1dd8SToby Isaac } 282*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 283*20cf1dd8SToby Isaac " }\n", 284*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 285*20cf1dd8SToby Isaac if (useFieldAux) { 286*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short"); 287*20cf1dd8SToby Isaac } 288*20cf1dd8SToby Isaac if (useFieldDerAux) { 289*20cf1dd8SToby Isaac switch (dim) { 290*20cf1dd8SToby Isaac case 1: 291*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 292*20cf1dd8SToby Isaac case 2: 293*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 294*20cf1dd8SToby Isaac case 3: 295*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 296*20cf1dd8SToby Isaac } 297*20cf1dd8SToby Isaac } 298*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 299*20cf1dd8SToby Isaac " /* Get field and derivatives at this quadrature point */\n" 300*20cf1dd8SToby Isaac " for (int i = 0; i < N_b; ++i) {\n" 301*20cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n" 302*20cf1dd8SToby Isaac " const int b = i*N_comp+comp;\n" 303*20cf1dd8SToby Isaac " const int pidx = qidx*N_bt + b;\n" 304*20cf1dd8SToby Isaac " const int uidx = cell*N_bt + b;\n" 305*20cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n", 306*20cf1dd8SToby Isaac &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 307*20cf1dd8SToby Isaac if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 308*20cf1dd8SToby Isaac if (useFieldDer) { 309*20cf1dd8SToby Isaac switch (dim) { 310*20cf1dd8SToby Isaac case 2: 311*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 312*20cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 313*20cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 314*20cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 315*20cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 316*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 317*20cf1dd8SToby Isaac case 3: 318*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 319*20cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 320*20cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 321*20cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 322*20cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 323*20cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 324*20cf1dd8SToby Isaac " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 325*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 326*20cf1dd8SToby Isaac } 327*20cf1dd8SToby Isaac } 328*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 329*20cf1dd8SToby Isaac " }\n" 330*20cf1dd8SToby Isaac " }\n", 331*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 332*20cf1dd8SToby Isaac if (useFieldAux) { 333*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);STRING_ERROR_CHECK("Message to short"); 334*20cf1dd8SToby Isaac } 335*20cf1dd8SToby Isaac /* Calculate residual at quadrature points: Should be generated by an weak form egine */ 336*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 337*20cf1dd8SToby Isaac " /* Process values at quadrature points */\n", 338*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 339*20cf1dd8SToby Isaac switch (op) { 340*20cf1dd8SToby Isaac case LAPLACIAN: 341*20cf1dd8SToby Isaac if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 342*20cf1dd8SToby Isaac if (useF1) { 343*20cf1dd8SToby Isaac if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 344*20cf1dd8SToby Isaac else {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 345*20cf1dd8SToby Isaac } 346*20cf1dd8SToby Isaac break; 347*20cf1dd8SToby Isaac case ELASTICITY: 348*20cf1dd8SToby Isaac if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 349*20cf1dd8SToby Isaac if (useF1) { 350*20cf1dd8SToby Isaac switch (dim) { 351*20cf1dd8SToby Isaac case 2: 352*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 353*20cf1dd8SToby Isaac " switch (cidx) {\n" 354*20cf1dd8SToby Isaac " case 0:\n" 355*20cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 356*20cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 357*20cf1dd8SToby Isaac " break;\n" 358*20cf1dd8SToby Isaac " case 1:\n" 359*20cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 360*20cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 361*20cf1dd8SToby Isaac " }\n", 362*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 363*20cf1dd8SToby Isaac case 3: 364*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 365*20cf1dd8SToby Isaac " switch (cidx) {\n" 366*20cf1dd8SToby Isaac " case 0:\n" 367*20cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 368*20cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 369*20cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 370*20cf1dd8SToby Isaac " break;\n" 371*20cf1dd8SToby Isaac " case 1:\n" 372*20cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 373*20cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 374*20cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 375*20cf1dd8SToby Isaac " break;\n" 376*20cf1dd8SToby Isaac " case 2:\n" 377*20cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 378*20cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 379*20cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 380*20cf1dd8SToby Isaac " }\n", 381*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 382*20cf1dd8SToby Isaac }} 383*20cf1dd8SToby Isaac break; 384*20cf1dd8SToby Isaac default: 385*20cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 386*20cf1dd8SToby Isaac } 387*20cf1dd8SToby Isaac if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");} 388*20cf1dd8SToby Isaac if (useF1) { 389*20cf1dd8SToby Isaac switch (dim) { 390*20cf1dd8SToby Isaac case 1: 391*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 392*20cf1dd8SToby Isaac case 2: 393*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 394*20cf1dd8SToby Isaac case 3: 395*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 396*20cf1dd8SToby Isaac } 397*20cf1dd8SToby Isaac } 398*20cf1dd8SToby Isaac /* Thread transpose */ 399*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 400*20cf1dd8SToby Isaac " }\n\n" 401*20cf1dd8SToby Isaac " /* ==== TRANSPOSE THREADS ==== */\n" 402*20cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 403*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 404*20cf1dd8SToby Isaac /* Basis phase */ 405*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 406*20cf1dd8SToby Isaac " /* Map values at quadrature points to coefficients */\n" 407*20cf1dd8SToby Isaac " for (int c = 0; c < N_sbc; ++c) {\n" 408*20cf1dd8SToby Isaac " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 409*20cf1dd8SToby Isaac "\n" 410*20cf1dd8SToby Isaac " e_i = 0.0;\n" 411*20cf1dd8SToby Isaac " for (int q = 0; q < N_q; ++q) {\n" 412*20cf1dd8SToby Isaac " const int pidx = q*N_bt + bidx;\n" 413*20cf1dd8SToby Isaac " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 414*20cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n", 415*20cf1dd8SToby Isaac &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 416*20cf1dd8SToby Isaac 417*20cf1dd8SToby Isaac if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 418*20cf1dd8SToby Isaac if (useF1) { 419*20cf1dd8SToby Isaac switch (dim) { 420*20cf1dd8SToby Isaac case 2: 421*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 422*20cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 423*20cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n" 424*20cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 425*20cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n", 426*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 427*20cf1dd8SToby Isaac case 3: 428*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 429*20cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 430*20cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n" 431*20cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 432*20cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n" 433*20cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 434*20cf1dd8SToby Isaac " e_i += realSpaceDer.z*f_1[fidx].z;\n", 435*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short");break; 436*20cf1dd8SToby Isaac } 437*20cf1dd8SToby Isaac } 438*20cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 439*20cf1dd8SToby Isaac " }\n" 440*20cf1dd8SToby Isaac " /* Write element vector for N_{cbc} cells at a time */\n" 441*20cf1dd8SToby Isaac " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 442*20cf1dd8SToby Isaac " }\n" 443*20cf1dd8SToby Isaac " /* ==== Could do one write per batch ==== */\n" 444*20cf1dd8SToby Isaac " }\n" 445*20cf1dd8SToby Isaac " return;\n" 446*20cf1dd8SToby Isaac "}\n", 447*20cf1dd8SToby Isaac &count);STRING_ERROR_CHECK("Message to short"); 448*20cf1dd8SToby Isaac PetscFunctionReturn(0); 449*20cf1dd8SToby Isaac } 450*20cf1dd8SToby Isaac 451*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 452*20cf1dd8SToby Isaac { 453*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 454*20cf1dd8SToby Isaac PetscInt dim, N_bl; 455*20cf1dd8SToby Isaac PetscBool flg; 456*20cf1dd8SToby Isaac char *buffer; 457*20cf1dd8SToby Isaac size_t len; 458*20cf1dd8SToby Isaac char errMsg[8192]; 459*20cf1dd8SToby Isaac cl_int ierr2; 460*20cf1dd8SToby Isaac PetscErrorCode ierr; 461*20cf1dd8SToby Isaac 462*20cf1dd8SToby Isaac PetscFunctionBegin; 463*20cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 464*20cf1dd8SToby Isaac ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr); 465*20cf1dd8SToby Isaac ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr); 466*20cf1dd8SToby Isaac ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr); 467*20cf1dd8SToby Isaac ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr); 468*20cf1dd8SToby Isaac if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);} 469*20cf1dd8SToby Isaac len = strlen(buffer); 470*20cf1dd8SToby Isaac *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &ierr2);CHKERRQ(ierr2); 471*20cf1dd8SToby Isaac ierr = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 472*20cf1dd8SToby Isaac if (ierr != CL_SUCCESS) { 473*20cf1dd8SToby Isaac ierr = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);CHKERRQ(ierr); 474*20cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 475*20cf1dd8SToby Isaac } 476*20cf1dd8SToby Isaac ierr = PetscFree(buffer);CHKERRQ(ierr); 477*20cf1dd8SToby Isaac *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr); 478*20cf1dd8SToby Isaac PetscFunctionReturn(0); 479*20cf1dd8SToby Isaac } 480*20cf1dd8SToby Isaac 481*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 482*20cf1dd8SToby Isaac { 483*20cf1dd8SToby Isaac const PetscInt Nblocks = N/blockSize; 484*20cf1dd8SToby Isaac 485*20cf1dd8SToby Isaac PetscFunctionBegin; 486*20cf1dd8SToby Isaac if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 487*20cf1dd8SToby Isaac *z = 1; 488*20cf1dd8SToby Isaac for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 489*20cf1dd8SToby Isaac *y = Nblocks / *x; 490*20cf1dd8SToby Isaac if (*x * *y == Nblocks) break; 491*20cf1dd8SToby Isaac } 492*20cf1dd8SToby Isaac if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize); 493*20cf1dd8SToby Isaac PetscFunctionReturn(0); 494*20cf1dd8SToby Isaac } 495*20cf1dd8SToby Isaac 496*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 497*20cf1dd8SToby Isaac { 498*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 499*20cf1dd8SToby Isaac PetscStageLog stageLog; 500*20cf1dd8SToby Isaac PetscEventPerfLog eventLog = NULL; 501*20cf1dd8SToby Isaac PetscInt stage; 502*20cf1dd8SToby Isaac PetscErrorCode ierr; 503*20cf1dd8SToby Isaac 504*20cf1dd8SToby Isaac PetscFunctionBegin; 505*20cf1dd8SToby Isaac ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); 506*20cf1dd8SToby Isaac ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); 507*20cf1dd8SToby Isaac ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); 508*20cf1dd8SToby Isaac /* Log performance info */ 509*20cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].count++; 510*20cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].time += time; 511*20cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].flops += flops; 512*20cf1dd8SToby Isaac PetscFunctionReturn(0); 513*20cf1dd8SToby Isaac } 514*20cf1dd8SToby Isaac 515*20cf1dd8SToby Isaac PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscFE fem, PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom, 516*20cf1dd8SToby Isaac const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 517*20cf1dd8SToby Isaac { 518*20cf1dd8SToby Isaac /* Nbc = batchSize */ 519*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 520*20cf1dd8SToby Isaac PetscPointFunc f0_func; 521*20cf1dd8SToby Isaac PetscPointFunc f1_func; 522*20cf1dd8SToby Isaac PetscQuadrature q; 523*20cf1dd8SToby Isaac PetscInt dim, qNc; 524*20cf1dd8SToby Isaac PetscInt N_b; /* The number of basis functions */ 525*20cf1dd8SToby Isaac PetscInt N_comp; /* The number of basis function components */ 526*20cf1dd8SToby Isaac PetscInt N_bt; /* The total number of scalar basis functions */ 527*20cf1dd8SToby Isaac PetscInt N_q; /* The number of quadrature points */ 528*20cf1dd8SToby Isaac PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 529*20cf1dd8SToby Isaac PetscInt N_t; /* The number of threads, N_bst * N_bl */ 530*20cf1dd8SToby Isaac PetscInt N_bl; /* The number of blocks */ 531*20cf1dd8SToby Isaac PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 532*20cf1dd8SToby Isaac PetscInt N_cb; /* The number of batches */ 533*20cf1dd8SToby Isaac PetscInt numFlops, f0Flops = 0, f1Flops = 0; 534*20cf1dd8SToby Isaac PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 535*20cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE; 536*20cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE; 537*20cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE; 538*20cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE; 539*20cf1dd8SToby Isaac /* OpenCL variables */ 540*20cf1dd8SToby Isaac cl_program ocl_prog; 541*20cf1dd8SToby Isaac cl_kernel ocl_kernel; 542*20cf1dd8SToby Isaac cl_event ocl_ev; /* The event for tracking kernel execution */ 543*20cf1dd8SToby Isaac cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 544*20cf1dd8SToby Isaac cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 545*20cf1dd8SToby Isaac cl_mem o_jacobianInverses, o_jacobianDeterminants; 546*20cf1dd8SToby Isaac cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 547*20cf1dd8SToby Isaac float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 548*20cf1dd8SToby Isaac double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 549*20cf1dd8SToby Isaac PetscReal *r_invJ = NULL, *r_detJ = NULL; 550*20cf1dd8SToby Isaac void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 551*20cf1dd8SToby Isaac size_t local_work_size[3], global_work_size[3]; 552*20cf1dd8SToby Isaac size_t realSize, x, y, z; 553*20cf1dd8SToby Isaac const PetscReal *points, *weights; 554*20cf1dd8SToby Isaac PetscErrorCode ierr; 555*20cf1dd8SToby Isaac 556*20cf1dd8SToby Isaac PetscFunctionBegin; 557*20cf1dd8SToby Isaac if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);} 558*20cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 559*20cf1dd8SToby Isaac ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 560*20cf1dd8SToby Isaac ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 561*20cf1dd8SToby Isaac if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 562*20cf1dd8SToby Isaac ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 563*20cf1dd8SToby Isaac ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr); 564*20cf1dd8SToby Isaac ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr); 565*20cf1dd8SToby Isaac ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr); 566*20cf1dd8SToby Isaac N_bt = N_b*N_comp; 567*20cf1dd8SToby Isaac N_bst = N_bt*N_q; 568*20cf1dd8SToby Isaac N_t = N_bst*N_bl; 569*20cf1dd8SToby Isaac if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 570*20cf1dd8SToby Isaac /* Calculate layout */ 571*20cf1dd8SToby Isaac if (Ne % (N_cb*N_bc)) { /* Remainder cells */ 572*20cf1dd8SToby Isaac ierr = PetscFEIntegrateResidual_Basic(fem, prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr); 573*20cf1dd8SToby Isaac PetscFunctionReturn(0); 574*20cf1dd8SToby Isaac } 575*20cf1dd8SToby Isaac ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr); 576*20cf1dd8SToby Isaac local_work_size[0] = N_bc*N_comp; 577*20cf1dd8SToby Isaac local_work_size[1] = 1; 578*20cf1dd8SToby Isaac local_work_size[2] = 1; 579*20cf1dd8SToby Isaac global_work_size[0] = x * local_work_size[0]; 580*20cf1dd8SToby Isaac global_work_size[1] = y * local_work_size[1]; 581*20cf1dd8SToby Isaac global_work_size[2] = z * local_work_size[2]; 582*20cf1dd8SToby Isaac ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr); 583*20cf1dd8SToby Isaac ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb); 584*20cf1dd8SToby Isaac /* Generate code */ 585*20cf1dd8SToby Isaac if (probAux) { 586*20cf1dd8SToby Isaac PetscSpace P; 587*20cf1dd8SToby Isaac PetscInt NfAux, order, f; 588*20cf1dd8SToby Isaac 589*20cf1dd8SToby Isaac ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr); 590*20cf1dd8SToby Isaac for (f = 0; f < NfAux; ++f) { 591*20cf1dd8SToby Isaac PetscFE feAux; 592*20cf1dd8SToby Isaac 593*20cf1dd8SToby Isaac ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr); 594*20cf1dd8SToby Isaac ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr); 595*20cf1dd8SToby Isaac ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr); 596*20cf1dd8SToby Isaac if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 597*20cf1dd8SToby Isaac } 598*20cf1dd8SToby Isaac } 599*20cf1dd8SToby Isaac ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr); 600*20cf1dd8SToby Isaac /* Create buffers on the device and send data over */ 601*20cf1dd8SToby Isaac ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr); 602*20cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) { 603*20cf1dd8SToby Isaac switch (ocl->realType) { 604*20cf1dd8SToby Isaac case PETSC_FLOAT: 605*20cf1dd8SToby Isaac { 606*20cf1dd8SToby Isaac PetscInt c, b, d; 607*20cf1dd8SToby Isaac 608*20cf1dd8SToby Isaac ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr); 609*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 610*20cf1dd8SToby Isaac f_detJ[c] = (float) cgeom[c].detJ; 611*20cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 612*20cf1dd8SToby Isaac f_invJ[c*dim*dim+d] = (float) cgeom[c].invJ[d]; 613*20cf1dd8SToby Isaac } 614*20cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 615*20cf1dd8SToby Isaac f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b]; 616*20cf1dd8SToby Isaac } 617*20cf1dd8SToby Isaac } 618*20cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */ 619*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 620*20cf1dd8SToby Isaac f_coeffAux[c] = (float) coefficientsAux[c]; 621*20cf1dd8SToby Isaac } 622*20cf1dd8SToby Isaac } 623*20cf1dd8SToby Isaac oclCoeff = (void *) f_coeff; 624*20cf1dd8SToby Isaac if (coefficientsAux) { 625*20cf1dd8SToby Isaac oclCoeffAux = (void *) f_coeffAux; 626*20cf1dd8SToby Isaac } else { 627*20cf1dd8SToby Isaac oclCoeffAux = NULL; 628*20cf1dd8SToby Isaac } 629*20cf1dd8SToby Isaac oclInvJ = (void *) f_invJ; 630*20cf1dd8SToby Isaac oclDetJ = (void *) f_detJ; 631*20cf1dd8SToby Isaac } 632*20cf1dd8SToby Isaac break; 633*20cf1dd8SToby Isaac case PETSC_DOUBLE: 634*20cf1dd8SToby Isaac { 635*20cf1dd8SToby Isaac PetscInt c, b, d; 636*20cf1dd8SToby Isaac 637*20cf1dd8SToby Isaac ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr); 638*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 639*20cf1dd8SToby Isaac d_detJ[c] = (double) cgeom[c].detJ; 640*20cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 641*20cf1dd8SToby Isaac d_invJ[c*dim*dim+d] = (double) cgeom[c].invJ[d]; 642*20cf1dd8SToby Isaac } 643*20cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 644*20cf1dd8SToby Isaac d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b]; 645*20cf1dd8SToby Isaac } 646*20cf1dd8SToby Isaac } 647*20cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */ 648*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 649*20cf1dd8SToby Isaac d_coeffAux[c] = (double) coefficientsAux[c]; 650*20cf1dd8SToby Isaac } 651*20cf1dd8SToby Isaac } 652*20cf1dd8SToby Isaac oclCoeff = (void *) d_coeff; 653*20cf1dd8SToby Isaac if (coefficientsAux) { 654*20cf1dd8SToby Isaac oclCoeffAux = (void *) d_coeffAux; 655*20cf1dd8SToby Isaac } else { 656*20cf1dd8SToby Isaac oclCoeffAux = NULL; 657*20cf1dd8SToby Isaac } 658*20cf1dd8SToby Isaac oclInvJ = (void *) d_invJ; 659*20cf1dd8SToby Isaac oclDetJ = (void *) d_detJ; 660*20cf1dd8SToby Isaac } 661*20cf1dd8SToby Isaac break; 662*20cf1dd8SToby Isaac default: 663*20cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 664*20cf1dd8SToby Isaac } 665*20cf1dd8SToby Isaac } else { 666*20cf1dd8SToby Isaac PetscInt c, d; 667*20cf1dd8SToby Isaac 668*20cf1dd8SToby Isaac ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr); 669*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 670*20cf1dd8SToby Isaac r_detJ[c] = cgeom[c].detJ; 671*20cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 672*20cf1dd8SToby Isaac r_invJ[c*dim*dim+d] = cgeom[c].invJ[d]; 673*20cf1dd8SToby Isaac } 674*20cf1dd8SToby Isaac } 675*20cf1dd8SToby Isaac oclCoeff = (void *) coefficients; 676*20cf1dd8SToby Isaac oclCoeffAux = (void *) coefficientsAux; 677*20cf1dd8SToby Isaac oclInvJ = (void *) r_invJ; 678*20cf1dd8SToby Isaac oclDetJ = (void *) r_detJ; 679*20cf1dd8SToby Isaac } 680*20cf1dd8SToby Isaac o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr);CHKERRQ(ierr); 681*20cf1dd8SToby Isaac if (coefficientsAux) { 682*20cf1dd8SToby Isaac o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr); 683*20cf1dd8SToby Isaac } else { 684*20cf1dd8SToby Isaac o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr); 685*20cf1dd8SToby Isaac } 686*20cf1dd8SToby Isaac o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr);CHKERRQ(ierr); 687*20cf1dd8SToby Isaac o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr);CHKERRQ(ierr); 688*20cf1dd8SToby Isaac o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr);CHKERRQ(ierr); 689*20cf1dd8SToby Isaac /* Kernel launch */ 690*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr); 691*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr); 692*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr); 693*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr); 694*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr); 695*20cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr); 696*20cf1dd8SToby Isaac ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr); 697*20cf1dd8SToby Isaac /* Read data back from device */ 698*20cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) { 699*20cf1dd8SToby Isaac switch (ocl->realType) { 700*20cf1dd8SToby Isaac case PETSC_FLOAT: 701*20cf1dd8SToby Isaac { 702*20cf1dd8SToby Isaac float *elem; 703*20cf1dd8SToby Isaac PetscInt c, b; 704*20cf1dd8SToby Isaac 705*20cf1dd8SToby Isaac ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr); 706*20cf1dd8SToby Isaac ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 707*20cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 708*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 709*20cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 710*20cf1dd8SToby Isaac elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 711*20cf1dd8SToby Isaac } 712*20cf1dd8SToby Isaac } 713*20cf1dd8SToby Isaac ierr = PetscFree(elem);CHKERRQ(ierr); 714*20cf1dd8SToby Isaac } 715*20cf1dd8SToby Isaac break; 716*20cf1dd8SToby Isaac case PETSC_DOUBLE: 717*20cf1dd8SToby Isaac { 718*20cf1dd8SToby Isaac double *elem; 719*20cf1dd8SToby Isaac PetscInt c, b; 720*20cf1dd8SToby Isaac 721*20cf1dd8SToby Isaac ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr); 722*20cf1dd8SToby Isaac ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 723*20cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 724*20cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 725*20cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 726*20cf1dd8SToby Isaac elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 727*20cf1dd8SToby Isaac } 728*20cf1dd8SToby Isaac } 729*20cf1dd8SToby Isaac ierr = PetscFree(elem);CHKERRQ(ierr); 730*20cf1dd8SToby Isaac } 731*20cf1dd8SToby Isaac break; 732*20cf1dd8SToby Isaac default: 733*20cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 734*20cf1dd8SToby Isaac } 735*20cf1dd8SToby Isaac } else { 736*20cf1dd8SToby Isaac ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr); 737*20cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr); 738*20cf1dd8SToby Isaac } 739*20cf1dd8SToby Isaac /* Log performance */ 740*20cf1dd8SToby Isaac ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr); 741*20cf1dd8SToby Isaac ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr); 742*20cf1dd8SToby Isaac f0Flops = 0; 743*20cf1dd8SToby Isaac switch (ocl->op) { 744*20cf1dd8SToby Isaac case LAPLACIAN: 745*20cf1dd8SToby Isaac f1Flops = useAux ? dim : 0;break; 746*20cf1dd8SToby Isaac case ELASTICITY: 747*20cf1dd8SToby Isaac f1Flops = 2*dim*dim;break; 748*20cf1dd8SToby Isaac } 749*20cf1dd8SToby Isaac numFlops = Ne*( 750*20cf1dd8SToby Isaac N_q*( 751*20cf1dd8SToby Isaac N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0)) 752*20cf1dd8SToby Isaac /*+ 753*20cf1dd8SToby Isaac N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 754*20cf1dd8SToby Isaac + 755*20cf1dd8SToby Isaac N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0))) 756*20cf1dd8SToby Isaac + 757*20cf1dd8SToby Isaac N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0))); 758*20cf1dd8SToby Isaac ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr); 759*20cf1dd8SToby Isaac /* Cleanup */ 760*20cf1dd8SToby Isaac ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr); 761*20cf1dd8SToby Isaac ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr); 762*20cf1dd8SToby Isaac ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr); 763*20cf1dd8SToby Isaac ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr); 764*20cf1dd8SToby Isaac ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr); 765*20cf1dd8SToby Isaac ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr); 766*20cf1dd8SToby Isaac ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr); 767*20cf1dd8SToby Isaac PetscFunctionReturn(0); 768*20cf1dd8SToby Isaac } 769*20cf1dd8SToby Isaac 770*20cf1dd8SToby Isaac PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 771*20cf1dd8SToby Isaac { 772*20cf1dd8SToby Isaac PetscFunctionBegin; 773*20cf1dd8SToby Isaac fem->ops->setfromoptions = NULL; 774*20cf1dd8SToby Isaac fem->ops->setup = PetscFESetUp_Basic; 775*20cf1dd8SToby Isaac fem->ops->view = NULL; 776*20cf1dd8SToby Isaac fem->ops->destroy = PetscFEDestroy_OpenCL; 777*20cf1dd8SToby Isaac fem->ops->getdimension = PetscFEGetDimension_Basic; 778*20cf1dd8SToby Isaac fem->ops->gettabulation = PetscFEGetTabulation_Basic; 779*20cf1dd8SToby Isaac fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 780*20cf1dd8SToby Isaac fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */; 781*20cf1dd8SToby Isaac fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */; 782*20cf1dd8SToby Isaac fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 783*20cf1dd8SToby Isaac PetscFunctionReturn(0); 784*20cf1dd8SToby Isaac } 785*20cf1dd8SToby Isaac 786*20cf1dd8SToby Isaac /*MC 787*20cf1dd8SToby Isaac PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation 788*20cf1dd8SToby Isaac 789*20cf1dd8SToby Isaac Level: intermediate 790*20cf1dd8SToby Isaac 791*20cf1dd8SToby Isaac .seealso: PetscFEType, PetscFECreate(), PetscFESetType() 792*20cf1dd8SToby Isaac M*/ 793*20cf1dd8SToby Isaac 794*20cf1dd8SToby Isaac PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 795*20cf1dd8SToby Isaac { 796*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl; 797*20cf1dd8SToby Isaac cl_uint num_platforms; 798*20cf1dd8SToby Isaac cl_platform_id platform_ids[42]; 799*20cf1dd8SToby Isaac cl_uint num_devices; 800*20cf1dd8SToby Isaac cl_device_id device_ids[42]; 801*20cf1dd8SToby Isaac cl_int ierr2; 802*20cf1dd8SToby Isaac PetscErrorCode ierr; 803*20cf1dd8SToby Isaac 804*20cf1dd8SToby Isaac PetscFunctionBegin; 805*20cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 806*20cf1dd8SToby Isaac ierr = PetscNewLog(fem,&ocl);CHKERRQ(ierr); 807*20cf1dd8SToby Isaac fem->data = ocl; 808*20cf1dd8SToby Isaac 809*20cf1dd8SToby Isaac /* Init Platform */ 810*20cf1dd8SToby Isaac ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr); 811*20cf1dd8SToby Isaac if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found."); 812*20cf1dd8SToby Isaac ocl->pf_id = platform_ids[0]; 813*20cf1dd8SToby Isaac /* Init Device */ 814*20cf1dd8SToby Isaac ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr); 815*20cf1dd8SToby Isaac if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found."); 816*20cf1dd8SToby Isaac ocl->dev_id = device_ids[0]; 817*20cf1dd8SToby Isaac /* Create context with one command queue */ 818*20cf1dd8SToby Isaac ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &ierr2);CHKERRQ(ierr2); 819*20cf1dd8SToby Isaac ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr2);CHKERRQ(ierr2); 820*20cf1dd8SToby Isaac /* Types */ 821*20cf1dd8SToby Isaac ocl->realType = PETSC_FLOAT; 822*20cf1dd8SToby Isaac /* Register events */ 823*20cf1dd8SToby Isaac ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr); 824*20cf1dd8SToby Isaac /* Equation handling */ 825*20cf1dd8SToby Isaac ocl->op = LAPLACIAN; 826*20cf1dd8SToby Isaac 827*20cf1dd8SToby Isaac ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr); 828*20cf1dd8SToby Isaac PetscFunctionReturn(0); 829*20cf1dd8SToby Isaac } 830*20cf1dd8SToby Isaac 831*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 832*20cf1dd8SToby Isaac { 833*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 834*20cf1dd8SToby Isaac 835*20cf1dd8SToby Isaac PetscFunctionBegin; 836*20cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 837*20cf1dd8SToby Isaac ocl->realType = realType; 838*20cf1dd8SToby Isaac PetscFunctionReturn(0); 839*20cf1dd8SToby Isaac } 840*20cf1dd8SToby Isaac 841*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 842*20cf1dd8SToby Isaac { 843*20cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 844*20cf1dd8SToby Isaac 845*20cf1dd8SToby Isaac PetscFunctionBegin; 846*20cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 847*20cf1dd8SToby Isaac PetscValidPointer(realType, 2); 848*20cf1dd8SToby Isaac *realType = ocl->realType; 849*20cf1dd8SToby Isaac PetscFunctionReturn(0); 850*20cf1dd8SToby Isaac } 851*20cf1dd8SToby Isaac 852*20cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */ 853*20cf1dd8SToby Isaac 854