120cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 220cf1dd8SToby Isaac 37be5e748SToby Isaac #if defined(PETSC_HAVE_OPENCL) 420cf1dd8SToby Isaac 52b99622eSMatthew G. Knepley static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 620cf1dd8SToby Isaac { 720cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 820cf1dd8SToby Isaac PetscErrorCode ierr; 920cf1dd8SToby Isaac 1020cf1dd8SToby Isaac PetscFunctionBegin; 1120cf1dd8SToby Isaac ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr); 1220cf1dd8SToby Isaac ocl->queue_id = 0; 1320cf1dd8SToby Isaac ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr); 1420cf1dd8SToby Isaac ocl->ctx_id = 0; 1520cf1dd8SToby Isaac ierr = PetscFree(ocl);CHKERRQ(ierr); 1620cf1dd8SToby Isaac PetscFunctionReturn(0); 1720cf1dd8SToby Isaac } 1820cf1dd8SToby Isaac 192e58f0efSBarry Smith #define CHKERRSTR(err) do {CHKERRQ(err); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB,"Buffer overflow");} while (0) 2020cf1dd8SToby Isaac enum {LAPLACIAN = 0, ELASTICITY = 1}; 2120cf1dd8SToby Isaac 2220cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 2320cf1dd8SToby Isaac /* dim Number of spatial dimensions: 2 */ 2420cf1dd8SToby Isaac /* N_b Number of basis functions: generated */ 2520cf1dd8SToby Isaac /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 2620cf1dd8SToby Isaac /* N_q Number of quadrature points: generated */ 2720cf1dd8SToby Isaac /* N_{bs} Number of block cells LCM(N_b, N_q) */ 2820cf1dd8SToby Isaac /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 2920cf1dd8SToby Isaac /* N_{bl} Number of concurrent blocks generated */ 3020cf1dd8SToby Isaac /* N_t Number of threads: N_{bl} * N_{bs} */ 3120cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 3220cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 3320cf1dd8SToby Isaac /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 3420cf1dd8SToby Isaac /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 3520cf1dd8SToby Isaac /* N_{cb} Number of serial cell batches: input */ 3620cf1dd8SToby Isaac /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 372b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 3820cf1dd8SToby Isaac { 3920cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 4020cf1dd8SToby Isaac PetscQuadrature q; 4120cf1dd8SToby Isaac char *string_tail = *string_buffer; 4220cf1dd8SToby Isaac char *end_of_buffer = *string_buffer + buffer_length; 4320cf1dd8SToby Isaac char float_str[] = "float", double_str[] = "double"; 4420cf1dd8SToby Isaac char *numeric_str = &(float_str[0]); 4520cf1dd8SToby Isaac PetscInt op = ocl->op; 4620cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE; 4720cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE; 4820cf1dd8SToby Isaac PetscBool useFieldAux = useAux; 4920cf1dd8SToby Isaac PetscBool useFieldDerAux= PETSC_FALSE; 5020cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE; 5120cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE; 5220cf1dd8SToby Isaac const PetscReal *points, *weights; 53ef0bb6c7SMatthew G. Knepley PetscTabulation T; 5420cf1dd8SToby Isaac PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 5520cf1dd8SToby Isaac size_t count; 5620cf1dd8SToby Isaac PetscErrorCode ierr; 5720cf1dd8SToby Isaac 5820cf1dd8SToby Isaac PetscFunctionBegin; 5920cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 6020cf1dd8SToby Isaac ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 6120cf1dd8SToby Isaac ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr); 6220cf1dd8SToby Isaac ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 6320cf1dd8SToby Isaac ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 6420cf1dd8SToby Isaac if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 6520cf1dd8SToby Isaac N_t = N_b * N_c * N_q * N_bl; 6620cf1dd8SToby Isaac /* Enable device extension for double precision */ 6720cf1dd8SToby Isaac if (ocl->realType == PETSC_DOUBLE) { 6820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 6920cf1dd8SToby Isaac "#if defined(cl_khr_fp64)\n" 7020cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 7120cf1dd8SToby Isaac "#elif defined(cl_amd_fp64)\n" 7220cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 7320cf1dd8SToby Isaac "#endif\n", 742e58f0efSBarry Smith &count);CHKERRSTR(ierr); 7520cf1dd8SToby Isaac numeric_str = &(double_str[0]); 7620cf1dd8SToby Isaac } 7720cf1dd8SToby Isaac /* Kernel API */ 7820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 7920cf1dd8SToby Isaac "\n" 8020cf1dd8SToby Isaac "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 8120cf1dd8SToby Isaac "{\n", 822e58f0efSBarry Smith &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);CHKERRSTR(ierr); 8320cf1dd8SToby Isaac /* Quadrature */ 8420cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 8520cf1dd8SToby Isaac " /* Quadrature points\n" 8620cf1dd8SToby Isaac " - (x1,y1,x2,y2,...) */\n" 8720cf1dd8SToby Isaac " const %s points[%d] = {\n", 882e58f0efSBarry Smith &count, numeric_str, N_q*dim);CHKERRSTR(ierr); 8920cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 9020cf1dd8SToby Isaac for (d = 0; d < dim; ++d) { 912e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);CHKERRSTR(ierr); 9220cf1dd8SToby Isaac } 9320cf1dd8SToby Isaac } 942e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr); 9520cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 9620cf1dd8SToby Isaac " /* Quadrature weights\n" 9720cf1dd8SToby Isaac " - (v1,v2,...) */\n" 9820cf1dd8SToby Isaac " const %s weights[%d] = {\n", 992e58f0efSBarry Smith &count, numeric_str, N_q);CHKERRSTR(ierr); 10020cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 1012e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);CHKERRSTR(ierr); 10220cf1dd8SToby Isaac } 1032e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr); 10420cf1dd8SToby Isaac /* Basis Functions */ 105f9244615SMatthew G. Knepley ierr = PetscFEGetCellTabulation(fem, 1, &T);CHKERRQ(ierr); 10620cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 10720cf1dd8SToby Isaac " /* Nodal basis function evaluations\n" 10820cf1dd8SToby Isaac " - basis component is fastest varying, the basis function, then point */\n" 10920cf1dd8SToby Isaac " const %s Basis[%d] = {\n", 1102e58f0efSBarry Smith &count, numeric_str, N_q*N_b*N_c);CHKERRSTR(ierr); 11120cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 11220cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) { 11320cf1dd8SToby Isaac for (c = 0; c < N_c; ++c) { 1142e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);CHKERRSTR(ierr); 11520cf1dd8SToby Isaac } 11620cf1dd8SToby Isaac } 11720cf1dd8SToby Isaac } 1182e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr); 11920cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 12020cf1dd8SToby Isaac "\n" 12120cf1dd8SToby Isaac " /* Nodal basis function derivative evaluations,\n" 12220cf1dd8SToby Isaac " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 12320cf1dd8SToby Isaac " const %s%d BasisDerivatives[%d] = {\n", 1242e58f0efSBarry Smith &count, numeric_str, dim, N_q*N_b*N_c);CHKERRSTR(ierr); 12520cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) { 12620cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) { 12720cf1dd8SToby Isaac for (c = 0; c < N_c; ++c) { 1282e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);CHKERRSTR(ierr); 12920cf1dd8SToby Isaac for (d = 0; d < dim; ++d) { 13020cf1dd8SToby Isaac if (d > 0) { 1312e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr); 13220cf1dd8SToby Isaac } else { 1332e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr); 13420cf1dd8SToby Isaac } 13520cf1dd8SToby Isaac } 1362e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);CHKERRSTR(ierr); 13720cf1dd8SToby Isaac } 13820cf1dd8SToby Isaac } 13920cf1dd8SToby Isaac } 1402e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr); 14120cf1dd8SToby Isaac /* Sizes */ 14220cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 14320cf1dd8SToby Isaac " const int dim = %d; // The spatial dimension\n" 14420cf1dd8SToby Isaac " const int N_bl = %d; // The number of concurrent blocks\n" 14520cf1dd8SToby Isaac " const int N_b = %d; // The number of basis functions\n" 14620cf1dd8SToby Isaac " const int N_comp = %d; // The number of basis function components\n" 14720cf1dd8SToby Isaac " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 14820cf1dd8SToby Isaac " const int N_q = %d; // The number of quadrature points\n" 14920cf1dd8SToby Isaac " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 15020cf1dd8SToby Isaac " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 15120cf1dd8SToby Isaac " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 15220cf1dd8SToby Isaac " const int N_sbc = N_bst / (N_q * N_comp);\n" 15320cf1dd8SToby Isaac " const int N_sqc = N_bst / N_bt;\n" 15420cf1dd8SToby Isaac " /*const int N_c = N_cb * N_bc;*/\n" 15520cf1dd8SToby Isaac "\n" 15620cf1dd8SToby Isaac " /* Calculated indices */\n" 15720cf1dd8SToby Isaac " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 15820cf1dd8SToby Isaac " const int tidx = get_local_id(0);\n" 15920cf1dd8SToby Isaac " const int blidx = tidx / N_bst; // Block number for this thread\n" 16020cf1dd8SToby Isaac " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 16120cf1dd8SToby Isaac " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 16220cf1dd8SToby Isaac " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 16320cf1dd8SToby Isaac " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 16420cf1dd8SToby Isaac " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 16520cf1dd8SToby Isaac " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 16620cf1dd8SToby Isaac " const int Goffset = gidx*N_cb*N_bc;\n", 1672e58f0efSBarry Smith &count, dim, N_bl, N_b, N_c, N_q);CHKERRSTR(ierr); 16820cf1dd8SToby Isaac /* Local memory */ 16920cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 17020cf1dd8SToby Isaac "\n" 17120cf1dd8SToby Isaac " /* Quadrature data */\n" 17220cf1dd8SToby Isaac " %s w; // $w_q$, Quadrature weight at $x_q$\n" 17320cf1dd8SToby Isaac " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 17420cf1dd8SToby Isaac " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 17520cf1dd8SToby Isaac " /* Geometric data */\n" 17620cf1dd8SToby Isaac " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 17720cf1dd8SToby Isaac " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 17820cf1dd8SToby Isaac &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t, 1792e58f0efSBarry Smith numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr); 18020cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 18120cf1dd8SToby Isaac " /* FEM data */\n" 18220cf1dd8SToby Isaac " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 1832e58f0efSBarry Smith &count, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr); 18420cf1dd8SToby Isaac if (useAux) { 18520cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 18620cf1dd8SToby Isaac " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", 1872e58f0efSBarry Smith &count, numeric_str, N_t);CHKERRSTR(ierr); 18820cf1dd8SToby Isaac } 18920cf1dd8SToby Isaac if (useF0) { 19020cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 19120cf1dd8SToby Isaac " /* Intermediate calculations */\n" 19220cf1dd8SToby Isaac " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 1932e58f0efSBarry Smith &count, numeric_str, N_t*N_q);CHKERRSTR(ierr); 19420cf1dd8SToby Isaac } 19520cf1dd8SToby Isaac if (useF1) { 19620cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 19720cf1dd8SToby Isaac " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 1982e58f0efSBarry Smith &count, numeric_str, dim, N_t*N_q);CHKERRSTR(ierr); 19920cf1dd8SToby Isaac } 20020cf1dd8SToby Isaac /* TODO: If using elasticity, put in mu/lambda coefficients */ 20120cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 20220cf1dd8SToby Isaac " /* Output data */\n" 20320cf1dd8SToby Isaac " %s e_i; // Coefficient $e_i$ of the residual\n\n", 2042e58f0efSBarry Smith &count, numeric_str);CHKERRSTR(ierr); 20520cf1dd8SToby Isaac /* One-time loads */ 20620cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 20720cf1dd8SToby Isaac " /* These should be generated inline */\n" 20820cf1dd8SToby Isaac " /* Load quadrature weights */\n" 20920cf1dd8SToby Isaac " w = weights[qidx];\n" 21020cf1dd8SToby Isaac " /* Load basis tabulation \\phi_i for this cell */\n" 21120cf1dd8SToby Isaac " if (tidx < N_bt*N_q) {\n" 21220cf1dd8SToby Isaac " phi_i[tidx] = Basis[tidx];\n" 21320cf1dd8SToby Isaac " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 21420cf1dd8SToby Isaac " }\n\n", 2152e58f0efSBarry Smith &count);CHKERRSTR(ierr); 21620cf1dd8SToby Isaac /* Batch loads */ 21720cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 21820cf1dd8SToby Isaac " for (int batch = 0; batch < N_cb; ++batch) {\n" 21920cf1dd8SToby Isaac " /* Load geometry */\n" 22020cf1dd8SToby Isaac " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 22120cf1dd8SToby Isaac " for (int n = 0; n < dim*dim; ++n) {\n" 22220cf1dd8SToby Isaac " const int offset = n*N_t;\n" 22320cf1dd8SToby Isaac " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 22420cf1dd8SToby Isaac " }\n" 22520cf1dd8SToby Isaac " /* Load coefficients u_i for this cell */\n" 22620cf1dd8SToby Isaac " for (int n = 0; n < N_bt; ++n) {\n" 22720cf1dd8SToby Isaac " const int offset = n*N_t;\n" 22820cf1dd8SToby Isaac " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 22920cf1dd8SToby Isaac " }\n", 2302e58f0efSBarry Smith &count);CHKERRSTR(ierr); 23120cf1dd8SToby Isaac if (useAux) { 23220cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 23320cf1dd8SToby Isaac " /* Load coefficients a_i for this cell */\n" 23420cf1dd8SToby Isaac " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 23520cf1dd8SToby Isaac " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 2362e58f0efSBarry Smith &count);CHKERRSTR(ierr); 23720cf1dd8SToby Isaac } 23820cf1dd8SToby Isaac /* Quadrature phase */ 23920cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 24020cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n" 24120cf1dd8SToby Isaac "\n" 24220cf1dd8SToby Isaac " /* Map coefficients to values at quadrature points */\n" 24320cf1dd8SToby Isaac " for (int c = 0; c < N_sqc; ++c) {\n" 24420cf1dd8SToby Isaac " const int cell = c*N_bl*N_b + blqidx;\n" 24520cf1dd8SToby Isaac " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 2462e58f0efSBarry Smith &count);CHKERRSTR(ierr); 24720cf1dd8SToby Isaac if (useField) { 24820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 24920cf1dd8SToby Isaac " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", 2502e58f0efSBarry Smith &count, numeric_str, N_c);CHKERRSTR(ierr); 25120cf1dd8SToby Isaac } 25220cf1dd8SToby Isaac if (useFieldDer) { 25320cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 25420cf1dd8SToby Isaac " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", 2552e58f0efSBarry Smith &count, numeric_str, dim, N_c);CHKERRSTR(ierr); 25620cf1dd8SToby Isaac } 25720cf1dd8SToby Isaac if (useFieldAux) { 25820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 25920cf1dd8SToby Isaac " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", 2602e58f0efSBarry Smith &count, numeric_str, 1);CHKERRSTR(ierr); 26120cf1dd8SToby Isaac } 26220cf1dd8SToby Isaac if (useFieldDerAux) { 26320cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 26420cf1dd8SToby Isaac " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", 2652e58f0efSBarry Smith &count, numeric_str, dim, 1);CHKERRSTR(ierr); 26620cf1dd8SToby Isaac } 26720cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 26820cf1dd8SToby Isaac "\n" 26920cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n", 2702e58f0efSBarry Smith &count);CHKERRSTR(ierr); 2712e58f0efSBarry Smith if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);CHKERRSTR(ierr);} 27220cf1dd8SToby Isaac if (useFieldDer) { 27320cf1dd8SToby Isaac switch (dim) { 27420cf1dd8SToby Isaac case 1: 2752e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);CHKERRSTR(ierr);break; 27620cf1dd8SToby Isaac case 2: 2772e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);CHKERRSTR(ierr);break; 27820cf1dd8SToby Isaac case 3: 2792e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);CHKERRSTR(ierr);break; 28020cf1dd8SToby Isaac } 28120cf1dd8SToby Isaac } 28220cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 28320cf1dd8SToby Isaac " }\n", 2842e58f0efSBarry Smith &count);CHKERRSTR(ierr); 28520cf1dd8SToby Isaac if (useFieldAux) { 2862e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);CHKERRSTR(ierr); 28720cf1dd8SToby Isaac } 28820cf1dd8SToby Isaac if (useFieldDerAux) { 28920cf1dd8SToby Isaac switch (dim) { 29020cf1dd8SToby Isaac case 1: 2912e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);CHKERRSTR(ierr);break; 29220cf1dd8SToby Isaac case 2: 2932e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);CHKERRSTR(ierr);break; 29420cf1dd8SToby Isaac case 3: 2952e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);CHKERRSTR(ierr);break; 29620cf1dd8SToby Isaac } 29720cf1dd8SToby Isaac } 29820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 29920cf1dd8SToby Isaac " /* Get field and derivatives at this quadrature point */\n" 30020cf1dd8SToby Isaac " for (int i = 0; i < N_b; ++i) {\n" 30120cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n" 30220cf1dd8SToby Isaac " const int b = i*N_comp+comp;\n" 30320cf1dd8SToby Isaac " const int pidx = qidx*N_bt + b;\n" 30420cf1dd8SToby Isaac " const int uidx = cell*N_bt + b;\n" 30520cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n", 3062e58f0efSBarry Smith &count, numeric_str, dim);CHKERRSTR(ierr); 3072e58f0efSBarry Smith if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);CHKERRSTR(ierr);} 30820cf1dd8SToby Isaac if (useFieldDer) { 30920cf1dd8SToby Isaac switch (dim) { 31020cf1dd8SToby Isaac case 2: 31120cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 31220cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 31320cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 31420cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 31520cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 3162e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 31720cf1dd8SToby Isaac case 3: 31820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 31920cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 32020cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 32120cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 32220cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 32320cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 32420cf1dd8SToby Isaac " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 3252e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 32620cf1dd8SToby Isaac } 32720cf1dd8SToby Isaac } 32820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 32920cf1dd8SToby Isaac " }\n" 33020cf1dd8SToby Isaac " }\n", 3312e58f0efSBarry Smith &count);CHKERRSTR(ierr); 33220cf1dd8SToby Isaac if (useFieldAux) { 3332e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);CHKERRSTR(ierr); 33420cf1dd8SToby Isaac } 33520cf1dd8SToby Isaac /* Calculate residual at quadrature points: Should be generated by an weak form egine */ 33620cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 33720cf1dd8SToby Isaac " /* Process values at quadrature points */\n", 3382e58f0efSBarry Smith &count);CHKERRSTR(ierr); 33920cf1dd8SToby Isaac switch (op) { 34020cf1dd8SToby Isaac case LAPLACIAN: 3412e58f0efSBarry Smith if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);} 34220cf1dd8SToby Isaac if (useF1) { 3432e58f0efSBarry Smith if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);CHKERRSTR(ierr);} 3442e58f0efSBarry Smith else {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);CHKERRSTR(ierr);} 34520cf1dd8SToby Isaac } 34620cf1dd8SToby Isaac break; 34720cf1dd8SToby Isaac case ELASTICITY: 3482e58f0efSBarry Smith if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);} 34920cf1dd8SToby Isaac if (useF1) { 35020cf1dd8SToby Isaac switch (dim) { 35120cf1dd8SToby Isaac case 2: 35220cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 35320cf1dd8SToby Isaac " switch (cidx) {\n" 35420cf1dd8SToby Isaac " case 0:\n" 35520cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 35620cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 35720cf1dd8SToby Isaac " break;\n" 35820cf1dd8SToby Isaac " case 1:\n" 35920cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 36020cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 36120cf1dd8SToby Isaac " }\n", 3622e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 36320cf1dd8SToby Isaac case 3: 36420cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 36520cf1dd8SToby Isaac " switch (cidx) {\n" 36620cf1dd8SToby Isaac " case 0:\n" 36720cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 36820cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 36920cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 37020cf1dd8SToby Isaac " break;\n" 37120cf1dd8SToby Isaac " case 1:\n" 37220cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 37320cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 37420cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 37520cf1dd8SToby Isaac " break;\n" 37620cf1dd8SToby Isaac " case 2:\n" 37720cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 37820cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 37920cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 38020cf1dd8SToby Isaac " }\n", 3812e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 38220cf1dd8SToby Isaac }} 38320cf1dd8SToby Isaac break; 38420cf1dd8SToby Isaac default: 385ea13f565SStefano Zampini SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 38620cf1dd8SToby Isaac } 3872e58f0efSBarry Smith if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);} 38820cf1dd8SToby Isaac if (useF1) { 38920cf1dd8SToby Isaac switch (dim) { 39020cf1dd8SToby Isaac case 1: 3912e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break; 39220cf1dd8SToby Isaac case 2: 3932e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break; 39420cf1dd8SToby Isaac case 3: 3952e58f0efSBarry Smith ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break; 39620cf1dd8SToby Isaac } 39720cf1dd8SToby Isaac } 39820cf1dd8SToby Isaac /* Thread transpose */ 39920cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 40020cf1dd8SToby Isaac " }\n\n" 40120cf1dd8SToby Isaac " /* ==== TRANSPOSE THREADS ==== */\n" 40220cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 4032e58f0efSBarry Smith &count);CHKERRSTR(ierr); 40420cf1dd8SToby Isaac /* Basis phase */ 40520cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 40620cf1dd8SToby Isaac " /* Map values at quadrature points to coefficients */\n" 40720cf1dd8SToby Isaac " for (int c = 0; c < N_sbc; ++c) {\n" 40820cf1dd8SToby Isaac " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 40920cf1dd8SToby Isaac "\n" 41020cf1dd8SToby Isaac " e_i = 0.0;\n" 41120cf1dd8SToby Isaac " for (int q = 0; q < N_q; ++q) {\n" 41220cf1dd8SToby Isaac " const int pidx = q*N_bt + bidx;\n" 41320cf1dd8SToby Isaac " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 41420cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n", 4152e58f0efSBarry Smith &count, numeric_str, dim);CHKERRSTR(ierr); 41620cf1dd8SToby Isaac 4172e58f0efSBarry Smith if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);CHKERRSTR(ierr);} 41820cf1dd8SToby Isaac if (useF1) { 41920cf1dd8SToby Isaac switch (dim) { 42020cf1dd8SToby Isaac case 2: 42120cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 42220cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 42320cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n" 42420cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 42520cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n", 4262e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 42720cf1dd8SToby Isaac case 3: 42820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 42920cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 43020cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n" 43120cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 43220cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n" 43320cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 43420cf1dd8SToby Isaac " e_i += realSpaceDer.z*f_1[fidx].z;\n", 4352e58f0efSBarry Smith &count);CHKERRSTR(ierr);break; 43620cf1dd8SToby Isaac } 43720cf1dd8SToby Isaac } 43820cf1dd8SToby Isaac ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 43920cf1dd8SToby Isaac " }\n" 44020cf1dd8SToby Isaac " /* Write element vector for N_{cbc} cells at a time */\n" 44120cf1dd8SToby Isaac " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 44220cf1dd8SToby Isaac " }\n" 44320cf1dd8SToby Isaac " /* ==== Could do one write per batch ==== */\n" 44420cf1dd8SToby Isaac " }\n" 44520cf1dd8SToby Isaac " return;\n" 44620cf1dd8SToby Isaac "}\n", 4472e58f0efSBarry Smith &count);CHKERRSTR(ierr); 44820cf1dd8SToby Isaac PetscFunctionReturn(0); 44920cf1dd8SToby Isaac } 45020cf1dd8SToby Isaac 4512b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 45220cf1dd8SToby Isaac { 45320cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 45420cf1dd8SToby Isaac PetscInt dim, N_bl; 45520cf1dd8SToby Isaac PetscBool flg; 45620cf1dd8SToby Isaac char *buffer; 45720cf1dd8SToby Isaac size_t len; 45820cf1dd8SToby Isaac char errMsg[8192]; 4592da392ccSBarry Smith cl_int err; 46020cf1dd8SToby Isaac PetscErrorCode ierr; 46120cf1dd8SToby Isaac 46220cf1dd8SToby Isaac PetscFunctionBegin; 46320cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 46420cf1dd8SToby Isaac ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr); 46520cf1dd8SToby Isaac ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr); 46620cf1dd8SToby Isaac ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr); 46720cf1dd8SToby Isaac ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr); 46820cf1dd8SToby Isaac if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);} 4692da392ccSBarry Smith ierr = PetscStrlen(buffer,&len);CHKERRQ(ierr); 4702da392ccSBarry Smith *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);CHKERRQ(err); 4712da392ccSBarry Smith err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 4722da392ccSBarry Smith if (err != CL_SUCCESS) { 473*2f613bf5SBarry Smith err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL); 47420cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 47520cf1dd8SToby Isaac } 47620cf1dd8SToby Isaac ierr = PetscFree(buffer);CHKERRQ(ierr); 477*2f613bf5SBarry Smith *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr); 47820cf1dd8SToby Isaac PetscFunctionReturn(0); 47920cf1dd8SToby Isaac } 48020cf1dd8SToby Isaac 4812b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 48220cf1dd8SToby Isaac { 48320cf1dd8SToby Isaac const PetscInt Nblocks = N/blockSize; 48420cf1dd8SToby Isaac 48520cf1dd8SToby Isaac PetscFunctionBegin; 48620cf1dd8SToby Isaac if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 48720cf1dd8SToby Isaac *z = 1; 488623c9f23SSatish Balay *y = 1; 48920cf1dd8SToby Isaac for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 49020cf1dd8SToby Isaac *y = Nblocks / *x; 491526996e7SStefano Zampini if (*x * *y == (size_t)Nblocks) break; 49220cf1dd8SToby Isaac } 493adf5291fSStefano Zampini if (*x * *y != (size_t)Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %D with block size %D", N, blockSize); 49420cf1dd8SToby Isaac PetscFunctionReturn(0); 49520cf1dd8SToby Isaac } 49620cf1dd8SToby Isaac 4972b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 49820cf1dd8SToby Isaac { 49920cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 50020cf1dd8SToby Isaac PetscStageLog stageLog; 50120cf1dd8SToby Isaac PetscEventPerfLog eventLog = NULL; 502afbfd3a7SStefano Zampini int stage; 50320cf1dd8SToby Isaac PetscErrorCode ierr; 50420cf1dd8SToby Isaac 50520cf1dd8SToby Isaac PetscFunctionBegin; 50620cf1dd8SToby Isaac ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); 50720cf1dd8SToby Isaac ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); 50820cf1dd8SToby Isaac ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); 50920cf1dd8SToby Isaac /* Log performance info */ 51020cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].count++; 51120cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].time += time; 51220cf1dd8SToby Isaac eventLog->eventInfo[ocl->residualEvent].flops += flops; 51320cf1dd8SToby Isaac PetscFunctionReturn(0); 51420cf1dd8SToby Isaac } 51520cf1dd8SToby Isaac 51606ad1575SMatthew G. Knepley static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, 51720cf1dd8SToby Isaac const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 51820cf1dd8SToby Isaac { 51920cf1dd8SToby Isaac /* Nbc = batchSize */ 520849189efSMatthew G. Knepley PetscFE fem; 521849189efSMatthew G. Knepley PetscFE_OpenCL *ocl; 52220cf1dd8SToby Isaac PetscPointFunc f0_func; 52320cf1dd8SToby Isaac PetscPointFunc f1_func; 52420cf1dd8SToby Isaac PetscQuadrature q; 52520cf1dd8SToby Isaac PetscInt dim, qNc; 52620cf1dd8SToby Isaac PetscInt N_b; /* The number of basis functions */ 52720cf1dd8SToby Isaac PetscInt N_comp; /* The number of basis function components */ 52820cf1dd8SToby Isaac PetscInt N_bt; /* The total number of scalar basis functions */ 52920cf1dd8SToby Isaac PetscInt N_q; /* The number of quadrature points */ 53020cf1dd8SToby Isaac PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 53120cf1dd8SToby Isaac PetscInt N_t; /* The number of threads, N_bst * N_bl */ 53220cf1dd8SToby Isaac PetscInt N_bl; /* The number of blocks */ 53320cf1dd8SToby Isaac PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 53420cf1dd8SToby Isaac PetscInt N_cb; /* The number of batches */ 5356528b96dSMatthew G. Knepley const PetscInt field = key.field; 53620cf1dd8SToby Isaac PetscInt numFlops, f0Flops = 0, f1Flops = 0; 53720cf1dd8SToby Isaac PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 53820cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE; 53920cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE; 54020cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE; 54120cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE; 54220cf1dd8SToby Isaac /* OpenCL variables */ 54320cf1dd8SToby Isaac cl_program ocl_prog; 54420cf1dd8SToby Isaac cl_kernel ocl_kernel; 54520cf1dd8SToby Isaac cl_event ocl_ev; /* The event for tracking kernel execution */ 54620cf1dd8SToby Isaac cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 54720cf1dd8SToby Isaac cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 54820cf1dd8SToby Isaac cl_mem o_jacobianInverses, o_jacobianDeterminants; 54920cf1dd8SToby Isaac cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 55020cf1dd8SToby Isaac float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 55120cf1dd8SToby Isaac double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 55220cf1dd8SToby Isaac PetscReal *r_invJ = NULL, *r_detJ = NULL; 55320cf1dd8SToby Isaac void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 55420cf1dd8SToby Isaac size_t local_work_size[3], global_work_size[3]; 55520cf1dd8SToby Isaac size_t realSize, x, y, z; 55620cf1dd8SToby Isaac const PetscReal *points, *weights; 55720cf1dd8SToby Isaac PetscErrorCode ierr; 55820cf1dd8SToby Isaac 55920cf1dd8SToby Isaac PetscFunctionBegin; 560849189efSMatthew G. Knepley ierr = PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);CHKERRQ(ierr); 561849189efSMatthew G. Knepley ocl = (PetscFE_OpenCL *) fem->data; 56220cf1dd8SToby Isaac if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);} 56320cf1dd8SToby Isaac ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 56420cf1dd8SToby Isaac ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 56520cf1dd8SToby Isaac ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 56620cf1dd8SToby Isaac if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 56720cf1dd8SToby Isaac ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 56820cf1dd8SToby Isaac ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr); 56920cf1dd8SToby Isaac ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr); 57020cf1dd8SToby Isaac ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr); 57120cf1dd8SToby Isaac N_bt = N_b*N_comp; 57220cf1dd8SToby Isaac N_bst = N_bt*N_q; 57320cf1dd8SToby Isaac N_t = N_bst*N_bl; 57420cf1dd8SToby Isaac if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 57520cf1dd8SToby Isaac /* Calculate layout */ 57620cf1dd8SToby Isaac if (Ne % (N_cb*N_bc)) { /* Remainder cells */ 5776528b96dSMatthew G. Knepley ierr = PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr); 57820cf1dd8SToby Isaac PetscFunctionReturn(0); 57920cf1dd8SToby Isaac } 58020cf1dd8SToby Isaac ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr); 58120cf1dd8SToby Isaac local_work_size[0] = N_bc*N_comp; 58220cf1dd8SToby Isaac local_work_size[1] = 1; 58320cf1dd8SToby Isaac local_work_size[2] = 1; 58420cf1dd8SToby Isaac global_work_size[0] = x * local_work_size[0]; 58520cf1dd8SToby Isaac global_work_size[1] = y * local_work_size[1]; 58620cf1dd8SToby Isaac global_work_size[2] = z * local_work_size[2]; 58720cf1dd8SToby Isaac ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr); 5881e1ea65dSPierre Jolivet ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);CHKERRQ(ierr); 58920cf1dd8SToby Isaac /* Generate code */ 59020cf1dd8SToby Isaac if (probAux) { 59120cf1dd8SToby Isaac PetscSpace P; 59220cf1dd8SToby Isaac PetscInt NfAux, order, f; 59320cf1dd8SToby Isaac 59420cf1dd8SToby Isaac ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr); 59520cf1dd8SToby Isaac for (f = 0; f < NfAux; ++f) { 59620cf1dd8SToby Isaac PetscFE feAux; 59720cf1dd8SToby Isaac 59820cf1dd8SToby Isaac ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr); 59920cf1dd8SToby Isaac ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr); 60020cf1dd8SToby Isaac ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr); 60120cf1dd8SToby Isaac if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 60220cf1dd8SToby Isaac } 60320cf1dd8SToby Isaac } 60420cf1dd8SToby Isaac ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr); 60520cf1dd8SToby Isaac /* Create buffers on the device and send data over */ 60620cf1dd8SToby Isaac ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr); 6077be5e748SToby Isaac if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now"); 60820cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) { 60920cf1dd8SToby Isaac switch (ocl->realType) { 61020cf1dd8SToby Isaac case PETSC_FLOAT: 61120cf1dd8SToby Isaac { 61220cf1dd8SToby Isaac PetscInt c, b, d; 61320cf1dd8SToby Isaac 61420cf1dd8SToby Isaac ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr); 61520cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 6167be5e748SToby Isaac f_detJ[c] = (float) cgeom->detJ[c]; 61720cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 6187be5e748SToby Isaac f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d]; 61920cf1dd8SToby Isaac } 62020cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 62120cf1dd8SToby Isaac f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b]; 62220cf1dd8SToby Isaac } 62320cf1dd8SToby Isaac } 62420cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */ 62520cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 62620cf1dd8SToby Isaac f_coeffAux[c] = (float) coefficientsAux[c]; 62720cf1dd8SToby Isaac } 62820cf1dd8SToby Isaac } 62920cf1dd8SToby Isaac oclCoeff = (void *) f_coeff; 63020cf1dd8SToby Isaac if (coefficientsAux) { 63120cf1dd8SToby Isaac oclCoeffAux = (void *) f_coeffAux; 63220cf1dd8SToby Isaac } else { 63320cf1dd8SToby Isaac oclCoeffAux = NULL; 63420cf1dd8SToby Isaac } 63520cf1dd8SToby Isaac oclInvJ = (void *) f_invJ; 63620cf1dd8SToby Isaac oclDetJ = (void *) f_detJ; 63720cf1dd8SToby Isaac } 63820cf1dd8SToby Isaac break; 63920cf1dd8SToby Isaac case PETSC_DOUBLE: 64020cf1dd8SToby Isaac { 64120cf1dd8SToby Isaac PetscInt c, b, d; 64220cf1dd8SToby Isaac 64320cf1dd8SToby Isaac ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr); 64420cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 6457be5e748SToby Isaac d_detJ[c] = (double) cgeom->detJ[c]; 64620cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 6477be5e748SToby Isaac d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d]; 64820cf1dd8SToby Isaac } 64920cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 65020cf1dd8SToby Isaac d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b]; 65120cf1dd8SToby Isaac } 65220cf1dd8SToby Isaac } 65320cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */ 65420cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 65520cf1dd8SToby Isaac d_coeffAux[c] = (double) coefficientsAux[c]; 65620cf1dd8SToby Isaac } 65720cf1dd8SToby Isaac } 65820cf1dd8SToby Isaac oclCoeff = (void *) d_coeff; 65920cf1dd8SToby Isaac if (coefficientsAux) { 66020cf1dd8SToby Isaac oclCoeffAux = (void *) d_coeffAux; 66120cf1dd8SToby Isaac } else { 66220cf1dd8SToby Isaac oclCoeffAux = NULL; 66320cf1dd8SToby Isaac } 66420cf1dd8SToby Isaac oclInvJ = (void *) d_invJ; 66520cf1dd8SToby Isaac oclDetJ = (void *) d_detJ; 66620cf1dd8SToby Isaac } 66720cf1dd8SToby Isaac break; 66820cf1dd8SToby Isaac default: 66920cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 67020cf1dd8SToby Isaac } 67120cf1dd8SToby Isaac } else { 67220cf1dd8SToby Isaac PetscInt c, d; 67320cf1dd8SToby Isaac 67420cf1dd8SToby Isaac ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr); 67520cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 6767be5e748SToby Isaac r_detJ[c] = cgeom->detJ[c]; 67720cf1dd8SToby Isaac for (d = 0; d < dim*dim; ++d) { 6787be5e748SToby Isaac r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d]; 67920cf1dd8SToby Isaac } 68020cf1dd8SToby Isaac } 68120cf1dd8SToby Isaac oclCoeff = (void *) coefficients; 68220cf1dd8SToby Isaac oclCoeffAux = (void *) coefficientsAux; 68320cf1dd8SToby Isaac oclInvJ = (void *) r_invJ; 68420cf1dd8SToby Isaac oclDetJ = (void *) r_detJ; 68520cf1dd8SToby Isaac } 686*2f613bf5SBarry Smith o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr); 68720cf1dd8SToby Isaac if (coefficientsAux) { 688*2f613bf5SBarry Smith o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr); 68920cf1dd8SToby Isaac } else { 690*2f613bf5SBarry Smith o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr); 69120cf1dd8SToby Isaac } 692*2f613bf5SBarry Smith o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr); 693*2f613bf5SBarry Smith o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr); 694*2f613bf5SBarry Smith o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr); 69520cf1dd8SToby Isaac /* Kernel launch */ 69620cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr); 69720cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr); 69820cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr); 69920cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr); 70020cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr); 70120cf1dd8SToby Isaac ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr); 70220cf1dd8SToby Isaac ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr); 70320cf1dd8SToby Isaac /* Read data back from device */ 70420cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) { 70520cf1dd8SToby Isaac switch (ocl->realType) { 70620cf1dd8SToby Isaac case PETSC_FLOAT: 70720cf1dd8SToby Isaac { 70820cf1dd8SToby Isaac float *elem; 70920cf1dd8SToby Isaac PetscInt c, b; 71020cf1dd8SToby Isaac 71120cf1dd8SToby Isaac ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr); 71220cf1dd8SToby Isaac ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 71320cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 71420cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 71520cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 71620cf1dd8SToby Isaac elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 71720cf1dd8SToby Isaac } 71820cf1dd8SToby Isaac } 71920cf1dd8SToby Isaac ierr = PetscFree(elem);CHKERRQ(ierr); 72020cf1dd8SToby Isaac } 72120cf1dd8SToby Isaac break; 72220cf1dd8SToby Isaac case PETSC_DOUBLE: 72320cf1dd8SToby Isaac { 72420cf1dd8SToby Isaac double *elem; 72520cf1dd8SToby Isaac PetscInt c, b; 72620cf1dd8SToby Isaac 72720cf1dd8SToby Isaac ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr); 72820cf1dd8SToby Isaac ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 72920cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 73020cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) { 73120cf1dd8SToby Isaac for (b = 0; b < N_bt; ++b) { 73220cf1dd8SToby Isaac elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 73320cf1dd8SToby Isaac } 73420cf1dd8SToby Isaac } 73520cf1dd8SToby Isaac ierr = PetscFree(elem);CHKERRQ(ierr); 73620cf1dd8SToby Isaac } 73720cf1dd8SToby Isaac break; 73820cf1dd8SToby Isaac default: 73920cf1dd8SToby Isaac SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 74020cf1dd8SToby Isaac } 74120cf1dd8SToby Isaac } else { 74220cf1dd8SToby Isaac ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr); 74320cf1dd8SToby Isaac ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr); 74420cf1dd8SToby Isaac } 74520cf1dd8SToby Isaac /* Log performance */ 74620cf1dd8SToby Isaac ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr); 74720cf1dd8SToby Isaac ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr); 74820cf1dd8SToby Isaac f0Flops = 0; 74920cf1dd8SToby Isaac switch (ocl->op) { 75020cf1dd8SToby Isaac case LAPLACIAN: 75120cf1dd8SToby Isaac f1Flops = useAux ? dim : 0;break; 75220cf1dd8SToby Isaac case ELASTICITY: 75320cf1dd8SToby Isaac f1Flops = 2*dim*dim;break; 75420cf1dd8SToby Isaac } 75520cf1dd8SToby Isaac numFlops = Ne*( 75620cf1dd8SToby Isaac N_q*( 75720cf1dd8SToby Isaac N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0)) 75820cf1dd8SToby Isaac /*+ 75920cf1dd8SToby Isaac N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 76020cf1dd8SToby Isaac + 76120cf1dd8SToby Isaac N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0))) 76220cf1dd8SToby Isaac + 76320cf1dd8SToby Isaac N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0))); 76420cf1dd8SToby Isaac ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr); 76520cf1dd8SToby Isaac /* Cleanup */ 76620cf1dd8SToby Isaac ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr); 76720cf1dd8SToby Isaac ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr); 76820cf1dd8SToby Isaac ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr); 76920cf1dd8SToby Isaac ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr); 77020cf1dd8SToby Isaac ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr); 77120cf1dd8SToby Isaac ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr); 77220cf1dd8SToby Isaac ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr); 77320cf1dd8SToby Isaac PetscFunctionReturn(0); 77420cf1dd8SToby Isaac } 77520cf1dd8SToby Isaac 776526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE); 777526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation); 7782e47ffbbSToby Isaac 7792b99622eSMatthew G. Knepley static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 78020cf1dd8SToby Isaac { 78120cf1dd8SToby Isaac PetscFunctionBegin; 78220cf1dd8SToby Isaac fem->ops->setfromoptions = NULL; 78320cf1dd8SToby Isaac fem->ops->setup = PetscFESetUp_Basic; 78420cf1dd8SToby Isaac fem->ops->view = NULL; 78520cf1dd8SToby Isaac fem->ops->destroy = PetscFEDestroy_OpenCL; 78620cf1dd8SToby Isaac fem->ops->getdimension = PetscFEGetDimension_Basic; 787ef0bb6c7SMatthew G. Knepley fem->ops->createtabulation = PetscFECreateTabulation_Basic; 78820cf1dd8SToby Isaac fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 78920cf1dd8SToby Isaac fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */; 79020cf1dd8SToby Isaac fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */; 79120cf1dd8SToby Isaac fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 79220cf1dd8SToby Isaac PetscFunctionReturn(0); 79320cf1dd8SToby Isaac } 79420cf1dd8SToby Isaac 79520cf1dd8SToby Isaac /*MC 79620cf1dd8SToby Isaac PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation 79720cf1dd8SToby Isaac 79820cf1dd8SToby Isaac Level: intermediate 79920cf1dd8SToby Isaac 80020cf1dd8SToby Isaac .seealso: PetscFEType, PetscFECreate(), PetscFESetType() 80120cf1dd8SToby Isaac M*/ 80220cf1dd8SToby Isaac 80320cf1dd8SToby Isaac PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 80420cf1dd8SToby Isaac { 80520cf1dd8SToby Isaac PetscFE_OpenCL *ocl; 80620cf1dd8SToby Isaac cl_uint num_platforms; 80720cf1dd8SToby Isaac cl_platform_id platform_ids[42]; 80820cf1dd8SToby Isaac cl_uint num_devices; 80920cf1dd8SToby Isaac cl_device_id device_ids[42]; 8102da392ccSBarry Smith cl_int err; 81120cf1dd8SToby Isaac PetscErrorCode ierr; 81220cf1dd8SToby Isaac 81320cf1dd8SToby Isaac PetscFunctionBegin; 81420cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 81520cf1dd8SToby Isaac ierr = PetscNewLog(fem,&ocl);CHKERRQ(ierr); 81620cf1dd8SToby Isaac fem->data = ocl; 81720cf1dd8SToby Isaac 81820cf1dd8SToby Isaac /* Init Platform */ 81920cf1dd8SToby Isaac ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr); 82020cf1dd8SToby Isaac if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found."); 82120cf1dd8SToby Isaac ocl->pf_id = platform_ids[0]; 82220cf1dd8SToby Isaac /* Init Device */ 82320cf1dd8SToby Isaac ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr); 82420cf1dd8SToby Isaac if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found."); 82520cf1dd8SToby Isaac ocl->dev_id = device_ids[0]; 82620cf1dd8SToby Isaac /* Create context with one command queue */ 8272da392ccSBarry Smith ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);CHKERRQ(err); 8282da392ccSBarry Smith ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);CHKERRQ(err); 82920cf1dd8SToby Isaac /* Types */ 83020cf1dd8SToby Isaac ocl->realType = PETSC_FLOAT; 83120cf1dd8SToby Isaac /* Register events */ 83220cf1dd8SToby Isaac ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr); 83320cf1dd8SToby Isaac /* Equation handling */ 83420cf1dd8SToby Isaac ocl->op = LAPLACIAN; 83520cf1dd8SToby Isaac 83620cf1dd8SToby Isaac ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr); 83720cf1dd8SToby Isaac PetscFunctionReturn(0); 83820cf1dd8SToby Isaac } 83920cf1dd8SToby Isaac 8402b99622eSMatthew G. Knepley /*@ 8412b99622eSMatthew G. Knepley PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator 8422b99622eSMatthew G. Knepley 8432b99622eSMatthew G. Knepley Input Parameters: 8442b99622eSMatthew G. Knepley + fem - The PetscFE 8452b99622eSMatthew G. Knepley - realType - The scalar type 8462b99622eSMatthew G. Knepley 8472b99622eSMatthew G. Knepley Level: developer 8482b99622eSMatthew G. Knepley 8492b99622eSMatthew G. Knepley .seealso: PetscFEOpenCLGetRealType() 8502b99622eSMatthew G. Knepley @*/ 85120cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 85220cf1dd8SToby Isaac { 85320cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 85420cf1dd8SToby Isaac 85520cf1dd8SToby Isaac PetscFunctionBegin; 85620cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 85720cf1dd8SToby Isaac ocl->realType = realType; 85820cf1dd8SToby Isaac PetscFunctionReturn(0); 85920cf1dd8SToby Isaac } 86020cf1dd8SToby Isaac 8612b99622eSMatthew G. Knepley /*@ 8622b99622eSMatthew G. Knepley PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator 8632b99622eSMatthew G. Knepley 8642b99622eSMatthew G. Knepley Input Parameter: 8652b99622eSMatthew G. Knepley . fem - The PetscFE 8662b99622eSMatthew G. Knepley 8672b99622eSMatthew G. Knepley Output Parameter: 8682b99622eSMatthew G. Knepley . realType - The scalar type 8692b99622eSMatthew G. Knepley 8702b99622eSMatthew G. Knepley Level: developer 8712b99622eSMatthew G. Knepley 8722b99622eSMatthew G. Knepley .seealso: PetscFEOpenCLSetRealType() 8732b99622eSMatthew G. Knepley @*/ 87420cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 87520cf1dd8SToby Isaac { 87620cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 87720cf1dd8SToby Isaac 87820cf1dd8SToby Isaac PetscFunctionBegin; 87920cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 88020cf1dd8SToby Isaac PetscValidPointer(realType, 2); 88120cf1dd8SToby Isaac *realType = ocl->realType; 88220cf1dd8SToby Isaac PetscFunctionReturn(0); 88320cf1dd8SToby Isaac } 88420cf1dd8SToby Isaac 88520cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */ 886