xref: /petsc/src/dm/dt/fe/impls/opencl/feopencl.c (revision 2f613bf53f46f9356e00a2ca2bd69453be72fc31)
120cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
220cf1dd8SToby Isaac 
37be5e748SToby Isaac #if defined(PETSC_HAVE_OPENCL)
420cf1dd8SToby Isaac 
52b99622eSMatthew G. Knepley static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
620cf1dd8SToby Isaac {
720cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
820cf1dd8SToby Isaac   PetscErrorCode  ierr;
920cf1dd8SToby Isaac 
1020cf1dd8SToby Isaac   PetscFunctionBegin;
1120cf1dd8SToby Isaac   ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr);
1220cf1dd8SToby Isaac   ocl->queue_id = 0;
1320cf1dd8SToby Isaac   ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr);
1420cf1dd8SToby Isaac   ocl->ctx_id = 0;
1520cf1dd8SToby Isaac   ierr = PetscFree(ocl);CHKERRQ(ierr);
1620cf1dd8SToby Isaac   PetscFunctionReturn(0);
1720cf1dd8SToby Isaac }
1820cf1dd8SToby Isaac 
192e58f0efSBarry Smith #define CHKERRSTR(err) do {CHKERRQ(err); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB,"Buffer overflow");} while (0)
2020cf1dd8SToby Isaac enum {LAPLACIAN = 0, ELASTICITY = 1};
2120cf1dd8SToby Isaac 
2220cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
2320cf1dd8SToby Isaac /* dim     Number of spatial dimensions:          2                   */
2420cf1dd8SToby Isaac /* N_b     Number of basis functions:             generated           */
2520cf1dd8SToby Isaac /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
2620cf1dd8SToby Isaac /* N_q     Number of quadrature points:           generated           */
2720cf1dd8SToby Isaac /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
2820cf1dd8SToby Isaac /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
2920cf1dd8SToby Isaac /* N_{bl}  Number of concurrent blocks            generated           */
3020cf1dd8SToby Isaac /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
3120cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
3220cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
3320cf1dd8SToby Isaac /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
3420cf1dd8SToby Isaac /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
3520cf1dd8SToby Isaac /* N_{cb}  Number of serial cell batches:         input               */
3620cf1dd8SToby Isaac /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
372b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
3820cf1dd8SToby Isaac {
3920cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
4020cf1dd8SToby Isaac   PetscQuadrature q;
4120cf1dd8SToby Isaac   char           *string_tail   = *string_buffer;
4220cf1dd8SToby Isaac   char           *end_of_buffer = *string_buffer + buffer_length;
4320cf1dd8SToby Isaac   char            float_str[]   = "float", double_str[]  = "double";
4420cf1dd8SToby Isaac   char           *numeric_str   = &(float_str[0]);
4520cf1dd8SToby Isaac   PetscInt        op            = ocl->op;
4620cf1dd8SToby Isaac   PetscBool       useField      = PETSC_FALSE;
4720cf1dd8SToby Isaac   PetscBool       useFieldDer   = PETSC_TRUE;
4820cf1dd8SToby Isaac   PetscBool       useFieldAux   = useAux;
4920cf1dd8SToby Isaac   PetscBool       useFieldDerAux= PETSC_FALSE;
5020cf1dd8SToby Isaac   PetscBool       useF0         = PETSC_TRUE;
5120cf1dd8SToby Isaac   PetscBool       useF1         = PETSC_TRUE;
5220cf1dd8SToby Isaac   const PetscReal *points, *weights;
53ef0bb6c7SMatthew G. Knepley   PetscTabulation T;
5420cf1dd8SToby Isaac   PetscInt        dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
5520cf1dd8SToby Isaac   size_t          count;
5620cf1dd8SToby Isaac   PetscErrorCode  ierr;
5720cf1dd8SToby Isaac 
5820cf1dd8SToby Isaac   PetscFunctionBegin;
5920cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
6020cf1dd8SToby Isaac   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
6120cf1dd8SToby Isaac   ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr);
6220cf1dd8SToby Isaac   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
6320cf1dd8SToby Isaac   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
6420cf1dd8SToby Isaac   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
6520cf1dd8SToby Isaac   N_t  = N_b * N_c * N_q * N_bl;
6620cf1dd8SToby Isaac   /* Enable device extension for double precision */
6720cf1dd8SToby Isaac   if (ocl->realType == PETSC_DOUBLE) {
6820cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
6920cf1dd8SToby Isaac "#if defined(cl_khr_fp64)\n"
7020cf1dd8SToby Isaac "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
7120cf1dd8SToby Isaac "#elif defined(cl_amd_fp64)\n"
7220cf1dd8SToby Isaac "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
7320cf1dd8SToby Isaac "#endif\n",
742e58f0efSBarry Smith                               &count);CHKERRSTR(ierr);
7520cf1dd8SToby Isaac     numeric_str  = &(double_str[0]);
7620cf1dd8SToby Isaac   }
7720cf1dd8SToby Isaac   /* Kernel API */
7820cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
7920cf1dd8SToby Isaac "\n"
8020cf1dd8SToby Isaac "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
8120cf1dd8SToby Isaac "{\n",
822e58f0efSBarry Smith                        &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);CHKERRSTR(ierr);
8320cf1dd8SToby Isaac   /* Quadrature */
8420cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
8520cf1dd8SToby Isaac "  /* Quadrature points\n"
8620cf1dd8SToby Isaac "   - (x1,y1,x2,y2,...) */\n"
8720cf1dd8SToby Isaac "  const %s points[%d] = {\n",
882e58f0efSBarry Smith                        &count, numeric_str, N_q*dim);CHKERRSTR(ierr);
8920cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
9020cf1dd8SToby Isaac     for (d = 0; d < dim; ++d) {
912e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);CHKERRSTR(ierr);
9220cf1dd8SToby Isaac     }
9320cf1dd8SToby Isaac   }
942e58f0efSBarry Smith   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
9520cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
9620cf1dd8SToby Isaac "  /* Quadrature weights\n"
9720cf1dd8SToby Isaac "   - (v1,v2,...) */\n"
9820cf1dd8SToby Isaac "  const %s weights[%d] = {\n",
992e58f0efSBarry Smith                        &count, numeric_str, N_q);CHKERRSTR(ierr);
10020cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
1012e58f0efSBarry Smith     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);CHKERRSTR(ierr);
10220cf1dd8SToby Isaac   }
1032e58f0efSBarry Smith   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
10420cf1dd8SToby Isaac   /* Basis Functions */
105f9244615SMatthew G. Knepley   ierr = PetscFEGetCellTabulation(fem, 1, &T);CHKERRQ(ierr);
10620cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
10720cf1dd8SToby Isaac "  /* Nodal basis function evaluations\n"
10820cf1dd8SToby Isaac "    - basis component is fastest varying, the basis function, then point */\n"
10920cf1dd8SToby Isaac "  const %s Basis[%d] = {\n",
1102e58f0efSBarry Smith                        &count, numeric_str, N_q*N_b*N_c);CHKERRSTR(ierr);
11120cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
11220cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
11320cf1dd8SToby Isaac       for (c = 0; c < N_c; ++c) {
1142e58f0efSBarry Smith         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);CHKERRSTR(ierr);
11520cf1dd8SToby Isaac       }
11620cf1dd8SToby Isaac     }
11720cf1dd8SToby Isaac   }
1182e58f0efSBarry Smith   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
11920cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
12020cf1dd8SToby Isaac "\n"
12120cf1dd8SToby Isaac "  /* Nodal basis function derivative evaluations,\n"
12220cf1dd8SToby Isaac "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
12320cf1dd8SToby Isaac "  const %s%d BasisDerivatives[%d] = {\n",
1242e58f0efSBarry Smith                        &count, numeric_str, dim, N_q*N_b*N_c);CHKERRSTR(ierr);
12520cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
12620cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
12720cf1dd8SToby Isaac       for (c = 0; c < N_c; ++c) {
1282e58f0efSBarry Smith         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);CHKERRSTR(ierr);
12920cf1dd8SToby Isaac         for (d = 0; d < dim; ++d) {
13020cf1dd8SToby Isaac           if (d > 0) {
1312e58f0efSBarry Smith             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr);
13220cf1dd8SToby Isaac           } else {
1332e58f0efSBarry Smith             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr);
13420cf1dd8SToby Isaac           }
13520cf1dd8SToby Isaac         }
1362e58f0efSBarry Smith         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);CHKERRSTR(ierr);
13720cf1dd8SToby Isaac       }
13820cf1dd8SToby Isaac     }
13920cf1dd8SToby Isaac   }
1402e58f0efSBarry Smith   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
14120cf1dd8SToby Isaac   /* Sizes */
14220cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
14320cf1dd8SToby Isaac "  const int dim    = %d;                           // The spatial dimension\n"
14420cf1dd8SToby Isaac "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
14520cf1dd8SToby Isaac "  const int N_b    = %d;                           // The number of basis functions\n"
14620cf1dd8SToby Isaac "  const int N_comp = %d;                           // The number of basis function components\n"
14720cf1dd8SToby Isaac "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
14820cf1dd8SToby Isaac "  const int N_q    = %d;                           // The number of quadrature points\n"
14920cf1dd8SToby Isaac "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
15020cf1dd8SToby Isaac "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
15120cf1dd8SToby Isaac "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
15220cf1dd8SToby Isaac "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
15320cf1dd8SToby Isaac "  const int N_sqc  = N_bst / N_bt;\n"
15420cf1dd8SToby Isaac "  /*const int N_c    = N_cb * N_bc;*/\n"
15520cf1dd8SToby Isaac "\n"
15620cf1dd8SToby Isaac "  /* Calculated indices */\n"
15720cf1dd8SToby Isaac "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
15820cf1dd8SToby Isaac "  const int tidx    = get_local_id(0);\n"
15920cf1dd8SToby Isaac "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
16020cf1dd8SToby Isaac "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
16120cf1dd8SToby Isaac "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
16220cf1dd8SToby Isaac "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
16320cf1dd8SToby Isaac "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
16420cf1dd8SToby Isaac "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
16520cf1dd8SToby Isaac "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
16620cf1dd8SToby Isaac "  const int Goffset = gidx*N_cb*N_bc;\n",
1672e58f0efSBarry Smith                             &count, dim, N_bl, N_b, N_c, N_q);CHKERRSTR(ierr);
16820cf1dd8SToby Isaac   /* Local memory */
16920cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
17020cf1dd8SToby Isaac "\n"
17120cf1dd8SToby Isaac "  /* Quadrature data */\n"
17220cf1dd8SToby Isaac "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
17320cf1dd8SToby Isaac "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
17420cf1dd8SToby Isaac "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
17520cf1dd8SToby Isaac "  /* Geometric data */\n"
17620cf1dd8SToby Isaac "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
17720cf1dd8SToby Isaac "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
17820cf1dd8SToby Isaac                             &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
1792e58f0efSBarry Smith                             numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr);
18020cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
18120cf1dd8SToby Isaac "  /* FEM data */\n"
18220cf1dd8SToby Isaac "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
1832e58f0efSBarry Smith                             &count, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr);
18420cf1dd8SToby Isaac   if (useAux) {
18520cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
18620cf1dd8SToby Isaac "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
1872e58f0efSBarry Smith                             &count, numeric_str, N_t);CHKERRSTR(ierr);
18820cf1dd8SToby Isaac   }
18920cf1dd8SToby Isaac   if (useF0) {
19020cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19120cf1dd8SToby Isaac "  /* Intermediate calculations */\n"
19220cf1dd8SToby Isaac "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
1932e58f0efSBarry Smith                               &count, numeric_str, N_t*N_q);CHKERRSTR(ierr);
19420cf1dd8SToby Isaac   }
19520cf1dd8SToby Isaac   if (useF1) {
19620cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19720cf1dd8SToby Isaac "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
1982e58f0efSBarry Smith                               &count, numeric_str, dim, N_t*N_q);CHKERRSTR(ierr);
19920cf1dd8SToby Isaac   }
20020cf1dd8SToby Isaac   /* TODO: If using elasticity, put in mu/lambda coefficients */
20120cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
20220cf1dd8SToby Isaac "  /* Output data */\n"
20320cf1dd8SToby Isaac "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
2042e58f0efSBarry Smith                             &count, numeric_str);CHKERRSTR(ierr);
20520cf1dd8SToby Isaac   /* One-time loads */
20620cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
20720cf1dd8SToby Isaac "  /* These should be generated inline */\n"
20820cf1dd8SToby Isaac "  /* Load quadrature weights */\n"
20920cf1dd8SToby Isaac "  w = weights[qidx];\n"
21020cf1dd8SToby Isaac "  /* Load basis tabulation \\phi_i for this cell */\n"
21120cf1dd8SToby Isaac "  if (tidx < N_bt*N_q) {\n"
21220cf1dd8SToby Isaac "    phi_i[tidx]    = Basis[tidx];\n"
21320cf1dd8SToby Isaac "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
21420cf1dd8SToby Isaac "  }\n\n",
2152e58f0efSBarry Smith                        &count);CHKERRSTR(ierr);
21620cf1dd8SToby Isaac   /* Batch loads */
21720cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
21820cf1dd8SToby Isaac "  for (int batch = 0; batch < N_cb; ++batch) {\n"
21920cf1dd8SToby Isaac "    /* Load geometry */\n"
22020cf1dd8SToby Isaac "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
22120cf1dd8SToby Isaac "    for (int n = 0; n < dim*dim; ++n) {\n"
22220cf1dd8SToby Isaac "      const int offset = n*N_t;\n"
22320cf1dd8SToby Isaac "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
22420cf1dd8SToby Isaac "    }\n"
22520cf1dd8SToby Isaac "    /* Load coefficients u_i for this cell */\n"
22620cf1dd8SToby Isaac "    for (int n = 0; n < N_bt; ++n) {\n"
22720cf1dd8SToby Isaac "      const int offset = n*N_t;\n"
22820cf1dd8SToby Isaac "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
22920cf1dd8SToby Isaac "    }\n",
2302e58f0efSBarry Smith                        &count);CHKERRSTR(ierr);
23120cf1dd8SToby Isaac   if (useAux) {
23220cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
23320cf1dd8SToby Isaac "    /* Load coefficients a_i for this cell */\n"
23420cf1dd8SToby Isaac "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
23520cf1dd8SToby Isaac "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
2362e58f0efSBarry Smith                             &count);CHKERRSTR(ierr);
23720cf1dd8SToby Isaac   }
23820cf1dd8SToby Isaac   /* Quadrature phase */
23920cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
24020cf1dd8SToby Isaac "    barrier(CLK_LOCAL_MEM_FENCE);\n"
24120cf1dd8SToby Isaac "\n"
24220cf1dd8SToby Isaac "    /* Map coefficients to values at quadrature points */\n"
24320cf1dd8SToby Isaac "    for (int c = 0; c < N_sqc; ++c) {\n"
24420cf1dd8SToby Isaac "      const int cell          = c*N_bl*N_b + blqidx;\n"
24520cf1dd8SToby Isaac "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
2462e58f0efSBarry Smith                        &count);CHKERRSTR(ierr);
24720cf1dd8SToby Isaac   if (useField) {
24820cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
24920cf1dd8SToby Isaac "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n",
2502e58f0efSBarry Smith                               &count, numeric_str, N_c);CHKERRSTR(ierr);
25120cf1dd8SToby Isaac   }
25220cf1dd8SToby Isaac   if (useFieldDer) {
25320cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
25420cf1dd8SToby Isaac "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
2552e58f0efSBarry Smith                               &count, numeric_str, dim, N_c);CHKERRSTR(ierr);
25620cf1dd8SToby Isaac   }
25720cf1dd8SToby Isaac   if (useFieldAux) {
25820cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
25920cf1dd8SToby Isaac "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
2602e58f0efSBarry Smith                               &count, numeric_str, 1);CHKERRSTR(ierr);
26120cf1dd8SToby Isaac   }
26220cf1dd8SToby Isaac   if (useFieldDerAux) {
26320cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
26420cf1dd8SToby Isaac "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
2652e58f0efSBarry Smith                               &count, numeric_str, dim, 1);CHKERRSTR(ierr);
26620cf1dd8SToby Isaac   }
26720cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
26820cf1dd8SToby Isaac "\n"
26920cf1dd8SToby Isaac "      for (int comp = 0; comp < N_comp; ++comp) {\n",
2702e58f0efSBarry Smith                             &count);CHKERRSTR(ierr);
2712e58f0efSBarry Smith   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count);CHKERRSTR(ierr);}
27220cf1dd8SToby Isaac   if (useFieldDer) {
27320cf1dd8SToby Isaac     switch (dim) {
27420cf1dd8SToby Isaac     case 1:
2752e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count);CHKERRSTR(ierr);break;
27620cf1dd8SToby Isaac     case 2:
2772e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);CHKERRSTR(ierr);break;
27820cf1dd8SToby Isaac     case 3:
2792e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);CHKERRSTR(ierr);break;
28020cf1dd8SToby Isaac     }
28120cf1dd8SToby Isaac   }
28220cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
28320cf1dd8SToby Isaac "      }\n",
2842e58f0efSBarry Smith                             &count);CHKERRSTR(ierr);
28520cf1dd8SToby Isaac   if (useFieldAux) {
2862e58f0efSBarry Smith     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count);CHKERRSTR(ierr);
28720cf1dd8SToby Isaac   }
28820cf1dd8SToby Isaac   if (useFieldDerAux) {
28920cf1dd8SToby Isaac     switch (dim) {
29020cf1dd8SToby Isaac     case 1:
2912e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count);CHKERRSTR(ierr);break;
29220cf1dd8SToby Isaac     case 2:
2932e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);CHKERRSTR(ierr);break;
29420cf1dd8SToby Isaac     case 3:
2952e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);CHKERRSTR(ierr);break;
29620cf1dd8SToby Isaac     }
29720cf1dd8SToby Isaac   }
29820cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
29920cf1dd8SToby Isaac "      /* Get field and derivatives at this quadrature point */\n"
30020cf1dd8SToby Isaac "      for (int i = 0; i < N_b; ++i) {\n"
30120cf1dd8SToby Isaac "        for (int comp = 0; comp < N_comp; ++comp) {\n"
30220cf1dd8SToby Isaac "          const int b    = i*N_comp+comp;\n"
30320cf1dd8SToby Isaac "          const int pidx = qidx*N_bt + b;\n"
30420cf1dd8SToby Isaac "          const int uidx = cell*N_bt + b;\n"
30520cf1dd8SToby Isaac "          %s%d   realSpaceDer;\n\n",
3062e58f0efSBarry Smith                             &count, numeric_str, dim);CHKERRSTR(ierr);
3072e58f0efSBarry Smith   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);CHKERRSTR(ierr);}
30820cf1dd8SToby Isaac   if (useFieldDer) {
30920cf1dd8SToby Isaac     switch (dim) {
31020cf1dd8SToby Isaac     case 2:
31120cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
31220cf1dd8SToby Isaac "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
31320cf1dd8SToby Isaac "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
31420cf1dd8SToby Isaac "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
31520cf1dd8SToby Isaac "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
3162e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
31720cf1dd8SToby Isaac     case 3:
31820cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
31920cf1dd8SToby Isaac "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
32020cf1dd8SToby Isaac "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
32120cf1dd8SToby Isaac "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
32220cf1dd8SToby Isaac "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
32320cf1dd8SToby Isaac "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
32420cf1dd8SToby Isaac "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
3252e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
32620cf1dd8SToby Isaac     }
32720cf1dd8SToby Isaac   }
32820cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
32920cf1dd8SToby Isaac "        }\n"
33020cf1dd8SToby Isaac "      }\n",
3312e58f0efSBarry Smith                             &count);CHKERRSTR(ierr);
33220cf1dd8SToby Isaac   if (useFieldAux) {
3332e58f0efSBarry Smith     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          a[0] += a_i[cell];\n", &count);CHKERRSTR(ierr);
33420cf1dd8SToby Isaac   }
33520cf1dd8SToby Isaac   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
33620cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
33720cf1dd8SToby Isaac "      /* Process values at quadrature points */\n",
3382e58f0efSBarry Smith                             &count);CHKERRSTR(ierr);
33920cf1dd8SToby Isaac   switch (op) {
34020cf1dd8SToby Isaac   case LAPLACIAN:
3412e58f0efSBarry Smith     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);}
34220cf1dd8SToby Isaac     if (useF1) {
3432e58f0efSBarry Smith       if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count);CHKERRSTR(ierr);}
3442e58f0efSBarry Smith       else        {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count);CHKERRSTR(ierr);}
34520cf1dd8SToby Isaac     }
34620cf1dd8SToby Isaac     break;
34720cf1dd8SToby Isaac   case ELASTICITY:
3482e58f0efSBarry Smith     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);}
34920cf1dd8SToby Isaac     if (useF1) {
35020cf1dd8SToby Isaac     switch (dim) {
35120cf1dd8SToby Isaac     case 2:
35220cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
35320cf1dd8SToby Isaac "      switch (cidx) {\n"
35420cf1dd8SToby Isaac "      case 0:\n"
35520cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
35620cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
35720cf1dd8SToby Isaac "        break;\n"
35820cf1dd8SToby Isaac "      case 1:\n"
35920cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
36020cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
36120cf1dd8SToby Isaac "      }\n",
3622e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
36320cf1dd8SToby Isaac     case 3:
36420cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
36520cf1dd8SToby Isaac "      switch (cidx) {\n"
36620cf1dd8SToby Isaac "      case 0:\n"
36720cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
36820cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
36920cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
37020cf1dd8SToby Isaac "        break;\n"
37120cf1dd8SToby Isaac "      case 1:\n"
37220cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
37320cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
37420cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
37520cf1dd8SToby Isaac "        break;\n"
37620cf1dd8SToby Isaac "      case 2:\n"
37720cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
37820cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
37920cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
38020cf1dd8SToby Isaac "      }\n",
3812e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
38220cf1dd8SToby Isaac     }}
38320cf1dd8SToby Isaac     break;
38420cf1dd8SToby Isaac   default:
385ea13f565SStefano Zampini     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
38620cf1dd8SToby Isaac   }
3872e58f0efSBarry Smith   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_0[fidx] *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);}
38820cf1dd8SToby Isaac   if (useF1) {
38920cf1dd8SToby Isaac     switch (dim) {
39020cf1dd8SToby Isaac     case 1:
3912e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
39220cf1dd8SToby Isaac     case 2:
3932e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
39420cf1dd8SToby Isaac     case 3:
3952e58f0efSBarry Smith       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
39620cf1dd8SToby Isaac     }
39720cf1dd8SToby Isaac   }
39820cf1dd8SToby Isaac   /* Thread transpose */
39920cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
40020cf1dd8SToby Isaac "    }\n\n"
40120cf1dd8SToby Isaac "    /* ==== TRANSPOSE THREADS ==== */\n"
40220cf1dd8SToby Isaac "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
4032e58f0efSBarry Smith                        &count);CHKERRSTR(ierr);
40420cf1dd8SToby Isaac   /* Basis phase */
40520cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
40620cf1dd8SToby Isaac "    /* Map values at quadrature points to coefficients */\n"
40720cf1dd8SToby Isaac "    for (int c = 0; c < N_sbc; ++c) {\n"
40820cf1dd8SToby Isaac "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
40920cf1dd8SToby Isaac "\n"
41020cf1dd8SToby Isaac "      e_i = 0.0;\n"
41120cf1dd8SToby Isaac "      for (int q = 0; q < N_q; ++q) {\n"
41220cf1dd8SToby Isaac "        const int pidx = q*N_bt + bidx;\n"
41320cf1dd8SToby Isaac "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
41420cf1dd8SToby Isaac "        %s%d   realSpaceDer;\n\n",
4152e58f0efSBarry Smith                        &count, numeric_str, dim);CHKERRSTR(ierr);
41620cf1dd8SToby Isaac 
4172e58f0efSBarry Smith   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"        e_i += phi_i[pidx]*f_0[fidx];\n", &count);CHKERRSTR(ierr);}
41820cf1dd8SToby Isaac   if (useF1) {
41920cf1dd8SToby Isaac     switch (dim) {
42020cf1dd8SToby Isaac     case 2:
42120cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
42220cf1dd8SToby Isaac "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
42320cf1dd8SToby Isaac "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
42420cf1dd8SToby Isaac "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
42520cf1dd8SToby Isaac "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
4262e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
42720cf1dd8SToby Isaac     case 3:
42820cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
42920cf1dd8SToby Isaac "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
43020cf1dd8SToby Isaac "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
43120cf1dd8SToby Isaac "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
43220cf1dd8SToby Isaac "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
43320cf1dd8SToby Isaac "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
43420cf1dd8SToby Isaac "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
4352e58f0efSBarry Smith                            &count);CHKERRSTR(ierr);break;
43620cf1dd8SToby Isaac     }
43720cf1dd8SToby Isaac   }
43820cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
43920cf1dd8SToby Isaac "      }\n"
44020cf1dd8SToby Isaac "      /* Write element vector for N_{cbc} cells at a time */\n"
44120cf1dd8SToby Isaac "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
44220cf1dd8SToby Isaac "    }\n"
44320cf1dd8SToby Isaac "    /* ==== Could do one write per batch ==== */\n"
44420cf1dd8SToby Isaac "  }\n"
44520cf1dd8SToby Isaac "  return;\n"
44620cf1dd8SToby Isaac "}\n",
4472e58f0efSBarry Smith                        &count);CHKERRSTR(ierr);
44820cf1dd8SToby Isaac   PetscFunctionReturn(0);
44920cf1dd8SToby Isaac }
45020cf1dd8SToby Isaac 
4512b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
45220cf1dd8SToby Isaac {
45320cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
45420cf1dd8SToby Isaac   PetscInt        dim, N_bl;
45520cf1dd8SToby Isaac   PetscBool       flg;
45620cf1dd8SToby Isaac   char           *buffer;
45720cf1dd8SToby Isaac   size_t          len;
45820cf1dd8SToby Isaac   char            errMsg[8192];
4592da392ccSBarry Smith   cl_int          err;
46020cf1dd8SToby Isaac   PetscErrorCode  ierr;
46120cf1dd8SToby Isaac 
46220cf1dd8SToby Isaac   PetscFunctionBegin;
46320cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
46420cf1dd8SToby Isaac   ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr);
46520cf1dd8SToby Isaac   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr);
46620cf1dd8SToby Isaac   ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr);
46720cf1dd8SToby Isaac   ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr);
46820cf1dd8SToby Isaac   if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);}
4692da392ccSBarry Smith   ierr = PetscStrlen(buffer,&len);CHKERRQ(ierr);
4702da392ccSBarry Smith   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);CHKERRQ(err);
4712da392ccSBarry Smith   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
4722da392ccSBarry Smith   if (err != CL_SUCCESS) {
473*2f613bf5SBarry Smith     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);
47420cf1dd8SToby Isaac     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
47520cf1dd8SToby Isaac   }
47620cf1dd8SToby Isaac   ierr = PetscFree(buffer);CHKERRQ(ierr);
477*2f613bf5SBarry Smith   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);
47820cf1dd8SToby Isaac   PetscFunctionReturn(0);
47920cf1dd8SToby Isaac }
48020cf1dd8SToby Isaac 
4812b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
48220cf1dd8SToby Isaac {
48320cf1dd8SToby Isaac   const PetscInt Nblocks = N/blockSize;
48420cf1dd8SToby Isaac 
48520cf1dd8SToby Isaac   PetscFunctionBegin;
48620cf1dd8SToby Isaac   if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
48720cf1dd8SToby Isaac   *z = 1;
488623c9f23SSatish Balay   *y = 1;
48920cf1dd8SToby Isaac   for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
49020cf1dd8SToby Isaac     *y = Nblocks / *x;
491526996e7SStefano Zampini     if (*x * *y == (size_t)Nblocks) break;
49220cf1dd8SToby Isaac   }
493adf5291fSStefano Zampini   if (*x * *y != (size_t)Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %D with block size %D", N, blockSize);
49420cf1dd8SToby Isaac   PetscFunctionReturn(0);
49520cf1dd8SToby Isaac }
49620cf1dd8SToby Isaac 
4972b99622eSMatthew G. Knepley static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
49820cf1dd8SToby Isaac {
49920cf1dd8SToby Isaac   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
50020cf1dd8SToby Isaac   PetscStageLog     stageLog;
50120cf1dd8SToby Isaac   PetscEventPerfLog eventLog = NULL;
502afbfd3a7SStefano Zampini   int               stage;
50320cf1dd8SToby Isaac   PetscErrorCode    ierr;
50420cf1dd8SToby Isaac 
50520cf1dd8SToby Isaac   PetscFunctionBegin;
50620cf1dd8SToby Isaac   ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
50720cf1dd8SToby Isaac   ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
50820cf1dd8SToby Isaac   ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
50920cf1dd8SToby Isaac     /* Log performance info */
51020cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].count++;
51120cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].time  += time;
51220cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].flops += flops;
51320cf1dd8SToby Isaac   PetscFunctionReturn(0);
51420cf1dd8SToby Isaac }
51520cf1dd8SToby Isaac 
51606ad1575SMatthew G. Knepley static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom,
51720cf1dd8SToby Isaac                                                       const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
51820cf1dd8SToby Isaac {
51920cf1dd8SToby Isaac   /* Nbc = batchSize */
520849189efSMatthew G. Knepley   PetscFE           fem;
521849189efSMatthew G. Knepley   PetscFE_OpenCL   *ocl;
52220cf1dd8SToby Isaac   PetscPointFunc    f0_func;
52320cf1dd8SToby Isaac   PetscPointFunc    f1_func;
52420cf1dd8SToby Isaac   PetscQuadrature   q;
52520cf1dd8SToby Isaac   PetscInt          dim, qNc;
52620cf1dd8SToby Isaac   PetscInt          N_b;    /* The number of basis functions */
52720cf1dd8SToby Isaac   PetscInt          N_comp; /* The number of basis function components */
52820cf1dd8SToby Isaac   PetscInt          N_bt;   /* The total number of scalar basis functions */
52920cf1dd8SToby Isaac   PetscInt          N_q;    /* The number of quadrature points */
53020cf1dd8SToby Isaac   PetscInt          N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
53120cf1dd8SToby Isaac   PetscInt          N_t;    /* The number of threads, N_bst * N_bl */
53220cf1dd8SToby Isaac   PetscInt          N_bl;   /* The number of blocks */
53320cf1dd8SToby Isaac   PetscInt          N_bc;   /* The batch size, N_bl*N_q*N_b */
53420cf1dd8SToby Isaac   PetscInt          N_cb;   /* The number of batches */
5356528b96dSMatthew G. Knepley   const PetscInt    field = key.field;
53620cf1dd8SToby Isaac   PetscInt          numFlops, f0Flops = 0, f1Flops = 0;
53720cf1dd8SToby Isaac   PetscBool         useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
53820cf1dd8SToby Isaac   PetscBool         useField    = PETSC_FALSE;
53920cf1dd8SToby Isaac   PetscBool         useFieldDer = PETSC_TRUE;
54020cf1dd8SToby Isaac   PetscBool         useF0       = PETSC_TRUE;
54120cf1dd8SToby Isaac   PetscBool         useF1       = PETSC_TRUE;
54220cf1dd8SToby Isaac   /* OpenCL variables */
54320cf1dd8SToby Isaac   cl_program        ocl_prog;
54420cf1dd8SToby Isaac   cl_kernel         ocl_kernel;
54520cf1dd8SToby Isaac   cl_event          ocl_ev;         /* The event for tracking kernel execution */
54620cf1dd8SToby Isaac   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
54720cf1dd8SToby Isaac   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */
54820cf1dd8SToby Isaac   cl_mem            o_jacobianInverses, o_jacobianDeterminants;
54920cf1dd8SToby Isaac   cl_mem            o_coefficients, o_coefficientsAux, o_elemVec;
55020cf1dd8SToby Isaac   float            *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
55120cf1dd8SToby Isaac   double           *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
55220cf1dd8SToby Isaac   PetscReal        *r_invJ = NULL, *r_detJ = NULL;
55320cf1dd8SToby Isaac   void             *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
55420cf1dd8SToby Isaac   size_t            local_work_size[3], global_work_size[3];
55520cf1dd8SToby Isaac   size_t            realSize, x, y, z;
55620cf1dd8SToby Isaac   const PetscReal   *points, *weights;
55720cf1dd8SToby Isaac   PetscErrorCode    ierr;
55820cf1dd8SToby Isaac 
55920cf1dd8SToby Isaac   PetscFunctionBegin;
560849189efSMatthew G. Knepley   ierr = PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);CHKERRQ(ierr);
561849189efSMatthew G. Knepley   ocl  = (PetscFE_OpenCL *) fem->data;
56220cf1dd8SToby Isaac   if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);}
56320cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
56420cf1dd8SToby Isaac   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
56520cf1dd8SToby Isaac   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
56620cf1dd8SToby Isaac   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
56720cf1dd8SToby Isaac   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
56820cf1dd8SToby Isaac   ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr);
56920cf1dd8SToby Isaac   ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr);
57020cf1dd8SToby Isaac   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr);
57120cf1dd8SToby Isaac   N_bt  = N_b*N_comp;
57220cf1dd8SToby Isaac   N_bst = N_bt*N_q;
57320cf1dd8SToby Isaac   N_t   = N_bst*N_bl;
57420cf1dd8SToby Isaac   if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
57520cf1dd8SToby Isaac   /* Calculate layout */
57620cf1dd8SToby Isaac   if (Ne % (N_cb*N_bc)) { /* Remainder cells */
5776528b96dSMatthew G. Knepley     ierr = PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr);
57820cf1dd8SToby Isaac     PetscFunctionReturn(0);
57920cf1dd8SToby Isaac   }
58020cf1dd8SToby Isaac   ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr);
58120cf1dd8SToby Isaac   local_work_size[0]  = N_bc*N_comp;
58220cf1dd8SToby Isaac   local_work_size[1]  = 1;
58320cf1dd8SToby Isaac   local_work_size[2]  = 1;
58420cf1dd8SToby Isaac   global_work_size[0] = x * local_work_size[0];
58520cf1dd8SToby Isaac   global_work_size[1] = y * local_work_size[1];
58620cf1dd8SToby Isaac   global_work_size[2] = z * local_work_size[2];
58720cf1dd8SToby Isaac   ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr);
5881e1ea65dSPierre Jolivet   ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);CHKERRQ(ierr);
58920cf1dd8SToby Isaac   /* Generate code */
59020cf1dd8SToby Isaac   if (probAux) {
59120cf1dd8SToby Isaac     PetscSpace P;
59220cf1dd8SToby Isaac     PetscInt   NfAux, order, f;
59320cf1dd8SToby Isaac 
59420cf1dd8SToby Isaac     ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr);
59520cf1dd8SToby Isaac     for (f = 0; f < NfAux; ++f) {
59620cf1dd8SToby Isaac       PetscFE feAux;
59720cf1dd8SToby Isaac 
59820cf1dd8SToby Isaac       ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr);
59920cf1dd8SToby Isaac       ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr);
60020cf1dd8SToby Isaac       ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr);
60120cf1dd8SToby Isaac       if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
60220cf1dd8SToby Isaac     }
60320cf1dd8SToby Isaac   }
60420cf1dd8SToby Isaac   ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr);
60520cf1dd8SToby Isaac   /* Create buffers on the device and send data over */
60620cf1dd8SToby Isaac   ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr);
6077be5e748SToby Isaac   if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
60820cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
60920cf1dd8SToby Isaac     switch (ocl->realType) {
61020cf1dd8SToby Isaac     case PETSC_FLOAT:
61120cf1dd8SToby Isaac     {
61220cf1dd8SToby Isaac       PetscInt c, b, d;
61320cf1dd8SToby Isaac 
61420cf1dd8SToby Isaac       ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr);
61520cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
6167be5e748SToby Isaac         f_detJ[c] = (float) cgeom->detJ[c];
61720cf1dd8SToby Isaac         for (d = 0; d < dim*dim; ++d) {
6187be5e748SToby Isaac           f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
61920cf1dd8SToby Isaac         }
62020cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
62120cf1dd8SToby Isaac           f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
62220cf1dd8SToby Isaac         }
62320cf1dd8SToby Isaac       }
62420cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
62520cf1dd8SToby Isaac         for (c = 0; c < Ne; ++c) {
62620cf1dd8SToby Isaac           f_coeffAux[c] = (float) coefficientsAux[c];
62720cf1dd8SToby Isaac         }
62820cf1dd8SToby Isaac       }
62920cf1dd8SToby Isaac       oclCoeff      = (void *) f_coeff;
63020cf1dd8SToby Isaac       if (coefficientsAux) {
63120cf1dd8SToby Isaac         oclCoeffAux = (void *) f_coeffAux;
63220cf1dd8SToby Isaac       } else {
63320cf1dd8SToby Isaac         oclCoeffAux = NULL;
63420cf1dd8SToby Isaac       }
63520cf1dd8SToby Isaac       oclInvJ       = (void *) f_invJ;
63620cf1dd8SToby Isaac       oclDetJ       = (void *) f_detJ;
63720cf1dd8SToby Isaac     }
63820cf1dd8SToby Isaac     break;
63920cf1dd8SToby Isaac     case PETSC_DOUBLE:
64020cf1dd8SToby Isaac     {
64120cf1dd8SToby Isaac       PetscInt c, b, d;
64220cf1dd8SToby Isaac 
64320cf1dd8SToby Isaac       ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr);
64420cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
6457be5e748SToby Isaac         d_detJ[c] = (double) cgeom->detJ[c];
64620cf1dd8SToby Isaac         for (d = 0; d < dim*dim; ++d) {
6477be5e748SToby Isaac           d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
64820cf1dd8SToby Isaac         }
64920cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
65020cf1dd8SToby Isaac           d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
65120cf1dd8SToby Isaac         }
65220cf1dd8SToby Isaac       }
65320cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
65420cf1dd8SToby Isaac         for (c = 0; c < Ne; ++c) {
65520cf1dd8SToby Isaac           d_coeffAux[c] = (double) coefficientsAux[c];
65620cf1dd8SToby Isaac         }
65720cf1dd8SToby Isaac       }
65820cf1dd8SToby Isaac       oclCoeff      = (void *) d_coeff;
65920cf1dd8SToby Isaac       if (coefficientsAux) {
66020cf1dd8SToby Isaac         oclCoeffAux = (void *) d_coeffAux;
66120cf1dd8SToby Isaac       } else {
66220cf1dd8SToby Isaac         oclCoeffAux = NULL;
66320cf1dd8SToby Isaac       }
66420cf1dd8SToby Isaac       oclInvJ       = (void *) d_invJ;
66520cf1dd8SToby Isaac       oclDetJ       = (void *) d_detJ;
66620cf1dd8SToby Isaac     }
66720cf1dd8SToby Isaac     break;
66820cf1dd8SToby Isaac     default:
66920cf1dd8SToby Isaac       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
67020cf1dd8SToby Isaac     }
67120cf1dd8SToby Isaac   } else {
67220cf1dd8SToby Isaac     PetscInt c, d;
67320cf1dd8SToby Isaac 
67420cf1dd8SToby Isaac     ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr);
67520cf1dd8SToby Isaac     for (c = 0; c < Ne; ++c) {
6767be5e748SToby Isaac       r_detJ[c] = cgeom->detJ[c];
67720cf1dd8SToby Isaac       for (d = 0; d < dim*dim; ++d) {
6787be5e748SToby Isaac         r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
67920cf1dd8SToby Isaac       }
68020cf1dd8SToby Isaac     }
68120cf1dd8SToby Isaac     oclCoeff    = (void *) coefficients;
68220cf1dd8SToby Isaac     oclCoeffAux = (void *) coefficientsAux;
68320cf1dd8SToby Isaac     oclInvJ     = (void *) r_invJ;
68420cf1dd8SToby Isaac     oclDetJ     = (void *) r_detJ;
68520cf1dd8SToby Isaac   }
686*2f613bf5SBarry Smith   o_coefficients         = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * realSize, oclCoeff,    &ierr);
68720cf1dd8SToby Isaac   if (coefficientsAux) {
688*2f613bf5SBarry Smith     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclCoeffAux, &ierr);
68920cf1dd8SToby Isaac   } else {
690*2f613bf5SBarry Smith     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY,                        Ne         * realSize, oclCoeffAux, &ierr);
69120cf1dd8SToby Isaac   }
692*2f613bf5SBarry Smith   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ,     &ierr);
693*2f613bf5SBarry Smith   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclDetJ,     &ierr);
694*2f613bf5SBarry Smith   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY,                       Ne*N_bt    * realSize, NULL,        &ierr);
69520cf1dd8SToby Isaac   /* Kernel launch */
69620cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr);
69720cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr);
69820cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr);
69920cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr);
70020cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr);
70120cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr);
70220cf1dd8SToby Isaac   ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);
70320cf1dd8SToby Isaac   /* Read data back from device */
70420cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
70520cf1dd8SToby Isaac     switch (ocl->realType) {
70620cf1dd8SToby Isaac     case PETSC_FLOAT:
70720cf1dd8SToby Isaac     {
70820cf1dd8SToby Isaac       float   *elem;
70920cf1dd8SToby Isaac       PetscInt c, b;
71020cf1dd8SToby Isaac 
71120cf1dd8SToby Isaac       ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr);
71220cf1dd8SToby Isaac       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
71320cf1dd8SToby Isaac       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
71420cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
71520cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
71620cf1dd8SToby Isaac           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
71720cf1dd8SToby Isaac         }
71820cf1dd8SToby Isaac       }
71920cf1dd8SToby Isaac       ierr = PetscFree(elem);CHKERRQ(ierr);
72020cf1dd8SToby Isaac     }
72120cf1dd8SToby Isaac     break;
72220cf1dd8SToby Isaac     case PETSC_DOUBLE:
72320cf1dd8SToby Isaac     {
72420cf1dd8SToby Isaac       double  *elem;
72520cf1dd8SToby Isaac       PetscInt c, b;
72620cf1dd8SToby Isaac 
72720cf1dd8SToby Isaac       ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr);
72820cf1dd8SToby Isaac       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
72920cf1dd8SToby Isaac       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
73020cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
73120cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
73220cf1dd8SToby Isaac           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
73320cf1dd8SToby Isaac         }
73420cf1dd8SToby Isaac       }
73520cf1dd8SToby Isaac       ierr = PetscFree(elem);CHKERRQ(ierr);
73620cf1dd8SToby Isaac     }
73720cf1dd8SToby Isaac     break;
73820cf1dd8SToby Isaac     default:
73920cf1dd8SToby Isaac       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
74020cf1dd8SToby Isaac     }
74120cf1dd8SToby Isaac   } else {
74220cf1dd8SToby Isaac     ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr);
74320cf1dd8SToby Isaac     ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr);
74420cf1dd8SToby Isaac   }
74520cf1dd8SToby Isaac   /* Log performance */
74620cf1dd8SToby Isaac   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
74720cf1dd8SToby Isaac   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);CHKERRQ(ierr);
74820cf1dd8SToby Isaac   f0Flops = 0;
74920cf1dd8SToby Isaac   switch (ocl->op) {
75020cf1dd8SToby Isaac   case LAPLACIAN:
75120cf1dd8SToby Isaac     f1Flops = useAux ? dim : 0;break;
75220cf1dd8SToby Isaac   case ELASTICITY:
75320cf1dd8SToby Isaac     f1Flops = 2*dim*dim;break;
75420cf1dd8SToby Isaac   }
75520cf1dd8SToby Isaac   numFlops = Ne*(
75620cf1dd8SToby Isaac     N_q*(
75720cf1dd8SToby Isaac       N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
75820cf1dd8SToby Isaac       /*+
75920cf1dd8SToby Isaac        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
76020cf1dd8SToby Isaac       +
76120cf1dd8SToby Isaac       N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
76220cf1dd8SToby Isaac     +
76320cf1dd8SToby Isaac     N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
76420cf1dd8SToby Isaac   ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr);
76520cf1dd8SToby Isaac   /* Cleanup */
76620cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr);
76720cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr);
76820cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr);
76920cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr);
77020cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr);
77120cf1dd8SToby Isaac   ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
77220cf1dd8SToby Isaac   ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
77320cf1dd8SToby Isaac   PetscFunctionReturn(0);
77420cf1dd8SToby Isaac }
77520cf1dd8SToby Isaac 
776526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
777526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
7782e47ffbbSToby Isaac 
7792b99622eSMatthew G. Knepley static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
78020cf1dd8SToby Isaac {
78120cf1dd8SToby Isaac   PetscFunctionBegin;
78220cf1dd8SToby Isaac   fem->ops->setfromoptions          = NULL;
78320cf1dd8SToby Isaac   fem->ops->setup                   = PetscFESetUp_Basic;
78420cf1dd8SToby Isaac   fem->ops->view                    = NULL;
78520cf1dd8SToby Isaac   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
78620cf1dd8SToby Isaac   fem->ops->getdimension            = PetscFEGetDimension_Basic;
787ef0bb6c7SMatthew G. Knepley   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
78820cf1dd8SToby Isaac   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
78920cf1dd8SToby Isaac   fem->ops->integratebdresidual     = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
79020cf1dd8SToby Isaac   fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
79120cf1dd8SToby Isaac   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
79220cf1dd8SToby Isaac   PetscFunctionReturn(0);
79320cf1dd8SToby Isaac }
79420cf1dd8SToby Isaac 
79520cf1dd8SToby Isaac /*MC
79620cf1dd8SToby Isaac   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
79720cf1dd8SToby Isaac 
79820cf1dd8SToby Isaac   Level: intermediate
79920cf1dd8SToby Isaac 
80020cf1dd8SToby Isaac .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
80120cf1dd8SToby Isaac M*/
80220cf1dd8SToby Isaac 
80320cf1dd8SToby Isaac PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
80420cf1dd8SToby Isaac {
80520cf1dd8SToby Isaac   PetscFE_OpenCL *ocl;
80620cf1dd8SToby Isaac   cl_uint         num_platforms;
80720cf1dd8SToby Isaac   cl_platform_id  platform_ids[42];
80820cf1dd8SToby Isaac   cl_uint         num_devices;
80920cf1dd8SToby Isaac   cl_device_id    device_ids[42];
8102da392ccSBarry Smith   cl_int          err;
81120cf1dd8SToby Isaac   PetscErrorCode  ierr;
81220cf1dd8SToby Isaac 
81320cf1dd8SToby Isaac   PetscFunctionBegin;
81420cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
81520cf1dd8SToby Isaac   ierr      = PetscNewLog(fem,&ocl);CHKERRQ(ierr);
81620cf1dd8SToby Isaac   fem->data = ocl;
81720cf1dd8SToby Isaac 
81820cf1dd8SToby Isaac   /* Init Platform */
81920cf1dd8SToby Isaac   ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr);
82020cf1dd8SToby Isaac   if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
82120cf1dd8SToby Isaac   ocl->pf_id = platform_ids[0];
82220cf1dd8SToby Isaac   /* Init Device */
82320cf1dd8SToby Isaac   ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr);
82420cf1dd8SToby Isaac   if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
82520cf1dd8SToby Isaac   ocl->dev_id = device_ids[0];
82620cf1dd8SToby Isaac   /* Create context with one command queue */
8272da392ccSBarry Smith   ocl->ctx_id   = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);CHKERRQ(err);
8282da392ccSBarry Smith   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);CHKERRQ(err);
82920cf1dd8SToby Isaac   /* Types */
83020cf1dd8SToby Isaac   ocl->realType = PETSC_FLOAT;
83120cf1dd8SToby Isaac   /* Register events */
83220cf1dd8SToby Isaac   ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr);
83320cf1dd8SToby Isaac   /* Equation handling */
83420cf1dd8SToby Isaac   ocl->op = LAPLACIAN;
83520cf1dd8SToby Isaac 
83620cf1dd8SToby Isaac   ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr);
83720cf1dd8SToby Isaac   PetscFunctionReturn(0);
83820cf1dd8SToby Isaac }
83920cf1dd8SToby Isaac 
8402b99622eSMatthew G. Knepley /*@
8412b99622eSMatthew G. Knepley   PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
8422b99622eSMatthew G. Knepley 
8432b99622eSMatthew G. Knepley   Input Parameters:
8442b99622eSMatthew G. Knepley + fem      - The PetscFE
8452b99622eSMatthew G. Knepley - realType - The scalar type
8462b99622eSMatthew G. Knepley 
8472b99622eSMatthew G. Knepley   Level: developer
8482b99622eSMatthew G. Knepley 
8492b99622eSMatthew G. Knepley .seealso: PetscFEOpenCLGetRealType()
8502b99622eSMatthew G. Knepley @*/
85120cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
85220cf1dd8SToby Isaac {
85320cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
85420cf1dd8SToby Isaac 
85520cf1dd8SToby Isaac   PetscFunctionBegin;
85620cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
85720cf1dd8SToby Isaac   ocl->realType = realType;
85820cf1dd8SToby Isaac   PetscFunctionReturn(0);
85920cf1dd8SToby Isaac }
86020cf1dd8SToby Isaac 
8612b99622eSMatthew G. Knepley /*@
8622b99622eSMatthew G. Knepley   PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
8632b99622eSMatthew G. Knepley 
8642b99622eSMatthew G. Knepley   Input Parameter:
8652b99622eSMatthew G. Knepley . fem      - The PetscFE
8662b99622eSMatthew G. Knepley 
8672b99622eSMatthew G. Knepley   Output Parameter:
8682b99622eSMatthew G. Knepley . realType - The scalar type
8692b99622eSMatthew G. Knepley 
8702b99622eSMatthew G. Knepley   Level: developer
8712b99622eSMatthew G. Knepley 
8722b99622eSMatthew G. Knepley .seealso: PetscFEOpenCLSetRealType()
8732b99622eSMatthew G. Knepley @*/
87420cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
87520cf1dd8SToby Isaac {
87620cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
87720cf1dd8SToby Isaac 
87820cf1dd8SToby Isaac   PetscFunctionBegin;
87920cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
88020cf1dd8SToby Isaac   PetscValidPointer(realType, 2);
88120cf1dd8SToby Isaac   *realType = ocl->realType;
88220cf1dd8SToby Isaac   PetscFunctionReturn(0);
88320cf1dd8SToby Isaac }
88420cf1dd8SToby Isaac 
88520cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */
886