xref: /petsc/src/dm/dt/fe/impls/opencl/feopencl.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
120cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
220cf1dd8SToby Isaac 
37be5e748SToby Isaac #if defined(PETSC_HAVE_OPENCL)
420cf1dd8SToby Isaac 
59371c9d4SSatish Balay static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) {
620cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
720cf1dd8SToby Isaac 
820cf1dd8SToby Isaac   PetscFunctionBegin;
99566063dSJacob Faibussowitsch   PetscCall(clReleaseCommandQueue(ocl->queue_id));
1020cf1dd8SToby Isaac   ocl->queue_id = 0;
119566063dSJacob Faibussowitsch   PetscCall(clReleaseContext(ocl->ctx_id));
1220cf1dd8SToby Isaac   ocl->ctx_id = 0;
139566063dSJacob Faibussowitsch   PetscCall(PetscFree(ocl));
1420cf1dd8SToby Isaac   PetscFunctionReturn(0);
1520cf1dd8SToby Isaac }
1620cf1dd8SToby Isaac 
179371c9d4SSatish Balay #define PetscCallSTR(err) \
189371c9d4SSatish Balay   do { \
199371c9d4SSatish Balay     PetscCall(err); \
209371c9d4SSatish Balay     string_tail += count; \
219371c9d4SSatish Balay     PetscCheck(string_tail != end_of_buffer, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Buffer overflow"); \
229371c9d4SSatish Balay   } while (0)
239371c9d4SSatish Balay enum {
249371c9d4SSatish Balay   LAPLACIAN  = 0,
259371c9d4SSatish Balay   ELASTICITY = 1
269371c9d4SSatish Balay };
2720cf1dd8SToby Isaac 
2820cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
2920cf1dd8SToby Isaac /* dim     Number of spatial dimensions:          2                   */
3020cf1dd8SToby Isaac /* N_b     Number of basis functions:             generated           */
3120cf1dd8SToby Isaac /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
3220cf1dd8SToby Isaac /* N_q     Number of quadrature points:           generated           */
3320cf1dd8SToby Isaac /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
3420cf1dd8SToby Isaac /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
3520cf1dd8SToby Isaac /* N_{bl}  Number of concurrent blocks            generated           */
3620cf1dd8SToby Isaac /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
3720cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
3820cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
3920cf1dd8SToby Isaac /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
4020cf1dd8SToby Isaac /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
4120cf1dd8SToby Isaac /* N_{cb}  Number of serial cell batches:         input               */
4220cf1dd8SToby Isaac /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
439371c9d4SSatish Balay static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) {
4420cf1dd8SToby Isaac   PetscFE_OpenCL  *ocl = (PetscFE_OpenCL *)fem->data;
4520cf1dd8SToby Isaac   PetscQuadrature  q;
4620cf1dd8SToby Isaac   char            *string_tail   = *string_buffer;
4720cf1dd8SToby Isaac   char            *end_of_buffer = *string_buffer + buffer_length;
4820cf1dd8SToby Isaac   char             float_str[] = "float", double_str[] = "double";
4920cf1dd8SToby Isaac   char            *numeric_str    = &(float_str[0]);
5020cf1dd8SToby Isaac   PetscInt         op             = ocl->op;
5120cf1dd8SToby Isaac   PetscBool        useField       = PETSC_FALSE;
5220cf1dd8SToby Isaac   PetscBool        useFieldDer    = PETSC_TRUE;
5320cf1dd8SToby Isaac   PetscBool        useFieldAux    = useAux;
5420cf1dd8SToby Isaac   PetscBool        useFieldDerAux = PETSC_FALSE;
5520cf1dd8SToby Isaac   PetscBool        useF0          = PETSC_TRUE;
5620cf1dd8SToby Isaac   PetscBool        useF1          = PETSC_TRUE;
5720cf1dd8SToby Isaac   const PetscReal *points, *weights;
58ef0bb6c7SMatthew G. Knepley   PetscTabulation  T;
5920cf1dd8SToby Isaac   PetscInt         dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
6020cf1dd8SToby Isaac   size_t           count;
6120cf1dd8SToby Isaac 
6220cf1dd8SToby Isaac   PetscFunctionBegin;
639566063dSJacob Faibussowitsch   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
649566063dSJacob Faibussowitsch   PetscCall(PetscFEGetDimension(fem, &N_b));
659566063dSJacob Faibussowitsch   PetscCall(PetscFEGetNumComponents(fem, &N_c));
669566063dSJacob Faibussowitsch   PetscCall(PetscFEGetQuadrature(fem, &q));
679566063dSJacob Faibussowitsch   PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
685f80ce2aSJacob Faibussowitsch   PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
6920cf1dd8SToby Isaac   N_t = N_b * N_c * N_q * N_bl;
7020cf1dd8SToby Isaac   /* Enable device extension for double precision */
7120cf1dd8SToby Isaac   if (ocl->realType == PETSC_DOUBLE) {
729566063dSJacob Faibussowitsch     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
7320cf1dd8SToby Isaac                                     "#if defined(cl_khr_fp64)\n"
7420cf1dd8SToby Isaac                                     "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
7520cf1dd8SToby Isaac                                     "#elif defined(cl_amd_fp64)\n"
7620cf1dd8SToby Isaac                                     "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
7720cf1dd8SToby Isaac                                     "#endif\n",
785f80ce2aSJacob Faibussowitsch                                     &count));
7920cf1dd8SToby Isaac     numeric_str = &(double_str[0]);
8020cf1dd8SToby Isaac   }
8120cf1dd8SToby Isaac   /* Kernel API */
829566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
8320cf1dd8SToby Isaac                                   "\n"
8420cf1dd8SToby Isaac                                   "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
8520cf1dd8SToby Isaac                                   "{\n",
865f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
8720cf1dd8SToby Isaac   /* Quadrature */
889566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
8920cf1dd8SToby Isaac                                   "  /* Quadrature points\n"
9020cf1dd8SToby Isaac                                   "   - (x1,y1,x2,y2,...) */\n"
9120cf1dd8SToby Isaac                                   "  const %s points[%d] = {\n",
925f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, N_q * dim));
9320cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
94*48a46eb9SPierre Jolivet     for (d = 0; d < dim; ++d) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d]));
9520cf1dd8SToby Isaac   }
969566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
979566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
9820cf1dd8SToby Isaac                                   "  /* Quadrature weights\n"
9920cf1dd8SToby Isaac                                   "   - (v1,v2,...) */\n"
10020cf1dd8SToby Isaac                                   "  const %s weights[%d] = {\n",
1015f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, N_q));
102*48a46eb9SPierre Jolivet   for (p = 0; p < N_q; ++p) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]));
1039566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
10420cf1dd8SToby Isaac   /* Basis Functions */
1059566063dSJacob Faibussowitsch   PetscCall(PetscFEGetCellTabulation(fem, 1, &T));
1069566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
10720cf1dd8SToby Isaac                                   "  /* Nodal basis function evaluations\n"
10820cf1dd8SToby Isaac                                   "    - basis component is fastest varying, the basis function, then point */\n"
10920cf1dd8SToby Isaac                                   "  const %s Basis[%d] = {\n",
1105f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, N_q * N_b * N_c));
11120cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
11220cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
113*48a46eb9SPierre Jolivet       for (c = 0; c < N_c; ++c) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c]));
11420cf1dd8SToby Isaac     }
11520cf1dd8SToby Isaac   }
1169566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
1179566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
11820cf1dd8SToby Isaac                                   "\n"
11920cf1dd8SToby Isaac                                   "  /* Nodal basis function derivative evaluations,\n"
12020cf1dd8SToby Isaac                                   "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
12120cf1dd8SToby Isaac                                   "  const %s%d BasisDerivatives[%d] = {\n",
1225f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, dim, N_q * N_b * N_c));
12320cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
12420cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
12520cf1dd8SToby Isaac       for (c = 0; c < N_c; ++c) {
1269566063dSJacob Faibussowitsch         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim));
12720cf1dd8SToby Isaac         for (d = 0; d < dim; ++d) {
12820cf1dd8SToby Isaac           if (d > 0) {
1299566063dSJacob Faibussowitsch             PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
13020cf1dd8SToby Isaac           } else {
1319566063dSJacob Faibussowitsch             PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
13220cf1dd8SToby Isaac           }
13320cf1dd8SToby Isaac         }
1349566063dSJacob Faibussowitsch         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count));
13520cf1dd8SToby Isaac       }
13620cf1dd8SToby Isaac     }
13720cf1dd8SToby Isaac   }
1389566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
13920cf1dd8SToby Isaac   /* Sizes */
1409566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
14120cf1dd8SToby Isaac                                   "  const int dim    = %d;                           // The spatial dimension\n"
14220cf1dd8SToby Isaac                                   "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
14320cf1dd8SToby Isaac                                   "  const int N_b    = %d;                           // The number of basis functions\n"
14420cf1dd8SToby Isaac                                   "  const int N_comp = %d;                           // The number of basis function components\n"
14520cf1dd8SToby Isaac                                   "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
14620cf1dd8SToby Isaac                                   "  const int N_q    = %d;                           // The number of quadrature points\n"
14720cf1dd8SToby Isaac                                   "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
14820cf1dd8SToby Isaac                                   "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
14920cf1dd8SToby Isaac                                   "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
15020cf1dd8SToby Isaac                                   "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
15120cf1dd8SToby Isaac                                   "  const int N_sqc  = N_bst / N_bt;\n"
15220cf1dd8SToby Isaac                                   "  /*const int N_c    = N_cb * N_bc;*/\n"
15320cf1dd8SToby Isaac                                   "\n"
15420cf1dd8SToby Isaac                                   "  /* Calculated indices */\n"
15520cf1dd8SToby Isaac                                   "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
15620cf1dd8SToby Isaac                                   "  const int tidx    = get_local_id(0);\n"
15720cf1dd8SToby Isaac                                   "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
15820cf1dd8SToby Isaac                                   "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
15920cf1dd8SToby Isaac                                   "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
16020cf1dd8SToby Isaac                                   "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
16120cf1dd8SToby Isaac                                   "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
16220cf1dd8SToby Isaac                                   "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
16320cf1dd8SToby Isaac                                   "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
16420cf1dd8SToby Isaac                                   "  const int Goffset = gidx*N_cb*N_bc;\n",
1655f80ce2aSJacob Faibussowitsch                                   &count, dim, N_bl, N_b, N_c, N_q));
16620cf1dd8SToby Isaac   /* Local memory */
1679566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
16820cf1dd8SToby Isaac                                   "\n"
16920cf1dd8SToby Isaac                                   "  /* Quadrature data */\n"
17020cf1dd8SToby Isaac                                   "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
17120cf1dd8SToby Isaac                                   "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
17220cf1dd8SToby Isaac                                   "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
17320cf1dd8SToby Isaac                                   "  /* Geometric data */\n"
17420cf1dd8SToby Isaac                                   "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
17520cf1dd8SToby Isaac                                   "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
1769371c9d4SSatish Balay                                   &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim));
1779566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
17820cf1dd8SToby Isaac                                   "  /* FEM data */\n"
17920cf1dd8SToby Isaac                                   "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
1805f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, N_t * N_b * N_c));
18120cf1dd8SToby Isaac   if (useAux) {
1829371c9d4SSatish Balay     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t));
18320cf1dd8SToby Isaac   }
18420cf1dd8SToby Isaac   if (useF0) {
1859566063dSJacob Faibussowitsch     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
18620cf1dd8SToby Isaac                                     "  /* Intermediate calculations */\n"
18720cf1dd8SToby Isaac                                     "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
1885f80ce2aSJacob Faibussowitsch                                     &count, numeric_str, N_t * N_q));
18920cf1dd8SToby Isaac   }
190*48a46eb9SPierre Jolivet   if (useF1) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q));
19120cf1dd8SToby Isaac   /* TODO: If using elasticity, put in mu/lambda coefficients */
1929566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19320cf1dd8SToby Isaac                                   "  /* Output data */\n"
19420cf1dd8SToby Isaac                                   "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
1955f80ce2aSJacob Faibussowitsch                                   &count, numeric_str));
19620cf1dd8SToby Isaac   /* One-time loads */
1979566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19820cf1dd8SToby Isaac                                   "  /* These should be generated inline */\n"
19920cf1dd8SToby Isaac                                   "  /* Load quadrature weights */\n"
20020cf1dd8SToby Isaac                                   "  w = weights[qidx];\n"
20120cf1dd8SToby Isaac                                   "  /* Load basis tabulation \\phi_i for this cell */\n"
20220cf1dd8SToby Isaac                                   "  if (tidx < N_bt*N_q) {\n"
20320cf1dd8SToby Isaac                                   "    phi_i[tidx]    = Basis[tidx];\n"
20420cf1dd8SToby Isaac                                   "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
20520cf1dd8SToby Isaac                                   "  }\n\n",
2065f80ce2aSJacob Faibussowitsch                                   &count));
20720cf1dd8SToby Isaac   /* Batch loads */
2089566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
20920cf1dd8SToby Isaac                                   "  for (int batch = 0; batch < N_cb; ++batch) {\n"
21020cf1dd8SToby Isaac                                   "    /* Load geometry */\n"
21120cf1dd8SToby Isaac                                   "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
21220cf1dd8SToby Isaac                                   "    for (int n = 0; n < dim*dim; ++n) {\n"
21320cf1dd8SToby Isaac                                   "      const int offset = n*N_t;\n"
21420cf1dd8SToby Isaac                                   "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
21520cf1dd8SToby Isaac                                   "    }\n"
21620cf1dd8SToby Isaac                                   "    /* Load coefficients u_i for this cell */\n"
21720cf1dd8SToby Isaac                                   "    for (int n = 0; n < N_bt; ++n) {\n"
21820cf1dd8SToby Isaac                                   "      const int offset = n*N_t;\n"
21920cf1dd8SToby Isaac                                   "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
22020cf1dd8SToby Isaac                                   "    }\n",
2215f80ce2aSJacob Faibussowitsch                                   &count));
22220cf1dd8SToby Isaac   if (useAux) {
2239566063dSJacob Faibussowitsch     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
22420cf1dd8SToby Isaac                                     "    /* Load coefficients a_i for this cell */\n"
22520cf1dd8SToby Isaac                                     "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
22620cf1dd8SToby Isaac                                     "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
2275f80ce2aSJacob Faibussowitsch                                     &count));
22820cf1dd8SToby Isaac   }
22920cf1dd8SToby Isaac   /* Quadrature phase */
2309566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
23120cf1dd8SToby Isaac                                   "    barrier(CLK_LOCAL_MEM_FENCE);\n"
23220cf1dd8SToby Isaac                                   "\n"
23320cf1dd8SToby Isaac                                   "    /* Map coefficients to values at quadrature points */\n"
23420cf1dd8SToby Isaac                                   "    for (int c = 0; c < N_sqc; ++c) {\n"
23520cf1dd8SToby Isaac                                   "      const int cell          = c*N_bl*N_b + blqidx;\n"
23620cf1dd8SToby Isaac                                   "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
2375f80ce2aSJacob Faibussowitsch                                   &count));
238*48a46eb9SPierre Jolivet   if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c));
239*48a46eb9SPierre Jolivet   if (useFieldDer) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c));
240*48a46eb9SPierre Jolivet   if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1));
241*48a46eb9SPierre Jolivet   if (useFieldDerAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1));
2429566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
24320cf1dd8SToby Isaac                                   "\n"
24420cf1dd8SToby Isaac                                   "      for (int comp = 0; comp < N_comp; ++comp) {\n",
2455f80ce2aSJacob Faibussowitsch                                   &count));
2469566063dSJacob Faibussowitsch   if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count));
24720cf1dd8SToby Isaac   if (useFieldDer) {
24820cf1dd8SToby Isaac     switch (dim) {
2499371c9d4SSatish Balay     case 1: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count)); break;
2509371c9d4SSatish Balay     case 2: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count)); break;
2519371c9d4SSatish Balay     case 3: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count)); break;
25220cf1dd8SToby Isaac     }
25320cf1dd8SToby Isaac   }
2549371c9d4SSatish Balay   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      }\n", &count));
255*48a46eb9SPierre Jolivet   if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count));
25620cf1dd8SToby Isaac   if (useFieldDerAux) {
25720cf1dd8SToby Isaac     switch (dim) {
2589371c9d4SSatish Balay     case 1: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count)); break;
2599371c9d4SSatish Balay     case 2: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count)); break;
2609371c9d4SSatish Balay     case 3: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count)); break;
26120cf1dd8SToby Isaac     }
26220cf1dd8SToby Isaac   }
2639566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
26420cf1dd8SToby Isaac                                   "      /* Get field and derivatives at this quadrature point */\n"
26520cf1dd8SToby Isaac                                   "      for (int i = 0; i < N_b; ++i) {\n"
26620cf1dd8SToby Isaac                                   "        for (int comp = 0; comp < N_comp; ++comp) {\n"
26720cf1dd8SToby Isaac                                   "          const int b    = i*N_comp+comp;\n"
26820cf1dd8SToby Isaac                                   "          const int pidx = qidx*N_bt + b;\n"
26920cf1dd8SToby Isaac                                   "          const int uidx = cell*N_bt + b;\n"
27020cf1dd8SToby Isaac                                   "          %s%d   realSpaceDer;\n\n",
2715f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, dim));
2729566063dSJacob Faibussowitsch   if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count));
27320cf1dd8SToby Isaac   if (useFieldDer) {
27420cf1dd8SToby Isaac     switch (dim) {
27520cf1dd8SToby Isaac     case 2:
2769566063dSJacob Faibussowitsch       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
27720cf1dd8SToby Isaac                                       "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
27820cf1dd8SToby Isaac                                       "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
27920cf1dd8SToby Isaac                                       "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
28020cf1dd8SToby Isaac                                       "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
2819371c9d4SSatish Balay                                       &count));
2829371c9d4SSatish Balay       break;
28320cf1dd8SToby Isaac     case 3:
2849566063dSJacob Faibussowitsch       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
28520cf1dd8SToby Isaac                                       "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
28620cf1dd8SToby Isaac                                       "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
28720cf1dd8SToby Isaac                                       "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
28820cf1dd8SToby Isaac                                       "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
28920cf1dd8SToby Isaac                                       "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
29020cf1dd8SToby Isaac                                       "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
2919371c9d4SSatish Balay                                       &count));
2929371c9d4SSatish Balay       break;
29320cf1dd8SToby Isaac     }
29420cf1dd8SToby Isaac   }
2959566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
29620cf1dd8SToby Isaac                                   "        }\n"
29720cf1dd8SToby Isaac                                   "      }\n",
2985f80ce2aSJacob Faibussowitsch                                   &count));
299*48a46eb9SPierre Jolivet   if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "          a[0] += a_i[cell];\n", &count));
30020cf1dd8SToby Isaac   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
3019371c9d4SSatish Balay   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      /* Process values at quadrature points */\n", &count));
30220cf1dd8SToby Isaac   switch (op) {
30320cf1dd8SToby Isaac   case LAPLACIAN:
304*48a46eb9SPierre Jolivet     if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count));
30520cf1dd8SToby Isaac     if (useF1) {
3069566063dSJacob Faibussowitsch       if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count));
3079566063dSJacob Faibussowitsch       else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count));
30820cf1dd8SToby Isaac     }
30920cf1dd8SToby Isaac     break;
31020cf1dd8SToby Isaac   case ELASTICITY:
3119566063dSJacob Faibussowitsch     if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count));
31220cf1dd8SToby Isaac     if (useF1) {
31320cf1dd8SToby Isaac       switch (dim) {
31420cf1dd8SToby Isaac       case 2:
3159566063dSJacob Faibussowitsch         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
31620cf1dd8SToby Isaac                                         "      switch (cidx) {\n"
31720cf1dd8SToby Isaac                                         "      case 0:\n"
31820cf1dd8SToby Isaac                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
31920cf1dd8SToby Isaac                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
32020cf1dd8SToby Isaac                                         "        break;\n"
32120cf1dd8SToby Isaac                                         "      case 1:\n"
32220cf1dd8SToby Isaac                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
32320cf1dd8SToby Isaac                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
32420cf1dd8SToby Isaac                                         "      }\n",
3259371c9d4SSatish Balay                                         &count));
3269371c9d4SSatish Balay         break;
32720cf1dd8SToby Isaac       case 3:
3289566063dSJacob Faibussowitsch         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
32920cf1dd8SToby Isaac                                         "      switch (cidx) {\n"
33020cf1dd8SToby Isaac                                         "      case 0:\n"
33120cf1dd8SToby Isaac                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
33220cf1dd8SToby Isaac                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
33320cf1dd8SToby Isaac                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
33420cf1dd8SToby Isaac                                         "        break;\n"
33520cf1dd8SToby Isaac                                         "      case 1:\n"
33620cf1dd8SToby Isaac                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
33720cf1dd8SToby Isaac                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
33820cf1dd8SToby Isaac                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
33920cf1dd8SToby Isaac                                         "        break;\n"
34020cf1dd8SToby Isaac                                         "      case 2:\n"
34120cf1dd8SToby Isaac                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
34220cf1dd8SToby Isaac                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
34320cf1dd8SToby Isaac                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
34420cf1dd8SToby Isaac                                         "      }\n",
3459371c9d4SSatish Balay                                         &count));
34620cf1dd8SToby Isaac         break;
3479371c9d4SSatish Balay       }
3489371c9d4SSatish Balay     }
3499371c9d4SSatish Balay     break;
3509371c9d4SSatish Balay   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
35120cf1dd8SToby Isaac   }
3529566063dSJacob Faibussowitsch   if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] *= detJ[cell]*w;\n", &count));
35320cf1dd8SToby Isaac   if (useF1) {
35420cf1dd8SToby Isaac     switch (dim) {
3559371c9d4SSatish Balay     case 1: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w;\n", &count)); break;
3569371c9d4SSatish Balay     case 2: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count)); break;
3579371c9d4SSatish Balay     case 3: PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count)); break;
35820cf1dd8SToby Isaac     }
35920cf1dd8SToby Isaac   }
36020cf1dd8SToby Isaac   /* Thread transpose */
3619566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
36220cf1dd8SToby Isaac                                   "    }\n\n"
36320cf1dd8SToby Isaac                                   "    /* ==== TRANSPOSE THREADS ==== */\n"
36420cf1dd8SToby Isaac                                   "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
3655f80ce2aSJacob Faibussowitsch                                   &count));
36620cf1dd8SToby Isaac   /* Basis phase */
3679566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
36820cf1dd8SToby Isaac                                   "    /* Map values at quadrature points to coefficients */\n"
36920cf1dd8SToby Isaac                                   "    for (int c = 0; c < N_sbc; ++c) {\n"
37020cf1dd8SToby Isaac                                   "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
37120cf1dd8SToby Isaac                                   "\n"
37220cf1dd8SToby Isaac                                   "      e_i = 0.0;\n"
37320cf1dd8SToby Isaac                                   "      for (int q = 0; q < N_q; ++q) {\n"
37420cf1dd8SToby Isaac                                   "        const int pidx = q*N_bt + bidx;\n"
37520cf1dd8SToby Isaac                                   "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
37620cf1dd8SToby Isaac                                   "        %s%d   realSpaceDer;\n\n",
3775f80ce2aSJacob Faibussowitsch                                   &count, numeric_str, dim));
37820cf1dd8SToby Isaac 
3799566063dSJacob Faibussowitsch   if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        e_i += phi_i[pidx]*f_0[fidx];\n", &count));
38020cf1dd8SToby Isaac   if (useF1) {
38120cf1dd8SToby Isaac     switch (dim) {
38220cf1dd8SToby Isaac     case 2:
3839566063dSJacob Faibussowitsch       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
38420cf1dd8SToby Isaac                                       "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
38520cf1dd8SToby Isaac                                       "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
38620cf1dd8SToby Isaac                                       "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
38720cf1dd8SToby Isaac                                       "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
3889371c9d4SSatish Balay                                       &count));
3899371c9d4SSatish Balay       break;
39020cf1dd8SToby Isaac     case 3:
3919566063dSJacob Faibussowitsch       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
39220cf1dd8SToby Isaac                                       "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
39320cf1dd8SToby Isaac                                       "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
39420cf1dd8SToby Isaac                                       "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
39520cf1dd8SToby Isaac                                       "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
39620cf1dd8SToby Isaac                                       "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
39720cf1dd8SToby Isaac                                       "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
3989371c9d4SSatish Balay                                       &count));
3999371c9d4SSatish Balay       break;
40020cf1dd8SToby Isaac     }
40120cf1dd8SToby Isaac   }
4029566063dSJacob Faibussowitsch   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
40320cf1dd8SToby Isaac                                   "      }\n"
40420cf1dd8SToby Isaac                                   "      /* Write element vector for N_{cbc} cells at a time */\n"
40520cf1dd8SToby Isaac                                   "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
40620cf1dd8SToby Isaac                                   "    }\n"
40720cf1dd8SToby Isaac                                   "    /* ==== Could do one write per batch ==== */\n"
40820cf1dd8SToby Isaac                                   "  }\n"
40920cf1dd8SToby Isaac                                   "  return;\n"
41020cf1dd8SToby Isaac                                   "}\n",
4115f80ce2aSJacob Faibussowitsch                                   &count));
41220cf1dd8SToby Isaac   PetscFunctionReturn(0);
41320cf1dd8SToby Isaac }
41420cf1dd8SToby Isaac 
4159371c9d4SSatish Balay static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) {
41620cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
41720cf1dd8SToby Isaac   PetscInt        dim, N_bl;
41820cf1dd8SToby Isaac   PetscBool       flg;
41920cf1dd8SToby Isaac   char           *buffer;
42020cf1dd8SToby Isaac   size_t          len;
42120cf1dd8SToby Isaac   char            errMsg[8192];
4222da392ccSBarry Smith   cl_int          err;
42320cf1dd8SToby Isaac 
42420cf1dd8SToby Isaac   PetscFunctionBegin;
4259566063dSJacob Faibussowitsch   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
4269566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(8192, &buffer));
4279566063dSJacob Faibussowitsch   PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL));
4289566063dSJacob Faibussowitsch   PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl));
4299566063dSJacob Faibussowitsch   PetscCall(PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg));
4309566063dSJacob Faibussowitsch   if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer));
4319566063dSJacob Faibussowitsch   PetscCall(PetscStrlen(buffer, &len));
4329371c9d4SSatish Balay   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err);
4339371c9d4SSatish Balay   PetscCall(err);
4342da392ccSBarry Smith   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
4352da392ccSBarry Smith   if (err != CL_SUCCESS) {
4362f613bf5SBarry Smith     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL);
43798921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
43820cf1dd8SToby Isaac   }
4399566063dSJacob Faibussowitsch   PetscCall(PetscFree(buffer));
440d0609cedSBarry Smith   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err);
44120cf1dd8SToby Isaac   PetscFunctionReturn(0);
44220cf1dd8SToby Isaac }
44320cf1dd8SToby Isaac 
4449371c9d4SSatish Balay static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) {
44520cf1dd8SToby Isaac   const PetscInt Nblocks = N / blockSize;
44620cf1dd8SToby Isaac 
44720cf1dd8SToby Isaac   PetscFunctionBegin;
4485f80ce2aSJacob Faibussowitsch   PetscCheck(!(N % blockSize), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
44920cf1dd8SToby Isaac   *z = 1;
450623c9f23SSatish Balay   *y = 1;
45120cf1dd8SToby Isaac   for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
45220cf1dd8SToby Isaac     *y = Nblocks / *x;
453526996e7SStefano Zampini     if (*x * *y == (size_t)Nblocks) break;
45420cf1dd8SToby Isaac   }
45563a3b9bcSJacob Faibussowitsch   PetscCheck(*x * *y == (size_t)Nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %" PetscInt_FMT " with block size %" PetscInt_FMT, N, blockSize);
45620cf1dd8SToby Isaac   PetscFunctionReturn(0);
45720cf1dd8SToby Isaac }
45820cf1dd8SToby Isaac 
4599371c9d4SSatish Balay static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) {
46020cf1dd8SToby Isaac   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *)fem->data;
46120cf1dd8SToby Isaac   PetscStageLog     stageLog;
46220cf1dd8SToby Isaac   PetscEventPerfLog eventLog = NULL;
463afbfd3a7SStefano Zampini   int               stage;
46420cf1dd8SToby Isaac 
46520cf1dd8SToby Isaac   PetscFunctionBegin;
4669566063dSJacob Faibussowitsch   PetscCall(PetscLogGetStageLog(&stageLog));
4679566063dSJacob Faibussowitsch   PetscCall(PetscStageLogGetCurrent(stageLog, &stage));
4689566063dSJacob Faibussowitsch   PetscCall(PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog));
46920cf1dd8SToby Isaac   /* Log performance info */
47020cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].count++;
47120cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].time += time;
47220cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].flops += flops;
47320cf1dd8SToby Isaac   PetscFunctionReturn(0);
47420cf1dd8SToby Isaac }
47520cf1dd8SToby Isaac 
4769371c9d4SSatish Balay static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) {
47720cf1dd8SToby Isaac   /* Nbc = batchSize */
478849189efSMatthew G. Knepley   PetscFE          fem;
479849189efSMatthew G. Knepley   PetscFE_OpenCL  *ocl;
48020cf1dd8SToby Isaac   PetscPointFunc   f0_func;
48120cf1dd8SToby Isaac   PetscPointFunc   f1_func;
48220cf1dd8SToby Isaac   PetscQuadrature  q;
48320cf1dd8SToby Isaac   PetscInt         dim, qNc;
48420cf1dd8SToby Isaac   PetscInt         N_b;    /* The number of basis functions */
48520cf1dd8SToby Isaac   PetscInt         N_comp; /* The number of basis function components */
48620cf1dd8SToby Isaac   PetscInt         N_bt;   /* The total number of scalar basis functions */
48720cf1dd8SToby Isaac   PetscInt         N_q;    /* The number of quadrature points */
48820cf1dd8SToby Isaac   PetscInt         N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
48920cf1dd8SToby Isaac   PetscInt         N_t;    /* The number of threads, N_bst * N_bl */
49020cf1dd8SToby Isaac   PetscInt         N_bl;   /* The number of blocks */
49120cf1dd8SToby Isaac   PetscInt         N_bc;   /* The batch size, N_bl*N_q*N_b */
49220cf1dd8SToby Isaac   PetscInt         N_cb;   /* The number of batches */
4936528b96dSMatthew G. Knepley   const PetscInt   field = key.field;
49420cf1dd8SToby Isaac   PetscInt         numFlops, f0Flops = 0, f1Flops = 0;
49520cf1dd8SToby Isaac   PetscBool        useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
49620cf1dd8SToby Isaac   PetscBool        useField    = PETSC_FALSE;
49720cf1dd8SToby Isaac   PetscBool        useFieldDer = PETSC_TRUE;
49820cf1dd8SToby Isaac   PetscBool        useF0       = PETSC_TRUE;
49920cf1dd8SToby Isaac   PetscBool        useF1       = PETSC_TRUE;
50020cf1dd8SToby Isaac   /* OpenCL variables */
50120cf1dd8SToby Isaac   cl_program       ocl_prog;
50220cf1dd8SToby Isaac   cl_kernel        ocl_kernel;
50320cf1dd8SToby Isaac   cl_event         ocl_ev;   /* The event for tracking kernel execution */
50420cf1dd8SToby Isaac   cl_ulong         ns_start; /* Nanoseconds counter on GPU at kernel start */
50520cf1dd8SToby Isaac   cl_ulong         ns_end;   /* Nanoseconds counter on GPU at kernel stop */
50620cf1dd8SToby Isaac   cl_mem           o_jacobianInverses, o_jacobianDeterminants;
50720cf1dd8SToby Isaac   cl_mem           o_coefficients, o_coefficientsAux, o_elemVec;
50820cf1dd8SToby Isaac   float           *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
50920cf1dd8SToby Isaac   double          *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
51020cf1dd8SToby Isaac   PetscReal       *r_invJ = NULL, *r_detJ = NULL;
51120cf1dd8SToby Isaac   void            *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
51220cf1dd8SToby Isaac   size_t           local_work_size[3], global_work_size[3];
51320cf1dd8SToby Isaac   size_t           realSize, x, y, z;
51420cf1dd8SToby Isaac   const PetscReal *points, *weights;
515d0609cedSBarry Smith   int              err;
51620cf1dd8SToby Isaac 
51720cf1dd8SToby Isaac   PetscFunctionBegin;
5189566063dSJacob Faibussowitsch   PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *)&fem));
519849189efSMatthew G. Knepley   ocl = (PetscFE_OpenCL *)fem->data;
5209371c9d4SSatish Balay   if (!Ne) {
5219371c9d4SSatish Balay     PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0));
5229371c9d4SSatish Balay     PetscFunctionReturn(0);
5239371c9d4SSatish Balay   }
5249566063dSJacob Faibussowitsch   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
5259566063dSJacob Faibussowitsch   PetscCall(PetscFEGetQuadrature(fem, &q));
5269566063dSJacob Faibussowitsch   PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
52763a3b9bcSJacob Faibussowitsch   PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
5289566063dSJacob Faibussowitsch   PetscCall(PetscFEGetDimension(fem, &N_b));
5299566063dSJacob Faibussowitsch   PetscCall(PetscFEGetNumComponents(fem, &N_comp));
5309566063dSJacob Faibussowitsch   PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func));
5319566063dSJacob Faibussowitsch   PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb));
53220cf1dd8SToby Isaac   N_bt  = N_b * N_comp;
53320cf1dd8SToby Isaac   N_bst = N_bt * N_q;
53420cf1dd8SToby Isaac   N_t   = N_bst * N_bl;
5355f80ce2aSJacob Faibussowitsch   PetscCheck(N_bc * N_comp == N_t, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
53620cf1dd8SToby Isaac   /* Calculate layout */
53720cf1dd8SToby Isaac   if (Ne % (N_cb * N_bc)) { /* Remainder cells */
5389566063dSJacob Faibussowitsch     PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec));
53920cf1dd8SToby Isaac     PetscFunctionReturn(0);
54020cf1dd8SToby Isaac   }
5419566063dSJacob Faibussowitsch   PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z));
54220cf1dd8SToby Isaac   local_work_size[0]  = N_bc * N_comp;
54320cf1dd8SToby Isaac   local_work_size[1]  = 1;
54420cf1dd8SToby Isaac   local_work_size[2]  = 1;
54520cf1dd8SToby Isaac   global_work_size[0] = x * local_work_size[0];
54620cf1dd8SToby Isaac   global_work_size[1] = y * local_work_size[1];
54720cf1dd8SToby Isaac   global_work_size[2] = z * local_work_size[2];
54863a3b9bcSJacob Faibussowitsch   PetscCall(PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb));
5499566063dSJacob Faibussowitsch   PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb));
55020cf1dd8SToby Isaac   /* Generate code */
55120cf1dd8SToby Isaac   if (probAux) {
55220cf1dd8SToby Isaac     PetscSpace P;
55320cf1dd8SToby Isaac     PetscInt   NfAux, order, f;
55420cf1dd8SToby Isaac 
5559566063dSJacob Faibussowitsch     PetscCall(PetscDSGetNumFields(probAux, &NfAux));
55620cf1dd8SToby Isaac     for (f = 0; f < NfAux; ++f) {
55720cf1dd8SToby Isaac       PetscFE feAux;
55820cf1dd8SToby Isaac 
5599566063dSJacob Faibussowitsch       PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux));
5609566063dSJacob Faibussowitsch       PetscCall(PetscFEGetBasisSpace(feAux, &P));
5619566063dSJacob Faibussowitsch       PetscCall(PetscSpaceGetDegree(P, &order, NULL));
5625f80ce2aSJacob Faibussowitsch       PetscCheck(order <= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
56320cf1dd8SToby Isaac     }
56420cf1dd8SToby Isaac   }
5659566063dSJacob Faibussowitsch   PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel));
56620cf1dd8SToby Isaac   /* Create buffers on the device and send data over */
5679566063dSJacob Faibussowitsch   PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize));
5685f80ce2aSJacob Faibussowitsch   PetscCheck(cgeom->numPoints <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
56920cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
57020cf1dd8SToby Isaac     switch (ocl->realType) {
5719371c9d4SSatish Balay     case PETSC_FLOAT: {
57220cf1dd8SToby Isaac       PetscInt c, b, d;
57320cf1dd8SToby Isaac 
5749566063dSJacob Faibussowitsch       PetscCall(PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ));
57520cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
5767be5e748SToby Isaac         f_detJ[c] = (float)cgeom->detJ[c];
5779371c9d4SSatish Balay         for (d = 0; d < dim * dim; ++d) { f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d]; }
5789371c9d4SSatish Balay         for (b = 0; b < N_bt; ++b) { f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b]; }
57920cf1dd8SToby Isaac       }
58020cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
5819371c9d4SSatish Balay         for (c = 0; c < Ne; ++c) { f_coeffAux[c] = (float)coefficientsAux[c]; }
58220cf1dd8SToby Isaac       }
58320cf1dd8SToby Isaac       oclCoeff = (void *)f_coeff;
58420cf1dd8SToby Isaac       if (coefficientsAux) {
58520cf1dd8SToby Isaac         oclCoeffAux = (void *)f_coeffAux;
58620cf1dd8SToby Isaac       } else {
58720cf1dd8SToby Isaac         oclCoeffAux = NULL;
58820cf1dd8SToby Isaac       }
58920cf1dd8SToby Isaac       oclInvJ = (void *)f_invJ;
59020cf1dd8SToby Isaac       oclDetJ = (void *)f_detJ;
5919371c9d4SSatish Balay     } break;
5929371c9d4SSatish Balay     case PETSC_DOUBLE: {
59320cf1dd8SToby Isaac       PetscInt c, b, d;
59420cf1dd8SToby Isaac 
5959566063dSJacob Faibussowitsch       PetscCall(PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ));
59620cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
5977be5e748SToby Isaac         d_detJ[c] = (double)cgeom->detJ[c];
5989371c9d4SSatish Balay         for (d = 0; d < dim * dim; ++d) { d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d]; }
5999371c9d4SSatish Balay         for (b = 0; b < N_bt; ++b) { d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b]; }
60020cf1dd8SToby Isaac       }
60120cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
6029371c9d4SSatish Balay         for (c = 0; c < Ne; ++c) { d_coeffAux[c] = (double)coefficientsAux[c]; }
60320cf1dd8SToby Isaac       }
60420cf1dd8SToby Isaac       oclCoeff = (void *)d_coeff;
60520cf1dd8SToby Isaac       if (coefficientsAux) {
60620cf1dd8SToby Isaac         oclCoeffAux = (void *)d_coeffAux;
60720cf1dd8SToby Isaac       } else {
60820cf1dd8SToby Isaac         oclCoeffAux = NULL;
60920cf1dd8SToby Isaac       }
61020cf1dd8SToby Isaac       oclInvJ = (void *)d_invJ;
61120cf1dd8SToby Isaac       oclDetJ = (void *)d_detJ;
6129371c9d4SSatish Balay     } break;
6139371c9d4SSatish Balay     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
61420cf1dd8SToby Isaac     }
61520cf1dd8SToby Isaac   } else {
61620cf1dd8SToby Isaac     PetscInt c, d;
61720cf1dd8SToby Isaac 
6189566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ));
61920cf1dd8SToby Isaac     for (c = 0; c < Ne; ++c) {
6207be5e748SToby Isaac       r_detJ[c] = cgeom->detJ[c];
6219371c9d4SSatish Balay       for (d = 0; d < dim * dim; ++d) { r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d]; }
62220cf1dd8SToby Isaac     }
62320cf1dd8SToby Isaac     oclCoeff    = (void *)coefficients;
62420cf1dd8SToby Isaac     oclCoeffAux = (void *)coefficientsAux;
62520cf1dd8SToby Isaac     oclInvJ     = (void *)r_invJ;
62620cf1dd8SToby Isaac     oclDetJ     = (void *)r_detJ;
62720cf1dd8SToby Isaac   }
628d0609cedSBarry Smith   o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err);
62920cf1dd8SToby Isaac   if (coefficientsAux) {
630d0609cedSBarry Smith     o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err);
63120cf1dd8SToby Isaac   } else {
632d0609cedSBarry Smith     o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err);
63320cf1dd8SToby Isaac   }
634d0609cedSBarry Smith   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err);
635d0609cedSBarry Smith   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err);
636d0609cedSBarry Smith   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err);
63720cf1dd8SToby Isaac   /* Kernel launch */
6389566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb));
6399566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients));
6409566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux));
6419566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses));
6429566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants));
6439566063dSJacob Faibussowitsch   PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec));
6449566063dSJacob Faibussowitsch   PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev));
64520cf1dd8SToby Isaac   /* Read data back from device */
64620cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
64720cf1dd8SToby Isaac     switch (ocl->realType) {
6489371c9d4SSatish Balay     case PETSC_FLOAT: {
64920cf1dd8SToby Isaac       float   *elem;
65020cf1dd8SToby Isaac       PetscInt c, b;
65120cf1dd8SToby Isaac 
6529566063dSJacob Faibussowitsch       PetscCall(PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ));
6539566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(Ne * N_bt, &elem));
6549566063dSJacob Faibussowitsch       PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
65520cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
6569371c9d4SSatish Balay         for (b = 0; b < N_bt; ++b) { elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; }
65720cf1dd8SToby Isaac       }
6589566063dSJacob Faibussowitsch       PetscCall(PetscFree(elem));
6599371c9d4SSatish Balay     } break;
6609371c9d4SSatish Balay     case PETSC_DOUBLE: {
66120cf1dd8SToby Isaac       double  *elem;
66220cf1dd8SToby Isaac       PetscInt c, b;
66320cf1dd8SToby Isaac 
6649566063dSJacob Faibussowitsch       PetscCall(PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ));
6659566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(Ne * N_bt, &elem));
6669566063dSJacob Faibussowitsch       PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
66720cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
6689371c9d4SSatish Balay         for (b = 0; b < N_bt; ++b) { elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; }
66920cf1dd8SToby Isaac       }
6709566063dSJacob Faibussowitsch       PetscCall(PetscFree(elem));
6719371c9d4SSatish Balay     } break;
6729371c9d4SSatish Balay     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
67320cf1dd8SToby Isaac     }
67420cf1dd8SToby Isaac   } else {
6759566063dSJacob Faibussowitsch     PetscCall(PetscFree2(r_invJ, r_detJ));
6769566063dSJacob Faibussowitsch     PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL));
67720cf1dd8SToby Isaac   }
67820cf1dd8SToby Isaac   /* Log performance */
6799566063dSJacob Faibussowitsch   PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL));
6809566063dSJacob Faibussowitsch   PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL));
68120cf1dd8SToby Isaac   f0Flops = 0;
68220cf1dd8SToby Isaac   switch (ocl->op) {
6839371c9d4SSatish Balay   case LAPLACIAN: f1Flops = useAux ? dim : 0; break;
6849371c9d4SSatish Balay   case ELASTICITY: f1Flops = 2 * dim * dim; break;
68520cf1dd8SToby Isaac   }
6869371c9d4SSatish Balay   numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0))
68720cf1dd8SToby Isaac                           /*+
68820cf1dd8SToby Isaac        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
6899371c9d4SSatish Balay                           + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) +
69020cf1dd8SToby Isaac                    N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0)));
6919566063dSJacob Faibussowitsch   PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops));
69220cf1dd8SToby Isaac   /* Cleanup */
6939566063dSJacob Faibussowitsch   PetscCall(clReleaseMemObject(o_coefficients));
6949566063dSJacob Faibussowitsch   PetscCall(clReleaseMemObject(o_coefficientsAux));
6959566063dSJacob Faibussowitsch   PetscCall(clReleaseMemObject(o_jacobianInverses));
6969566063dSJacob Faibussowitsch   PetscCall(clReleaseMemObject(o_jacobianDeterminants));
6979566063dSJacob Faibussowitsch   PetscCall(clReleaseMemObject(o_elemVec));
6989566063dSJacob Faibussowitsch   PetscCall(clReleaseKernel(ocl_kernel));
6999566063dSJacob Faibussowitsch   PetscCall(clReleaseProgram(ocl_prog));
70020cf1dd8SToby Isaac   PetscFunctionReturn(0);
70120cf1dd8SToby Isaac }
70220cf1dd8SToby Isaac 
703526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
704526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation);
7052e47ffbbSToby Isaac 
7069371c9d4SSatish Balay static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) {
70720cf1dd8SToby Isaac   PetscFunctionBegin;
70820cf1dd8SToby Isaac   fem->ops->setfromoptions          = NULL;
70920cf1dd8SToby Isaac   fem->ops->setup                   = PetscFESetUp_Basic;
71020cf1dd8SToby Isaac   fem->ops->view                    = NULL;
71120cf1dd8SToby Isaac   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
71220cf1dd8SToby Isaac   fem->ops->getdimension            = PetscFEGetDimension_Basic;
713ef0bb6c7SMatthew G. Knepley   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
71420cf1dd8SToby Isaac   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
71520cf1dd8SToby Isaac   fem->ops->integratebdresidual     = NULL /* PetscFEIntegrateBdResidual_OpenCL */;
71620cf1dd8SToby Isaac   fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */;
71720cf1dd8SToby Isaac   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
71820cf1dd8SToby Isaac   PetscFunctionReturn(0);
71920cf1dd8SToby Isaac }
72020cf1dd8SToby Isaac 
72120cf1dd8SToby Isaac /*MC
72220cf1dd8SToby Isaac   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
72320cf1dd8SToby Isaac 
72420cf1dd8SToby Isaac   Level: intermediate
72520cf1dd8SToby Isaac 
726db781477SPatrick Sanan .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()`
72720cf1dd8SToby Isaac M*/
72820cf1dd8SToby Isaac 
7299371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) {
73020cf1dd8SToby Isaac   PetscFE_OpenCL *ocl;
73120cf1dd8SToby Isaac   cl_uint         num_platforms;
73220cf1dd8SToby Isaac   cl_platform_id  platform_ids[42];
73320cf1dd8SToby Isaac   cl_uint         num_devices;
73420cf1dd8SToby Isaac   cl_device_id    device_ids[42];
7352da392ccSBarry Smith   cl_int          err;
73620cf1dd8SToby Isaac 
73720cf1dd8SToby Isaac   PetscFunctionBegin;
73820cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
7399566063dSJacob Faibussowitsch   PetscCall(PetscNewLog(fem, &ocl));
74020cf1dd8SToby Isaac   fem->data = ocl;
74120cf1dd8SToby Isaac 
74220cf1dd8SToby Isaac   /* Init Platform */
7439566063dSJacob Faibussowitsch   PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms));
74428b400f6SJacob Faibussowitsch   PetscCheck(num_platforms, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL platform found.");
74520cf1dd8SToby Isaac   ocl->pf_id = platform_ids[0];
74620cf1dd8SToby Isaac   /* Init Device */
7479566063dSJacob Faibussowitsch   PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices));
74828b400f6SJacob Faibussowitsch   PetscCheck(num_devices, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL device found.");
74920cf1dd8SToby Isaac   ocl->dev_id = device_ids[0];
75020cf1dd8SToby Isaac   /* Create context with one command queue */
7519371c9d4SSatish Balay   ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);
7529371c9d4SSatish Balay   PetscCall(err);
7539371c9d4SSatish Balay   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);
7549371c9d4SSatish Balay   PetscCall(err);
75520cf1dd8SToby Isaac   /* Types */
75620cf1dd8SToby Isaac   ocl->realType = PETSC_FLOAT;
75720cf1dd8SToby Isaac   /* Register events */
7589566063dSJacob Faibussowitsch   PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent));
75920cf1dd8SToby Isaac   /* Equation handling */
76020cf1dd8SToby Isaac   ocl->op = LAPLACIAN;
76120cf1dd8SToby Isaac 
7629566063dSJacob Faibussowitsch   PetscCall(PetscFEInitialize_OpenCL(fem));
76320cf1dd8SToby Isaac   PetscFunctionReturn(0);
76420cf1dd8SToby Isaac }
76520cf1dd8SToby Isaac 
7662b99622eSMatthew G. Knepley /*@
7672b99622eSMatthew G. Knepley   PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
7682b99622eSMatthew G. Knepley 
7692b99622eSMatthew G. Knepley   Input Parameters:
7702b99622eSMatthew G. Knepley + fem      - The PetscFE
7712b99622eSMatthew G. Knepley - realType - The scalar type
7722b99622eSMatthew G. Knepley 
7732b99622eSMatthew G. Knepley   Level: developer
7742b99622eSMatthew G. Knepley 
775db781477SPatrick Sanan .seealso: `PetscFEOpenCLGetRealType()`
7762b99622eSMatthew G. Knepley @*/
7779371c9d4SSatish Balay PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) {
77820cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
77920cf1dd8SToby Isaac 
78020cf1dd8SToby Isaac   PetscFunctionBegin;
78120cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
78220cf1dd8SToby Isaac   ocl->realType = realType;
78320cf1dd8SToby Isaac   PetscFunctionReturn(0);
78420cf1dd8SToby Isaac }
78520cf1dd8SToby Isaac 
7862b99622eSMatthew G. Knepley /*@
7872b99622eSMatthew G. Knepley   PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
7882b99622eSMatthew G. Knepley 
7892b99622eSMatthew G. Knepley   Input Parameter:
7902b99622eSMatthew G. Knepley . fem      - The PetscFE
7912b99622eSMatthew G. Knepley 
7922b99622eSMatthew G. Knepley   Output Parameter:
7932b99622eSMatthew G. Knepley . realType - The scalar type
7942b99622eSMatthew G. Knepley 
7952b99622eSMatthew G. Knepley   Level: developer
7962b99622eSMatthew G. Knepley 
797db781477SPatrick Sanan .seealso: `PetscFEOpenCLSetRealType()`
7982b99622eSMatthew G. Knepley @*/
7999371c9d4SSatish Balay PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) {
80020cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
80120cf1dd8SToby Isaac 
80220cf1dd8SToby Isaac   PetscFunctionBegin;
80320cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
80420cf1dd8SToby Isaac   PetscValidPointer(realType, 2);
80520cf1dd8SToby Isaac   *realType = ocl->realType;
80620cf1dd8SToby Isaac   PetscFunctionReturn(0);
80720cf1dd8SToby Isaac }
80820cf1dd8SToby Isaac 
80920cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */
810