xref: /petsc/src/dm/dt/fe/impls/opencl/feopencl.c (revision 20cf1dd8cd656e27510885a71ea4be028bf48813)
1*20cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2*20cf1dd8SToby Isaac 
3*20cf1dd8SToby Isaac #ifdef PETSC_HAVE_OPENCL
4*20cf1dd8SToby Isaac 
5*20cf1dd8SToby Isaac PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6*20cf1dd8SToby Isaac {
7*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
8*20cf1dd8SToby Isaac   PetscErrorCode  ierr;
9*20cf1dd8SToby Isaac 
10*20cf1dd8SToby Isaac   PetscFunctionBegin;
11*20cf1dd8SToby Isaac   ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr);
12*20cf1dd8SToby Isaac   ocl->queue_id = 0;
13*20cf1dd8SToby Isaac   ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr);
14*20cf1dd8SToby Isaac   ocl->ctx_id = 0;
15*20cf1dd8SToby Isaac   ierr = PetscFree(ocl);CHKERRQ(ierr);
16*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
17*20cf1dd8SToby Isaac }
18*20cf1dd8SToby Isaac 
19*20cf1dd8SToby Isaac #define STRING_ERROR_CHECK(MSG) do {CHKERRQ(ierr); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);} while(0)
20*20cf1dd8SToby Isaac enum {LAPLACIAN = 0, ELASTICITY = 1};
21*20cf1dd8SToby Isaac 
22*20cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
23*20cf1dd8SToby Isaac /* dim     Number of spatial dimensions:          2                   */
24*20cf1dd8SToby Isaac /* N_b     Number of basis functions:             generated           */
25*20cf1dd8SToby Isaac /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
26*20cf1dd8SToby Isaac /* N_q     Number of quadrature points:           generated           */
27*20cf1dd8SToby Isaac /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
28*20cf1dd8SToby Isaac /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
29*20cf1dd8SToby Isaac /* N_{bl}  Number of concurrent blocks            generated           */
30*20cf1dd8SToby Isaac /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
31*20cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
32*20cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
33*20cf1dd8SToby Isaac /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
34*20cf1dd8SToby Isaac /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
35*20cf1dd8SToby Isaac /* N_{cb}  Number of serial cell batches:         input               */
36*20cf1dd8SToby Isaac /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
37*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
38*20cf1dd8SToby Isaac {
39*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
40*20cf1dd8SToby Isaac   PetscQuadrature q;
41*20cf1dd8SToby Isaac   char           *string_tail   = *string_buffer;
42*20cf1dd8SToby Isaac   char           *end_of_buffer = *string_buffer + buffer_length;
43*20cf1dd8SToby Isaac   char            float_str[]   = "float", double_str[]  = "double";
44*20cf1dd8SToby Isaac   char           *numeric_str   = &(float_str[0]);
45*20cf1dd8SToby Isaac   PetscInt        op            = ocl->op;
46*20cf1dd8SToby Isaac   PetscBool       useField      = PETSC_FALSE;
47*20cf1dd8SToby Isaac   PetscBool       useFieldDer   = PETSC_TRUE;
48*20cf1dd8SToby Isaac   PetscBool       useFieldAux   = useAux;
49*20cf1dd8SToby Isaac   PetscBool       useFieldDerAux= PETSC_FALSE;
50*20cf1dd8SToby Isaac   PetscBool       useF0         = PETSC_TRUE;
51*20cf1dd8SToby Isaac   PetscBool       useF1         = PETSC_TRUE;
52*20cf1dd8SToby Isaac   const PetscReal *points, *weights;
53*20cf1dd8SToby Isaac   PetscReal      *basis, *basisDer;
54*20cf1dd8SToby Isaac   PetscInt        dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
55*20cf1dd8SToby Isaac   size_t          count;
56*20cf1dd8SToby Isaac   PetscErrorCode  ierr;
57*20cf1dd8SToby Isaac 
58*20cf1dd8SToby Isaac   PetscFunctionBegin;
59*20cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
60*20cf1dd8SToby Isaac   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
61*20cf1dd8SToby Isaac   ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr);
62*20cf1dd8SToby Isaac   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
63*20cf1dd8SToby Isaac   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
64*20cf1dd8SToby Isaac   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
65*20cf1dd8SToby Isaac   N_t  = N_b * N_c * N_q * N_bl;
66*20cf1dd8SToby Isaac   /* Enable device extension for double precision */
67*20cf1dd8SToby Isaac   if (ocl->realType == PETSC_DOUBLE) {
68*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
69*20cf1dd8SToby Isaac "#if defined(cl_khr_fp64)\n"
70*20cf1dd8SToby Isaac "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
71*20cf1dd8SToby Isaac "#elif defined(cl_amd_fp64)\n"
72*20cf1dd8SToby Isaac "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
73*20cf1dd8SToby Isaac "#endif\n",
74*20cf1dd8SToby Isaac                               &count);STRING_ERROR_CHECK("Message to short");
75*20cf1dd8SToby Isaac     numeric_str  = &(double_str[0]);
76*20cf1dd8SToby Isaac   }
77*20cf1dd8SToby Isaac   /* Kernel API */
78*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
79*20cf1dd8SToby Isaac "\n"
80*20cf1dd8SToby Isaac "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
81*20cf1dd8SToby Isaac "{\n",
82*20cf1dd8SToby Isaac                        &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
83*20cf1dd8SToby Isaac   /* Quadrature */
84*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
85*20cf1dd8SToby Isaac "  /* Quadrature points\n"
86*20cf1dd8SToby Isaac "   - (x1,y1,x2,y2,...) */\n"
87*20cf1dd8SToby Isaac "  const %s points[%d] = {\n",
88*20cf1dd8SToby Isaac                        &count, numeric_str, N_q*dim);STRING_ERROR_CHECK("Message to short");
89*20cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
90*20cf1dd8SToby Isaac     for (d = 0; d < dim; ++d) {
91*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);STRING_ERROR_CHECK("Message to short");
92*20cf1dd8SToby Isaac     }
93*20cf1dd8SToby Isaac   }
94*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
95*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
96*20cf1dd8SToby Isaac "  /* Quadrature weights\n"
97*20cf1dd8SToby Isaac "   - (v1,v2,...) */\n"
98*20cf1dd8SToby Isaac "  const %s weights[%d] = {\n",
99*20cf1dd8SToby Isaac                        &count, numeric_str, N_q);STRING_ERROR_CHECK("Message to short");
100*20cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
101*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);STRING_ERROR_CHECK("Message to short");
102*20cf1dd8SToby Isaac   }
103*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
104*20cf1dd8SToby Isaac   /* Basis Functions */
105*20cf1dd8SToby Isaac   ierr = PetscFEGetDefaultTabulation(fem, &basis, &basisDer, NULL);CHKERRQ(ierr);
106*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107*20cf1dd8SToby Isaac "  /* Nodal basis function evaluations\n"
108*20cf1dd8SToby Isaac "    - basis component is fastest varying, the basis function, then point */\n"
109*20cf1dd8SToby Isaac "  const %s Basis[%d] = {\n",
110*20cf1dd8SToby Isaac                        &count, numeric_str, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short");
111*20cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
112*20cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
113*20cf1dd8SToby Isaac       for (c = 0; c < N_c; ++c) {
114*20cf1dd8SToby Isaac         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, basis[(p*N_b + b)*N_c + c]);STRING_ERROR_CHECK("Message to short");
115*20cf1dd8SToby Isaac       }
116*20cf1dd8SToby Isaac     }
117*20cf1dd8SToby Isaac   }
118*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
119*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
120*20cf1dd8SToby Isaac "\n"
121*20cf1dd8SToby Isaac "  /* Nodal basis function derivative evaluations,\n"
122*20cf1dd8SToby Isaac "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
123*20cf1dd8SToby Isaac "  const %s%d BasisDerivatives[%d] = {\n",
124*20cf1dd8SToby Isaac                        &count, numeric_str, dim, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short");
125*20cf1dd8SToby Isaac   for (p = 0; p < N_q; ++p) {
126*20cf1dd8SToby Isaac     for (b = 0; b < N_b; ++b) {
127*20cf1dd8SToby Isaac       for (c = 0; c < N_c; ++c) {
128*20cf1dd8SToby Isaac         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
129*20cf1dd8SToby Isaac         for (d = 0; d < dim; ++d) {
130*20cf1dd8SToby Isaac           if (d > 0) {
131*20cf1dd8SToby Isaac             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, basisDer[((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short");
132*20cf1dd8SToby Isaac           } else {
133*20cf1dd8SToby Isaac             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, basisDer[((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short");
134*20cf1dd8SToby Isaac           }
135*20cf1dd8SToby Isaac         }
136*20cf1dd8SToby Isaac         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);STRING_ERROR_CHECK("Message to short");
137*20cf1dd8SToby Isaac       }
138*20cf1dd8SToby Isaac     }
139*20cf1dd8SToby Isaac   }
140*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short");
141*20cf1dd8SToby Isaac   /* Sizes */
142*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
143*20cf1dd8SToby Isaac "  const int dim    = %d;                           // The spatial dimension\n"
144*20cf1dd8SToby Isaac "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
145*20cf1dd8SToby Isaac "  const int N_b    = %d;                           // The number of basis functions\n"
146*20cf1dd8SToby Isaac "  const int N_comp = %d;                           // The number of basis function components\n"
147*20cf1dd8SToby Isaac "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
148*20cf1dd8SToby Isaac "  const int N_q    = %d;                           // The number of quadrature points\n"
149*20cf1dd8SToby Isaac "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
150*20cf1dd8SToby Isaac "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
151*20cf1dd8SToby Isaac "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
152*20cf1dd8SToby Isaac "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
153*20cf1dd8SToby Isaac "  const int N_sqc  = N_bst / N_bt;\n"
154*20cf1dd8SToby Isaac "  /*const int N_c    = N_cb * N_bc;*/\n"
155*20cf1dd8SToby Isaac "\n"
156*20cf1dd8SToby Isaac "  /* Calculated indices */\n"
157*20cf1dd8SToby Isaac "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
158*20cf1dd8SToby Isaac "  const int tidx    = get_local_id(0);\n"
159*20cf1dd8SToby Isaac "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
160*20cf1dd8SToby Isaac "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
161*20cf1dd8SToby Isaac "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
162*20cf1dd8SToby Isaac "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
163*20cf1dd8SToby Isaac "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
164*20cf1dd8SToby Isaac "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
165*20cf1dd8SToby Isaac "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
166*20cf1dd8SToby Isaac "  const int Goffset = gidx*N_cb*N_bc;\n",
167*20cf1dd8SToby Isaac                             &count, dim, N_bl, N_b, N_c, N_q);STRING_ERROR_CHECK("Message to short");
168*20cf1dd8SToby Isaac   /* Local memory */
169*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
170*20cf1dd8SToby Isaac "\n"
171*20cf1dd8SToby Isaac "  /* Quadrature data */\n"
172*20cf1dd8SToby Isaac "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
173*20cf1dd8SToby Isaac "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
174*20cf1dd8SToby Isaac "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
175*20cf1dd8SToby Isaac "  /* Geometric data */\n"
176*20cf1dd8SToby Isaac "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
177*20cf1dd8SToby Isaac "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
178*20cf1dd8SToby Isaac                             &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
179*20cf1dd8SToby Isaac                             numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short");
180*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
181*20cf1dd8SToby Isaac "  /* FEM data */\n"
182*20cf1dd8SToby Isaac "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
183*20cf1dd8SToby Isaac                             &count, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short");
184*20cf1dd8SToby Isaac   if (useAux) {
185*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186*20cf1dd8SToby Isaac "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
187*20cf1dd8SToby Isaac                             &count, numeric_str, N_t);STRING_ERROR_CHECK("Message to short");
188*20cf1dd8SToby Isaac   }
189*20cf1dd8SToby Isaac   if (useF0) {
190*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
191*20cf1dd8SToby Isaac "  /* Intermediate calculations */\n"
192*20cf1dd8SToby Isaac "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
193*20cf1dd8SToby Isaac                               &count, numeric_str, N_t*N_q);STRING_ERROR_CHECK("Message to short");
194*20cf1dd8SToby Isaac   }
195*20cf1dd8SToby Isaac   if (useF1) {
196*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197*20cf1dd8SToby Isaac "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
198*20cf1dd8SToby Isaac                               &count, numeric_str, dim, N_t*N_q);STRING_ERROR_CHECK("Message to short");
199*20cf1dd8SToby Isaac   }
200*20cf1dd8SToby Isaac   /* TODO: If using elasticity, put in mu/lambda coefficients */
201*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202*20cf1dd8SToby Isaac "  /* Output data */\n"
203*20cf1dd8SToby Isaac "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
204*20cf1dd8SToby Isaac                             &count, numeric_str);STRING_ERROR_CHECK("Message to short");
205*20cf1dd8SToby Isaac   /* One-time loads */
206*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
207*20cf1dd8SToby Isaac "  /* These should be generated inline */\n"
208*20cf1dd8SToby Isaac "  /* Load quadrature weights */\n"
209*20cf1dd8SToby Isaac "  w = weights[qidx];\n"
210*20cf1dd8SToby Isaac "  /* Load basis tabulation \\phi_i for this cell */\n"
211*20cf1dd8SToby Isaac "  if (tidx < N_bt*N_q) {\n"
212*20cf1dd8SToby Isaac "    phi_i[tidx]    = Basis[tidx];\n"
213*20cf1dd8SToby Isaac "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
214*20cf1dd8SToby Isaac "  }\n\n",
215*20cf1dd8SToby Isaac                        &count);STRING_ERROR_CHECK("Message to short");
216*20cf1dd8SToby Isaac   /* Batch loads */
217*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
218*20cf1dd8SToby Isaac "  for (int batch = 0; batch < N_cb; ++batch) {\n"
219*20cf1dd8SToby Isaac "    /* Load geometry */\n"
220*20cf1dd8SToby Isaac "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
221*20cf1dd8SToby Isaac "    for (int n = 0; n < dim*dim; ++n) {\n"
222*20cf1dd8SToby Isaac "      const int offset = n*N_t;\n"
223*20cf1dd8SToby Isaac "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
224*20cf1dd8SToby Isaac "    }\n"
225*20cf1dd8SToby Isaac "    /* Load coefficients u_i for this cell */\n"
226*20cf1dd8SToby Isaac "    for (int n = 0; n < N_bt; ++n) {\n"
227*20cf1dd8SToby Isaac "      const int offset = n*N_t;\n"
228*20cf1dd8SToby Isaac "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
229*20cf1dd8SToby Isaac "    }\n",
230*20cf1dd8SToby Isaac                        &count);STRING_ERROR_CHECK("Message to short");
231*20cf1dd8SToby Isaac   if (useAux) {
232*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
233*20cf1dd8SToby Isaac "    /* Load coefficients a_i for this cell */\n"
234*20cf1dd8SToby Isaac "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
235*20cf1dd8SToby Isaac "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
236*20cf1dd8SToby Isaac                             &count);STRING_ERROR_CHECK("Message to short");
237*20cf1dd8SToby Isaac   }
238*20cf1dd8SToby Isaac   /* Quadrature phase */
239*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
240*20cf1dd8SToby Isaac "    barrier(CLK_LOCAL_MEM_FENCE);\n"
241*20cf1dd8SToby Isaac "\n"
242*20cf1dd8SToby Isaac "    /* Map coefficients to values at quadrature points */\n"
243*20cf1dd8SToby Isaac "    for (int c = 0; c < N_sqc; ++c) {\n"
244*20cf1dd8SToby Isaac "      const int cell          = c*N_bl*N_b + blqidx;\n"
245*20cf1dd8SToby Isaac "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
246*20cf1dd8SToby Isaac                        &count);STRING_ERROR_CHECK("Message to short");
247*20cf1dd8SToby Isaac   if (useField) {
248*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
249*20cf1dd8SToby Isaac "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n",
250*20cf1dd8SToby Isaac                               &count, numeric_str, N_c);STRING_ERROR_CHECK("Message to short");
251*20cf1dd8SToby Isaac   }
252*20cf1dd8SToby Isaac   if (useFieldDer) {
253*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
254*20cf1dd8SToby Isaac "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
255*20cf1dd8SToby Isaac                               &count, numeric_str, dim, N_c);STRING_ERROR_CHECK("Message to short");
256*20cf1dd8SToby Isaac   }
257*20cf1dd8SToby Isaac   if (useFieldAux) {
258*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
259*20cf1dd8SToby Isaac "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
260*20cf1dd8SToby Isaac                               &count, numeric_str, 1);STRING_ERROR_CHECK("Message to short");
261*20cf1dd8SToby Isaac   }
262*20cf1dd8SToby Isaac   if (useFieldDerAux) {
263*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264*20cf1dd8SToby Isaac "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
265*20cf1dd8SToby Isaac                               &count, numeric_str, dim, 1);STRING_ERROR_CHECK("Message to short");
266*20cf1dd8SToby Isaac   }
267*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
268*20cf1dd8SToby Isaac "\n"
269*20cf1dd8SToby Isaac "      for (int comp = 0; comp < N_comp; ++comp) {\n",
270*20cf1dd8SToby Isaac                             &count);STRING_ERROR_CHECK("Message to short");
271*20cf1dd8SToby Isaac   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
272*20cf1dd8SToby Isaac   if (useFieldDer) {
273*20cf1dd8SToby Isaac     switch (dim) {
274*20cf1dd8SToby Isaac     case 1:
275*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
276*20cf1dd8SToby Isaac     case 2:
277*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
278*20cf1dd8SToby Isaac     case 3:
279*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
280*20cf1dd8SToby Isaac     }
281*20cf1dd8SToby Isaac   }
282*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
283*20cf1dd8SToby Isaac "      }\n",
284*20cf1dd8SToby Isaac                             &count);STRING_ERROR_CHECK("Message to short");
285*20cf1dd8SToby Isaac   if (useFieldAux) {
286*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");
287*20cf1dd8SToby Isaac   }
288*20cf1dd8SToby Isaac   if (useFieldDerAux) {
289*20cf1dd8SToby Isaac     switch (dim) {
290*20cf1dd8SToby Isaac     case 1:
291*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
292*20cf1dd8SToby Isaac     case 2:
293*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
294*20cf1dd8SToby Isaac     case 3:
295*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break;
296*20cf1dd8SToby Isaac     }
297*20cf1dd8SToby Isaac   }
298*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
299*20cf1dd8SToby Isaac "      /* Get field and derivatives at this quadrature point */\n"
300*20cf1dd8SToby Isaac "      for (int i = 0; i < N_b; ++i) {\n"
301*20cf1dd8SToby Isaac "        for (int comp = 0; comp < N_comp; ++comp) {\n"
302*20cf1dd8SToby Isaac "          const int b    = i*N_comp+comp;\n"
303*20cf1dd8SToby Isaac "          const int pidx = qidx*N_bt + b;\n"
304*20cf1dd8SToby Isaac "          const int uidx = cell*N_bt + b;\n"
305*20cf1dd8SToby Isaac "          %s%d   realSpaceDer;\n\n",
306*20cf1dd8SToby Isaac                             &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
307*20cf1dd8SToby Isaac   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
308*20cf1dd8SToby Isaac   if (useFieldDer) {
309*20cf1dd8SToby Isaac     switch (dim) {
310*20cf1dd8SToby Isaac     case 2:
311*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312*20cf1dd8SToby Isaac "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
313*20cf1dd8SToby Isaac "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
314*20cf1dd8SToby Isaac "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
315*20cf1dd8SToby Isaac "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
316*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
317*20cf1dd8SToby Isaac     case 3:
318*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
319*20cf1dd8SToby Isaac "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
320*20cf1dd8SToby Isaac "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
321*20cf1dd8SToby Isaac "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
322*20cf1dd8SToby Isaac "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
323*20cf1dd8SToby Isaac "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
324*20cf1dd8SToby Isaac "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
325*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
326*20cf1dd8SToby Isaac     }
327*20cf1dd8SToby Isaac   }
328*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
329*20cf1dd8SToby Isaac "        }\n"
330*20cf1dd8SToby Isaac "      }\n",
331*20cf1dd8SToby Isaac                             &count);STRING_ERROR_CHECK("Message to short");
332*20cf1dd8SToby Isaac   if (useFieldAux) {
333*20cf1dd8SToby Isaac     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          a[0] += a_i[cell];\n", &count);STRING_ERROR_CHECK("Message to short");
334*20cf1dd8SToby Isaac   }
335*20cf1dd8SToby Isaac   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
336*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
337*20cf1dd8SToby Isaac "      /* Process values at quadrature points */\n",
338*20cf1dd8SToby Isaac                             &count);STRING_ERROR_CHECK("Message to short");
339*20cf1dd8SToby Isaac   switch (op) {
340*20cf1dd8SToby Isaac   case LAPLACIAN:
341*20cf1dd8SToby Isaac     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
342*20cf1dd8SToby Isaac     if (useF1) {
343*20cf1dd8SToby Isaac       if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
344*20cf1dd8SToby Isaac       else        {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
345*20cf1dd8SToby Isaac     }
346*20cf1dd8SToby Isaac     break;
347*20cf1dd8SToby Isaac   case ELASTICITY:
348*20cf1dd8SToby Isaac     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");}
349*20cf1dd8SToby Isaac     if (useF1) {
350*20cf1dd8SToby Isaac     switch (dim) {
351*20cf1dd8SToby Isaac     case 2:
352*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
353*20cf1dd8SToby Isaac "      switch (cidx) {\n"
354*20cf1dd8SToby Isaac "      case 0:\n"
355*20cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
356*20cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
357*20cf1dd8SToby Isaac "        break;\n"
358*20cf1dd8SToby Isaac "      case 1:\n"
359*20cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
360*20cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
361*20cf1dd8SToby Isaac "      }\n",
362*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
363*20cf1dd8SToby Isaac     case 3:
364*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
365*20cf1dd8SToby Isaac "      switch (cidx) {\n"
366*20cf1dd8SToby Isaac "      case 0:\n"
367*20cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
368*20cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
369*20cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
370*20cf1dd8SToby Isaac "        break;\n"
371*20cf1dd8SToby Isaac "      case 1:\n"
372*20cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
373*20cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
374*20cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
375*20cf1dd8SToby Isaac "        break;\n"
376*20cf1dd8SToby Isaac "      case 2:\n"
377*20cf1dd8SToby Isaac "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
378*20cf1dd8SToby Isaac "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
379*20cf1dd8SToby Isaac "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
380*20cf1dd8SToby Isaac "      }\n",
381*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
382*20cf1dd8SToby Isaac     }}
383*20cf1dd8SToby Isaac     break;
384*20cf1dd8SToby Isaac   default:
385*20cf1dd8SToby Isaac     SETERRQ1(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
386*20cf1dd8SToby Isaac   }
387*20cf1dd8SToby Isaac   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_0[fidx] *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");}
388*20cf1dd8SToby Isaac   if (useF1) {
389*20cf1dd8SToby Isaac     switch (dim) {
390*20cf1dd8SToby Isaac     case 1:
391*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
392*20cf1dd8SToby Isaac     case 2:
393*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
394*20cf1dd8SToby Isaac     case 3:
395*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break;
396*20cf1dd8SToby Isaac     }
397*20cf1dd8SToby Isaac   }
398*20cf1dd8SToby Isaac   /* Thread transpose */
399*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
400*20cf1dd8SToby Isaac "    }\n\n"
401*20cf1dd8SToby Isaac "    /* ==== TRANSPOSE THREADS ==== */\n"
402*20cf1dd8SToby Isaac "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
403*20cf1dd8SToby Isaac                        &count);STRING_ERROR_CHECK("Message to short");
404*20cf1dd8SToby Isaac   /* Basis phase */
405*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
406*20cf1dd8SToby Isaac "    /* Map values at quadrature points to coefficients */\n"
407*20cf1dd8SToby Isaac "    for (int c = 0; c < N_sbc; ++c) {\n"
408*20cf1dd8SToby Isaac "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
409*20cf1dd8SToby Isaac "\n"
410*20cf1dd8SToby Isaac "      e_i = 0.0;\n"
411*20cf1dd8SToby Isaac "      for (int q = 0; q < N_q; ++q) {\n"
412*20cf1dd8SToby Isaac "        const int pidx = q*N_bt + bidx;\n"
413*20cf1dd8SToby Isaac "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
414*20cf1dd8SToby Isaac "        %s%d   realSpaceDer;\n\n",
415*20cf1dd8SToby Isaac                        &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short");
416*20cf1dd8SToby Isaac 
417*20cf1dd8SToby Isaac   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"        e_i += phi_i[pidx]*f_0[fidx];\n", &count);STRING_ERROR_CHECK("Message to short");}
418*20cf1dd8SToby Isaac   if (useF1) {
419*20cf1dd8SToby Isaac     switch (dim) {
420*20cf1dd8SToby Isaac     case 2:
421*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422*20cf1dd8SToby Isaac "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
423*20cf1dd8SToby Isaac "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
424*20cf1dd8SToby Isaac "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
425*20cf1dd8SToby Isaac "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
426*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
427*20cf1dd8SToby Isaac     case 3:
428*20cf1dd8SToby Isaac       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
429*20cf1dd8SToby Isaac "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
430*20cf1dd8SToby Isaac "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
431*20cf1dd8SToby Isaac "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
432*20cf1dd8SToby Isaac "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
433*20cf1dd8SToby Isaac "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
434*20cf1dd8SToby Isaac "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
435*20cf1dd8SToby Isaac                            &count);STRING_ERROR_CHECK("Message to short");break;
436*20cf1dd8SToby Isaac     }
437*20cf1dd8SToby Isaac   }
438*20cf1dd8SToby Isaac   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
439*20cf1dd8SToby Isaac "      }\n"
440*20cf1dd8SToby Isaac "      /* Write element vector for N_{cbc} cells at a time */\n"
441*20cf1dd8SToby Isaac "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
442*20cf1dd8SToby Isaac "    }\n"
443*20cf1dd8SToby Isaac "    /* ==== Could do one write per batch ==== */\n"
444*20cf1dd8SToby Isaac "  }\n"
445*20cf1dd8SToby Isaac "  return;\n"
446*20cf1dd8SToby Isaac "}\n",
447*20cf1dd8SToby Isaac                        &count);STRING_ERROR_CHECK("Message to short");
448*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
449*20cf1dd8SToby Isaac }
450*20cf1dd8SToby Isaac 
451*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
452*20cf1dd8SToby Isaac {
453*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
454*20cf1dd8SToby Isaac   PetscInt        dim, N_bl;
455*20cf1dd8SToby Isaac   PetscBool       flg;
456*20cf1dd8SToby Isaac   char           *buffer;
457*20cf1dd8SToby Isaac   size_t          len;
458*20cf1dd8SToby Isaac   char            errMsg[8192];
459*20cf1dd8SToby Isaac   cl_int          ierr2;
460*20cf1dd8SToby Isaac   PetscErrorCode  ierr;
461*20cf1dd8SToby Isaac 
462*20cf1dd8SToby Isaac   PetscFunctionBegin;
463*20cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
464*20cf1dd8SToby Isaac   ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr);
465*20cf1dd8SToby Isaac   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr);
466*20cf1dd8SToby Isaac   ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr);
467*20cf1dd8SToby Isaac   ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr);
468*20cf1dd8SToby Isaac   if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);}
469*20cf1dd8SToby Isaac   len  = strlen(buffer);
470*20cf1dd8SToby Isaac   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &ierr2);CHKERRQ(ierr2);
471*20cf1dd8SToby Isaac   ierr = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
472*20cf1dd8SToby Isaac   if (ierr != CL_SUCCESS) {
473*20cf1dd8SToby Isaac     ierr = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);CHKERRQ(ierr);
474*20cf1dd8SToby Isaac     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
475*20cf1dd8SToby Isaac   }
476*20cf1dd8SToby Isaac   ierr = PetscFree(buffer);CHKERRQ(ierr);
477*20cf1dd8SToby Isaac   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);
478*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
479*20cf1dd8SToby Isaac }
480*20cf1dd8SToby Isaac 
481*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
482*20cf1dd8SToby Isaac {
483*20cf1dd8SToby Isaac   const PetscInt Nblocks = N/blockSize;
484*20cf1dd8SToby Isaac 
485*20cf1dd8SToby Isaac   PetscFunctionBegin;
486*20cf1dd8SToby Isaac   if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
487*20cf1dd8SToby Isaac   *z = 1;
488*20cf1dd8SToby Isaac   for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
489*20cf1dd8SToby Isaac     *y = Nblocks / *x;
490*20cf1dd8SToby Isaac     if (*x * *y == Nblocks) break;
491*20cf1dd8SToby Isaac   }
492*20cf1dd8SToby Isaac   if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
493*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
494*20cf1dd8SToby Isaac }
495*20cf1dd8SToby Isaac 
496*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
497*20cf1dd8SToby Isaac {
498*20cf1dd8SToby Isaac   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
499*20cf1dd8SToby Isaac   PetscStageLog     stageLog;
500*20cf1dd8SToby Isaac   PetscEventPerfLog eventLog = NULL;
501*20cf1dd8SToby Isaac   PetscInt          stage;
502*20cf1dd8SToby Isaac   PetscErrorCode    ierr;
503*20cf1dd8SToby Isaac 
504*20cf1dd8SToby Isaac   PetscFunctionBegin;
505*20cf1dd8SToby Isaac   ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
506*20cf1dd8SToby Isaac   ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
507*20cf1dd8SToby Isaac   ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
508*20cf1dd8SToby Isaac     /* Log performance info */
509*20cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].count++;
510*20cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].time  += time;
511*20cf1dd8SToby Isaac   eventLog->eventInfo[ocl->residualEvent].flops += flops;
512*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
513*20cf1dd8SToby Isaac }
514*20cf1dd8SToby Isaac 
515*20cf1dd8SToby Isaac PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscFE fem, PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom,
516*20cf1dd8SToby Isaac                                                const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
517*20cf1dd8SToby Isaac {
518*20cf1dd8SToby Isaac   /* Nbc = batchSize */
519*20cf1dd8SToby Isaac   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
520*20cf1dd8SToby Isaac   PetscPointFunc    f0_func;
521*20cf1dd8SToby Isaac   PetscPointFunc    f1_func;
522*20cf1dd8SToby Isaac   PetscQuadrature   q;
523*20cf1dd8SToby Isaac   PetscInt          dim, qNc;
524*20cf1dd8SToby Isaac   PetscInt          N_b;    /* The number of basis functions */
525*20cf1dd8SToby Isaac   PetscInt          N_comp; /* The number of basis function components */
526*20cf1dd8SToby Isaac   PetscInt          N_bt;   /* The total number of scalar basis functions */
527*20cf1dd8SToby Isaac   PetscInt          N_q;    /* The number of quadrature points */
528*20cf1dd8SToby Isaac   PetscInt          N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
529*20cf1dd8SToby Isaac   PetscInt          N_t;    /* The number of threads, N_bst * N_bl */
530*20cf1dd8SToby Isaac   PetscInt          N_bl;   /* The number of blocks */
531*20cf1dd8SToby Isaac   PetscInt          N_bc;   /* The batch size, N_bl*N_q*N_b */
532*20cf1dd8SToby Isaac   PetscInt          N_cb;   /* The number of batches */
533*20cf1dd8SToby Isaac   PetscInt          numFlops, f0Flops = 0, f1Flops = 0;
534*20cf1dd8SToby Isaac   PetscBool         useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
535*20cf1dd8SToby Isaac   PetscBool         useField    = PETSC_FALSE;
536*20cf1dd8SToby Isaac   PetscBool         useFieldDer = PETSC_TRUE;
537*20cf1dd8SToby Isaac   PetscBool         useF0       = PETSC_TRUE;
538*20cf1dd8SToby Isaac   PetscBool         useF1       = PETSC_TRUE;
539*20cf1dd8SToby Isaac   /* OpenCL variables */
540*20cf1dd8SToby Isaac   cl_program        ocl_prog;
541*20cf1dd8SToby Isaac   cl_kernel         ocl_kernel;
542*20cf1dd8SToby Isaac   cl_event          ocl_ev;         /* The event for tracking kernel execution */
543*20cf1dd8SToby Isaac   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
544*20cf1dd8SToby Isaac   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */
545*20cf1dd8SToby Isaac   cl_mem            o_jacobianInverses, o_jacobianDeterminants;
546*20cf1dd8SToby Isaac   cl_mem            o_coefficients, o_coefficientsAux, o_elemVec;
547*20cf1dd8SToby Isaac   float            *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
548*20cf1dd8SToby Isaac   double           *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
549*20cf1dd8SToby Isaac   PetscReal        *r_invJ = NULL, *r_detJ = NULL;
550*20cf1dd8SToby Isaac   void             *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
551*20cf1dd8SToby Isaac   size_t            local_work_size[3], global_work_size[3];
552*20cf1dd8SToby Isaac   size_t            realSize, x, y, z;
553*20cf1dd8SToby Isaac   const PetscReal   *points, *weights;
554*20cf1dd8SToby Isaac   PetscErrorCode    ierr;
555*20cf1dd8SToby Isaac 
556*20cf1dd8SToby Isaac   PetscFunctionBegin;
557*20cf1dd8SToby Isaac   if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);}
558*20cf1dd8SToby Isaac   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
559*20cf1dd8SToby Isaac   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
560*20cf1dd8SToby Isaac   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
561*20cf1dd8SToby Isaac   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
562*20cf1dd8SToby Isaac   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
563*20cf1dd8SToby Isaac   ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr);
564*20cf1dd8SToby Isaac   ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr);
565*20cf1dd8SToby Isaac   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr);
566*20cf1dd8SToby Isaac   N_bt  = N_b*N_comp;
567*20cf1dd8SToby Isaac   N_bst = N_bt*N_q;
568*20cf1dd8SToby Isaac   N_t   = N_bst*N_bl;
569*20cf1dd8SToby Isaac   if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
570*20cf1dd8SToby Isaac   /* Calculate layout */
571*20cf1dd8SToby Isaac   if (Ne % (N_cb*N_bc)) { /* Remainder cells */
572*20cf1dd8SToby Isaac     ierr = PetscFEIntegrateResidual_Basic(fem, prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr);
573*20cf1dd8SToby Isaac     PetscFunctionReturn(0);
574*20cf1dd8SToby Isaac   }
575*20cf1dd8SToby Isaac   ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr);
576*20cf1dd8SToby Isaac   local_work_size[0]  = N_bc*N_comp;
577*20cf1dd8SToby Isaac   local_work_size[1]  = 1;
578*20cf1dd8SToby Isaac   local_work_size[2]  = 1;
579*20cf1dd8SToby Isaac   global_work_size[0] = x * local_work_size[0];
580*20cf1dd8SToby Isaac   global_work_size[1] = y * local_work_size[1];
581*20cf1dd8SToby Isaac   global_work_size[2] = z * local_work_size[2];
582*20cf1dd8SToby Isaac   ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr);
583*20cf1dd8SToby Isaac   ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
584*20cf1dd8SToby Isaac   /* Generate code */
585*20cf1dd8SToby Isaac   if (probAux) {
586*20cf1dd8SToby Isaac     PetscSpace P;
587*20cf1dd8SToby Isaac     PetscInt   NfAux, order, f;
588*20cf1dd8SToby Isaac 
589*20cf1dd8SToby Isaac     ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr);
590*20cf1dd8SToby Isaac     for (f = 0; f < NfAux; ++f) {
591*20cf1dd8SToby Isaac       PetscFE feAux;
592*20cf1dd8SToby Isaac 
593*20cf1dd8SToby Isaac       ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr);
594*20cf1dd8SToby Isaac       ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr);
595*20cf1dd8SToby Isaac       ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr);
596*20cf1dd8SToby Isaac       if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
597*20cf1dd8SToby Isaac     }
598*20cf1dd8SToby Isaac   }
599*20cf1dd8SToby Isaac   ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr);
600*20cf1dd8SToby Isaac   /* Create buffers on the device and send data over */
601*20cf1dd8SToby Isaac   ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr);
602*20cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
603*20cf1dd8SToby Isaac     switch (ocl->realType) {
604*20cf1dd8SToby Isaac     case PETSC_FLOAT:
605*20cf1dd8SToby Isaac     {
606*20cf1dd8SToby Isaac       PetscInt c, b, d;
607*20cf1dd8SToby Isaac 
608*20cf1dd8SToby Isaac       ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr);
609*20cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
610*20cf1dd8SToby Isaac         f_detJ[c] = (float) cgeom[c].detJ;
611*20cf1dd8SToby Isaac         for (d = 0; d < dim*dim; ++d) {
612*20cf1dd8SToby Isaac           f_invJ[c*dim*dim+d] = (float) cgeom[c].invJ[d];
613*20cf1dd8SToby Isaac         }
614*20cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
615*20cf1dd8SToby Isaac           f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
616*20cf1dd8SToby Isaac         }
617*20cf1dd8SToby Isaac       }
618*20cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
619*20cf1dd8SToby Isaac         for (c = 0; c < Ne; ++c) {
620*20cf1dd8SToby Isaac           f_coeffAux[c] = (float) coefficientsAux[c];
621*20cf1dd8SToby Isaac         }
622*20cf1dd8SToby Isaac       }
623*20cf1dd8SToby Isaac       oclCoeff      = (void *) f_coeff;
624*20cf1dd8SToby Isaac       if (coefficientsAux) {
625*20cf1dd8SToby Isaac         oclCoeffAux = (void *) f_coeffAux;
626*20cf1dd8SToby Isaac       } else {
627*20cf1dd8SToby Isaac         oclCoeffAux = NULL;
628*20cf1dd8SToby Isaac       }
629*20cf1dd8SToby Isaac       oclInvJ       = (void *) f_invJ;
630*20cf1dd8SToby Isaac       oclDetJ       = (void *) f_detJ;
631*20cf1dd8SToby Isaac     }
632*20cf1dd8SToby Isaac     break;
633*20cf1dd8SToby Isaac     case PETSC_DOUBLE:
634*20cf1dd8SToby Isaac     {
635*20cf1dd8SToby Isaac       PetscInt c, b, d;
636*20cf1dd8SToby Isaac 
637*20cf1dd8SToby Isaac       ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr);
638*20cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
639*20cf1dd8SToby Isaac         d_detJ[c] = (double) cgeom[c].detJ;
640*20cf1dd8SToby Isaac         for (d = 0; d < dim*dim; ++d) {
641*20cf1dd8SToby Isaac           d_invJ[c*dim*dim+d] = (double) cgeom[c].invJ[d];
642*20cf1dd8SToby Isaac         }
643*20cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
644*20cf1dd8SToby Isaac           d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
645*20cf1dd8SToby Isaac         }
646*20cf1dd8SToby Isaac       }
647*20cf1dd8SToby Isaac       if (coefficientsAux) { /* Assume P0 */
648*20cf1dd8SToby Isaac         for (c = 0; c < Ne; ++c) {
649*20cf1dd8SToby Isaac           d_coeffAux[c] = (double) coefficientsAux[c];
650*20cf1dd8SToby Isaac         }
651*20cf1dd8SToby Isaac       }
652*20cf1dd8SToby Isaac       oclCoeff      = (void *) d_coeff;
653*20cf1dd8SToby Isaac       if (coefficientsAux) {
654*20cf1dd8SToby Isaac         oclCoeffAux = (void *) d_coeffAux;
655*20cf1dd8SToby Isaac       } else {
656*20cf1dd8SToby Isaac         oclCoeffAux = NULL;
657*20cf1dd8SToby Isaac       }
658*20cf1dd8SToby Isaac       oclInvJ       = (void *) d_invJ;
659*20cf1dd8SToby Isaac       oclDetJ       = (void *) d_detJ;
660*20cf1dd8SToby Isaac     }
661*20cf1dd8SToby Isaac     break;
662*20cf1dd8SToby Isaac     default:
663*20cf1dd8SToby Isaac       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
664*20cf1dd8SToby Isaac     }
665*20cf1dd8SToby Isaac   } else {
666*20cf1dd8SToby Isaac     PetscInt c, d;
667*20cf1dd8SToby Isaac 
668*20cf1dd8SToby Isaac     ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr);
669*20cf1dd8SToby Isaac     for (c = 0; c < Ne; ++c) {
670*20cf1dd8SToby Isaac       r_detJ[c] = cgeom[c].detJ;
671*20cf1dd8SToby Isaac       for (d = 0; d < dim*dim; ++d) {
672*20cf1dd8SToby Isaac         r_invJ[c*dim*dim+d] = cgeom[c].invJ[d];
673*20cf1dd8SToby Isaac       }
674*20cf1dd8SToby Isaac     }
675*20cf1dd8SToby Isaac     oclCoeff    = (void *) coefficients;
676*20cf1dd8SToby Isaac     oclCoeffAux = (void *) coefficientsAux;
677*20cf1dd8SToby Isaac     oclInvJ     = (void *) r_invJ;
678*20cf1dd8SToby Isaac     oclDetJ     = (void *) r_detJ;
679*20cf1dd8SToby Isaac   }
680*20cf1dd8SToby Isaac   o_coefficients         = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * realSize, oclCoeff,    &ierr);CHKERRQ(ierr);
681*20cf1dd8SToby Isaac   if (coefficientsAux) {
682*20cf1dd8SToby Isaac     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
683*20cf1dd8SToby Isaac   } else {
684*20cf1dd8SToby Isaac     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY,                        Ne         * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
685*20cf1dd8SToby Isaac   }
686*20cf1dd8SToby Isaac   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ,     &ierr);CHKERRQ(ierr);
687*20cf1dd8SToby Isaac   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclDetJ,     &ierr);CHKERRQ(ierr);
688*20cf1dd8SToby Isaac   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY,                       Ne*N_bt    * realSize, NULL,        &ierr);CHKERRQ(ierr);
689*20cf1dd8SToby Isaac   /* Kernel launch */
690*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr);
691*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr);
692*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr);
693*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr);
694*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr);
695*20cf1dd8SToby Isaac   ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr);
696*20cf1dd8SToby Isaac   ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);
697*20cf1dd8SToby Isaac   /* Read data back from device */
698*20cf1dd8SToby Isaac   if (sizeof(PetscReal) != realSize) {
699*20cf1dd8SToby Isaac     switch (ocl->realType) {
700*20cf1dd8SToby Isaac     case PETSC_FLOAT:
701*20cf1dd8SToby Isaac     {
702*20cf1dd8SToby Isaac       float   *elem;
703*20cf1dd8SToby Isaac       PetscInt c, b;
704*20cf1dd8SToby Isaac 
705*20cf1dd8SToby Isaac       ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr);
706*20cf1dd8SToby Isaac       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
707*20cf1dd8SToby Isaac       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
708*20cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
709*20cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
710*20cf1dd8SToby Isaac           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
711*20cf1dd8SToby Isaac         }
712*20cf1dd8SToby Isaac       }
713*20cf1dd8SToby Isaac       ierr = PetscFree(elem);CHKERRQ(ierr);
714*20cf1dd8SToby Isaac     }
715*20cf1dd8SToby Isaac     break;
716*20cf1dd8SToby Isaac     case PETSC_DOUBLE:
717*20cf1dd8SToby Isaac     {
718*20cf1dd8SToby Isaac       double  *elem;
719*20cf1dd8SToby Isaac       PetscInt c, b;
720*20cf1dd8SToby Isaac 
721*20cf1dd8SToby Isaac       ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr);
722*20cf1dd8SToby Isaac       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
723*20cf1dd8SToby Isaac       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
724*20cf1dd8SToby Isaac       for (c = 0; c < Ne; ++c) {
725*20cf1dd8SToby Isaac         for (b = 0; b < N_bt; ++b) {
726*20cf1dd8SToby Isaac           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
727*20cf1dd8SToby Isaac         }
728*20cf1dd8SToby Isaac       }
729*20cf1dd8SToby Isaac       ierr = PetscFree(elem);CHKERRQ(ierr);
730*20cf1dd8SToby Isaac     }
731*20cf1dd8SToby Isaac     break;
732*20cf1dd8SToby Isaac     default:
733*20cf1dd8SToby Isaac       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
734*20cf1dd8SToby Isaac     }
735*20cf1dd8SToby Isaac   } else {
736*20cf1dd8SToby Isaac     ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr);
737*20cf1dd8SToby Isaac     ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr);
738*20cf1dd8SToby Isaac   }
739*20cf1dd8SToby Isaac   /* Log performance */
740*20cf1dd8SToby Isaac   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
741*20cf1dd8SToby Isaac   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);CHKERRQ(ierr);
742*20cf1dd8SToby Isaac   f0Flops = 0;
743*20cf1dd8SToby Isaac   switch (ocl->op) {
744*20cf1dd8SToby Isaac   case LAPLACIAN:
745*20cf1dd8SToby Isaac     f1Flops = useAux ? dim : 0;break;
746*20cf1dd8SToby Isaac   case ELASTICITY:
747*20cf1dd8SToby Isaac     f1Flops = 2*dim*dim;break;
748*20cf1dd8SToby Isaac   }
749*20cf1dd8SToby Isaac   numFlops = Ne*(
750*20cf1dd8SToby Isaac     N_q*(
751*20cf1dd8SToby Isaac       N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
752*20cf1dd8SToby Isaac       /*+
753*20cf1dd8SToby Isaac        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
754*20cf1dd8SToby Isaac       +
755*20cf1dd8SToby Isaac       N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
756*20cf1dd8SToby Isaac     +
757*20cf1dd8SToby Isaac     N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
758*20cf1dd8SToby Isaac   ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr);
759*20cf1dd8SToby Isaac   /* Cleanup */
760*20cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr);
761*20cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr);
762*20cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr);
763*20cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr);
764*20cf1dd8SToby Isaac   ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr);
765*20cf1dd8SToby Isaac   ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
766*20cf1dd8SToby Isaac   ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
767*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
768*20cf1dd8SToby Isaac }
769*20cf1dd8SToby Isaac 
770*20cf1dd8SToby Isaac PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
771*20cf1dd8SToby Isaac {
772*20cf1dd8SToby Isaac   PetscFunctionBegin;
773*20cf1dd8SToby Isaac   fem->ops->setfromoptions          = NULL;
774*20cf1dd8SToby Isaac   fem->ops->setup                   = PetscFESetUp_Basic;
775*20cf1dd8SToby Isaac   fem->ops->view                    = NULL;
776*20cf1dd8SToby Isaac   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
777*20cf1dd8SToby Isaac   fem->ops->getdimension            = PetscFEGetDimension_Basic;
778*20cf1dd8SToby Isaac   fem->ops->gettabulation           = PetscFEGetTabulation_Basic;
779*20cf1dd8SToby Isaac   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
780*20cf1dd8SToby Isaac   fem->ops->integratebdresidual     = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
781*20cf1dd8SToby Isaac   fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
782*20cf1dd8SToby Isaac   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
783*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
784*20cf1dd8SToby Isaac }
785*20cf1dd8SToby Isaac 
786*20cf1dd8SToby Isaac /*MC
787*20cf1dd8SToby Isaac   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
788*20cf1dd8SToby Isaac 
789*20cf1dd8SToby Isaac   Level: intermediate
790*20cf1dd8SToby Isaac 
791*20cf1dd8SToby Isaac .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
792*20cf1dd8SToby Isaac M*/
793*20cf1dd8SToby Isaac 
794*20cf1dd8SToby Isaac PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
795*20cf1dd8SToby Isaac {
796*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl;
797*20cf1dd8SToby Isaac   cl_uint         num_platforms;
798*20cf1dd8SToby Isaac   cl_platform_id  platform_ids[42];
799*20cf1dd8SToby Isaac   cl_uint         num_devices;
800*20cf1dd8SToby Isaac   cl_device_id    device_ids[42];
801*20cf1dd8SToby Isaac   cl_int          ierr2;
802*20cf1dd8SToby Isaac   PetscErrorCode  ierr;
803*20cf1dd8SToby Isaac 
804*20cf1dd8SToby Isaac   PetscFunctionBegin;
805*20cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
806*20cf1dd8SToby Isaac   ierr      = PetscNewLog(fem,&ocl);CHKERRQ(ierr);
807*20cf1dd8SToby Isaac   fem->data = ocl;
808*20cf1dd8SToby Isaac 
809*20cf1dd8SToby Isaac   /* Init Platform */
810*20cf1dd8SToby Isaac   ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr);
811*20cf1dd8SToby Isaac   if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
812*20cf1dd8SToby Isaac   ocl->pf_id = platform_ids[0];
813*20cf1dd8SToby Isaac   /* Init Device */
814*20cf1dd8SToby Isaac   ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr);
815*20cf1dd8SToby Isaac   if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
816*20cf1dd8SToby Isaac   ocl->dev_id = device_ids[0];
817*20cf1dd8SToby Isaac   /* Create context with one command queue */
818*20cf1dd8SToby Isaac   ocl->ctx_id   = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &ierr2);CHKERRQ(ierr2);
819*20cf1dd8SToby Isaac   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr2);CHKERRQ(ierr2);
820*20cf1dd8SToby Isaac   /* Types */
821*20cf1dd8SToby Isaac   ocl->realType = PETSC_FLOAT;
822*20cf1dd8SToby Isaac   /* Register events */
823*20cf1dd8SToby Isaac   ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr);
824*20cf1dd8SToby Isaac   /* Equation handling */
825*20cf1dd8SToby Isaac   ocl->op = LAPLACIAN;
826*20cf1dd8SToby Isaac 
827*20cf1dd8SToby Isaac   ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr);
828*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
829*20cf1dd8SToby Isaac }
830*20cf1dd8SToby Isaac 
831*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
832*20cf1dd8SToby Isaac {
833*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
834*20cf1dd8SToby Isaac 
835*20cf1dd8SToby Isaac   PetscFunctionBegin;
836*20cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
837*20cf1dd8SToby Isaac   ocl->realType = realType;
838*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
839*20cf1dd8SToby Isaac }
840*20cf1dd8SToby Isaac 
841*20cf1dd8SToby Isaac PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
842*20cf1dd8SToby Isaac {
843*20cf1dd8SToby Isaac   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
844*20cf1dd8SToby Isaac 
845*20cf1dd8SToby Isaac   PetscFunctionBegin;
846*20cf1dd8SToby Isaac   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
847*20cf1dd8SToby Isaac   PetscValidPointer(realType, 2);
848*20cf1dd8SToby Isaac   *realType = ocl->realType;
849*20cf1dd8SToby Isaac   PetscFunctionReturn(0);
850*20cf1dd8SToby Isaac }
851*20cf1dd8SToby Isaac 
852*20cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */
853*20cf1dd8SToby Isaac 
854