xref: /libCEED/examples/petsc/bps.c (revision 1e28448205f356470f1379f42edf372223940310)
1819eb1b3Sjeremylt // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2819eb1b3Sjeremylt // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3819eb1b3Sjeremylt // reserved. See files LICENSE and NOTICE for details.
4819eb1b3Sjeremylt //
5819eb1b3Sjeremylt // This file is part of CEED, a collection of benchmarks, miniapps, software
6819eb1b3Sjeremylt // libraries and APIs for efficient high-order finite element and spectral
7819eb1b3Sjeremylt // element discretizations for exascale applications. For more information and
8819eb1b3Sjeremylt // source code availability see http://github.com/ceed.
9819eb1b3Sjeremylt //
10819eb1b3Sjeremylt // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11819eb1b3Sjeremylt // a collaborative effort of two U.S. Department of Energy organizations (Office
12819eb1b3Sjeremylt // of Science and the National Nuclear Security Administration) responsible for
13819eb1b3Sjeremylt // the planning and preparation of a capable exascale ecosystem, including
14819eb1b3Sjeremylt // software, applications, hardware, advanced system engineering and early
15819eb1b3Sjeremylt // testbed platforms, in support of the nation's exascale computing imperative.
16819eb1b3Sjeremylt 
170c59ef15SJeremy L Thompson //                        libCEED + PETSc Example: CEED BPs
180c59ef15SJeremy L Thompson //
190c59ef15SJeremy L Thompson // This example demonstrates a simple usage of libCEED with PETSc to solve the
200c59ef15SJeremy L Thompson // CEED BP benchmark problems, see http://ceed.exascaleproject.org/bps.
210c59ef15SJeremy L Thompson //
22cb32e2e7SValeria Barra // The code uses higher level communication protocols in DMPlex.
230c59ef15SJeremy L Thompson //
240c59ef15SJeremy L Thompson // Build with:
250c59ef15SJeremy L Thompson //
260c59ef15SJeremy L Thompson //     make bps [PETSC_DIR=</path/to/petsc>] [CEED_DIR=</path/to/libceed>]
270c59ef15SJeremy L Thompson //
280c59ef15SJeremy L Thompson // Sample runs:
290c59ef15SJeremy L Thompson //
3032d2ee49SValeria Barra //     ./bps -problem bp1 -degree 3
3132d2ee49SValeria Barra //     ./bps -problem bp2 -ceed /cpu/self -degree 3
3232d2ee49SValeria Barra //     ./bps -problem bp3 -ceed /gpu/occa -degree 3
3332d2ee49SValeria Barra //     ./bps -problem bp4 -ceed /cpu/occa -degree 3
3432d2ee49SValeria Barra //     ./bps -problem bp5 -ceed /omp/occa -degree 3
3532d2ee49SValeria Barra //     ./bps -problem bp6 -ceed /ocl/occa -degree 3
360c59ef15SJeremy L Thompson //
37cb32e2e7SValeria Barra //TESTARGS -ceed {ceed_resource} -test -problem bp5 -degree 3
380c59ef15SJeremy L Thompson 
390c59ef15SJeremy L Thompson /// @file
40cb32e2e7SValeria Barra /// CEED BPs example using PETSc with DMPlex
41cb32e2e7SValeria Barra /// See bpsraw.c for a "raw" implementation using a structured grid.
42cb32e2e7SValeria Barra const char help[] = "Solve CEED BPs using PETSc with DMPlex\n";
430c59ef15SJeremy L Thompson 
440c59ef15SJeremy L Thompson #include <stdbool.h>
450c59ef15SJeremy L Thompson #include <string.h>
464d537eeaSYohann #include <petscksp.h>
47cb32e2e7SValeria Barra #include <petscdmplex.h>
484d537eeaSYohann #include <ceed.h>
49cb32e2e7SValeria Barra #include "setup.h"
500c59ef15SJeremy L Thompson 
51d5b2ba77SJed Brown // -----------------------------------------------------------------------------
52d5b2ba77SJed Brown // Utilities
53d5b2ba77SJed Brown // -----------------------------------------------------------------------------
54d5b2ba77SJed Brown 
55d5b2ba77SJed Brown // Utility function, compute three factors of an integer
56d5b2ba77SJed Brown static void Split3(PetscInt size, PetscInt m[3], bool reverse) {
57d5b2ba77SJed Brown   for (PetscInt d=0,sizeleft=size; d<3; d++) {
58d5b2ba77SJed Brown     PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(sizeleft, 1./(3 - d)));
59d5b2ba77SJed Brown     while (try * (sizeleft / try) != sizeleft) try++;
60d5b2ba77SJed Brown     m[reverse ? 2-d : d] = try;
61d5b2ba77SJed Brown     sizeleft /= try;
62d5b2ba77SJed Brown   }
63d5b2ba77SJed Brown }
64d5b2ba77SJed Brown 
65d5b2ba77SJed Brown static int Max3(const PetscInt a[3]) {
66d5b2ba77SJed Brown   return PetscMax(a[0], PetscMax(a[1], a[2]));
67d5b2ba77SJed Brown }
68d5b2ba77SJed Brown 
69d5b2ba77SJed Brown static int Min3(const PetscInt a[3]) {
70d5b2ba77SJed Brown   return PetscMin(a[0], PetscMin(a[1], a[2]));
71d5b2ba77SJed Brown }
72d5b2ba77SJed Brown 
73*1e284482Svaleriabarra // -----------------------------------------------------------------------------
74*1e284482Svaleriabarra // Parameter structure for running problems
75*1e284482Svaleriabarra // -----------------------------------------------------------------------------
76*1e284482Svaleriabarra typedef struct RunParams_ *RunParams;
77*1e284482Svaleriabarra struct RunParams_ {
780c59ef15SJeremy L Thompson   MPI_Comm comm;
79*1e284482Svaleriabarra   PetscBool test_mode, benchmark_mode, read_mesh, userlnodes, setmemtyperequest,
80*1e284482Svaleriabarra   petschavecuda, write_solution;
81*1e284482Svaleriabarra   char *filename, *ceedresource;
82*1e284482Svaleriabarra   PetscInt localnodes, degree, qextra, dim, ncompu, *melem;
83*1e284482Svaleriabarra   bpType bpchoice;
84*1e284482Svaleriabarra   CeedMemType memtyperequested;
8509a940d7Sjeremylt   PetscLogStage solvestage;
86*1e284482Svaleriabarra };
87*1e284482Svaleriabarra 
88*1e284482Svaleriabarra // -----------------------------------------------------------------------------
89*1e284482Svaleriabarra // Main body of program, called in a loop for performance benchmarking purposes
90*1e284482Svaleriabarra // -----------------------------------------------------------------------------
91*1e284482Svaleriabarra 
92*1e284482Svaleriabarra static PetscErrorCode Run(RunParams rp) {
93*1e284482Svaleriabarra   PetscInt ierr;
94*1e284482Svaleriabarra   double my_rt_start, my_rt, rt_min, rt_max;
95*1e284482Svaleriabarra   PetscInt xlsize, lsize, gsize;
96*1e284482Svaleriabarra   PetscScalar *r;
97819eb1b3Sjeremylt   Vec X, Xloc, rhs, rhsloc;
98cb32e2e7SValeria Barra   Mat matO;
99819eb1b3Sjeremylt   KSP ksp;
100cb32e2e7SValeria Barra   DM  dm;
101cb32e2e7SValeria Barra   UserO userO;
1020c59ef15SJeremy L Thompson   Ceed ceed;
103cb32e2e7SValeria Barra   CeedData ceeddata;
104c70bd2a0Sjeremylt   CeedQFunction qferror;
105c70bd2a0Sjeremylt   CeedOperator operror;
106cb32e2e7SValeria Barra   CeedVector rhsceed, target;
107*1e284482Svaleriabarra 
108*1e284482Svaleriabarra   PetscFunctionBeginUser;
109*1e284482Svaleriabarra   // Setup DM
110*1e284482Svaleriabarra   if (rp->read_mesh) {
111*1e284482Svaleriabarra     ierr = DMPlexCreateFromFile(PETSC_COMM_WORLD, rp->filename, PETSC_TRUE, &dm);
112*1e284482Svaleriabarra     CHKERRQ(ierr);
113*1e284482Svaleriabarra   } else {
114*1e284482Svaleriabarra     if (rp->userlnodes) {
115*1e284482Svaleriabarra       // Find a nicely composite number of elements no less than global nodes
116*1e284482Svaleriabarra       PetscMPIInt size;
117*1e284482Svaleriabarra       ierr = MPI_Comm_size(rp->comm, &size); CHKERRQ(ierr);
118*1e284482Svaleriabarra       for (PetscInt gelem =
119*1e284482Svaleriabarra            PetscMax(1, size * rp->localnodes / PetscPowInt(rp->degree, rp->dim));
120*1e284482Svaleriabarra            ;
121*1e284482Svaleriabarra            gelem++) {
122*1e284482Svaleriabarra         Split3(gelem, rp->melem, true);
123*1e284482Svaleriabarra         if (Max3(rp->melem) / Min3(rp->melem) <= 2) break;
124*1e284482Svaleriabarra       }
125*1e284482Svaleriabarra     }
126*1e284482Svaleriabarra     ierr = DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, PETSC_FALSE, rp->melem,
127*1e284482Svaleriabarra                                NULL, NULL, NULL, PETSC_TRUE, &dm); CHKERRQ(ierr);
128*1e284482Svaleriabarra   }
129*1e284482Svaleriabarra 
130*1e284482Svaleriabarra   {
131*1e284482Svaleriabarra     DM dmDist = NULL;
132*1e284482Svaleriabarra     PetscPartitioner part;
133*1e284482Svaleriabarra 
134*1e284482Svaleriabarra     ierr = DMPlexGetPartitioner(dm, &part); CHKERRQ(ierr);
135*1e284482Svaleriabarra     ierr = PetscPartitionerSetFromOptions(part); CHKERRQ(ierr);
136*1e284482Svaleriabarra     ierr = DMPlexDistribute(dm, 0, NULL, &dmDist); CHKERRQ(ierr);
137*1e284482Svaleriabarra     if (dmDist) {
138*1e284482Svaleriabarra       ierr = DMDestroy(&dm); CHKERRQ(ierr);
139*1e284482Svaleriabarra       dm  = dmDist;
140*1e284482Svaleriabarra     }
141*1e284482Svaleriabarra   }
142*1e284482Svaleriabarra 
143*1e284482Svaleriabarra   // Set up libCEED
144*1e284482Svaleriabarra   CeedInit(rp->ceedresource, &ceed);
145*1e284482Svaleriabarra   CeedMemType memtypebackend;
146*1e284482Svaleriabarra   CeedGetPreferredMemType(ceed, &memtypebackend);
147*1e284482Svaleriabarra 
148*1e284482Svaleriabarra   // Check memtype compatibility
149*1e284482Svaleriabarra   if (!rp->setmemtyperequest)
150*1e284482Svaleriabarra     rp->memtyperequested = memtypebackend;
151*1e284482Svaleriabarra   else if (!rp->petschavecuda && rp->memtyperequested == CEED_MEM_DEVICE)
152*1e284482Svaleriabarra     SETERRQ1(PETSC_COMM_WORLD, PETSC_ERR_SUP_SYS,
153*1e284482Svaleriabarra              "PETSc was not built with CUDA. "
154*1e284482Svaleriabarra              "Requested MemType CEED_MEM_DEVICE is not supported.", NULL);
155*1e284482Svaleriabarra 
156*1e284482Svaleriabarra   // Create DM
157*1e284482Svaleriabarra   ierr = SetupDMByDegree(dm, rp->degree, rp->ncompu, rp->bpchoice);
158*1e284482Svaleriabarra   CHKERRQ(ierr);
159*1e284482Svaleriabarra 
160*1e284482Svaleriabarra   // Create vectors
161*1e284482Svaleriabarra   if (rp->memtyperequested == CEED_MEM_DEVICE) {
162*1e284482Svaleriabarra     ierr = DMSetVecType(dm, VECCUDA); CHKERRQ(ierr);
163*1e284482Svaleriabarra   }
164*1e284482Svaleriabarra   ierr = DMCreateGlobalVector(dm, &X); CHKERRQ(ierr);
165*1e284482Svaleriabarra   ierr = VecGetLocalSize(X, &lsize); CHKERRQ(ierr);
166*1e284482Svaleriabarra   ierr = VecGetSize(X, &gsize); CHKERRQ(ierr);
167*1e284482Svaleriabarra   ierr = DMCreateLocalVector(dm, &Xloc); CHKERRQ(ierr);
168*1e284482Svaleriabarra   ierr = VecGetSize(Xloc, &xlsize); CHKERRQ(ierr);
169*1e284482Svaleriabarra   ierr = VecDuplicate(X, &rhs); CHKERRQ(ierr);
170*1e284482Svaleriabarra 
171*1e284482Svaleriabarra   // Operator
172*1e284482Svaleriabarra   ierr = PetscMalloc1(1, &userO); CHKERRQ(ierr);
173*1e284482Svaleriabarra   ierr = MatCreateShell(rp->comm, lsize, lsize, gsize, gsize,
174*1e284482Svaleriabarra                         userO, &matO); CHKERRQ(ierr);
175*1e284482Svaleriabarra   ierr = MatShellSetOperation(matO, MATOP_MULT,
176*1e284482Svaleriabarra                               (void(*)(void))MatMult_Ceed); CHKERRQ(ierr);
177*1e284482Svaleriabarra   ierr = MatShellSetOperation(matO, MATOP_GET_DIAGONAL,
178*1e284482Svaleriabarra                               (void(*)(void))MatGetDiag); CHKERRQ(ierr);
179*1e284482Svaleriabarra   if (rp->memtyperequested == CEED_MEM_DEVICE) {
180*1e284482Svaleriabarra     ierr = MatShellSetVecType(matO, VECCUDA); CHKERRQ(ierr);
181*1e284482Svaleriabarra   }
182*1e284482Svaleriabarra 
183*1e284482Svaleriabarra   // Print summary
184*1e284482Svaleriabarra   if (!rp->test_mode) {
185*1e284482Svaleriabarra     PetscInt P = rp->degree + 1, Q = P + rp->qextra;
186*1e284482Svaleriabarra 
187*1e284482Svaleriabarra     const char *usedresource;
188*1e284482Svaleriabarra     CeedGetResource(ceed, &usedresource);
189*1e284482Svaleriabarra 
190*1e284482Svaleriabarra     VecType vectype;
191*1e284482Svaleriabarra     ierr = VecGetType(X, &vectype); CHKERRQ(ierr);
192*1e284482Svaleriabarra 
193*1e284482Svaleriabarra     PetscInt cStart, cEnd;
194*1e284482Svaleriabarra     ierr = DMPlexGetHeightStratum(dm, 0, &cStart, &cEnd); CHKERRQ(ierr);
195*1e284482Svaleriabarra     ierr = PetscPrintf(rp->comm,
196*1e284482Svaleriabarra                        "\n-- CEED Benchmark Problem %d -- libCEED + PETSc --\n"
197*1e284482Svaleriabarra                        "  PETSc:\n"
198*1e284482Svaleriabarra                        "    PETSc Vec Type                     : %s\n"
199*1e284482Svaleriabarra                        "  libCEED:\n"
200*1e284482Svaleriabarra                        "    libCEED Backend                    : %s\n"
201*1e284482Svaleriabarra                        "    libCEED Backend MemType            : %s\n"
202*1e284482Svaleriabarra                        "    libCEED User Requested MemType     : %s\n"
203*1e284482Svaleriabarra                        "  Mesh:\n"
204*1e284482Svaleriabarra                        "    Number of 1D Basis Nodes (P)       : %d\n"
205*1e284482Svaleriabarra                        "    Number of 1D Quadrature Points (Q) : %d\n"
206*1e284482Svaleriabarra                        "    Global nodes                       : %D\n"
207*1e284482Svaleriabarra                        "    Local Elements                     : %D\n"
208*1e284482Svaleriabarra                        "    Owned nodes                        : %D\n"
209*1e284482Svaleriabarra                        "    DoF per node                       : %D\n",
210*1e284482Svaleriabarra                        rp->bpchoice+1, vectype, usedresource,
211*1e284482Svaleriabarra                        CeedMemTypes[memtypebackend],
212*1e284482Svaleriabarra                        (rp->setmemtyperequest) ?
213*1e284482Svaleriabarra                        CeedMemTypes[rp->memtyperequested] : "none",
214*1e284482Svaleriabarra                        P, Q, gsize/rp->ncompu, cEnd - cStart, lsize/rp->ncompu,
215*1e284482Svaleriabarra                        rp->ncompu);
216*1e284482Svaleriabarra     CHKERRQ(ierr);
217*1e284482Svaleriabarra   }
218*1e284482Svaleriabarra 
219*1e284482Svaleriabarra   // Create RHS vector
220*1e284482Svaleriabarra   ierr = VecDuplicate(Xloc, &rhsloc); CHKERRQ(ierr);
221*1e284482Svaleriabarra   ierr = VecZeroEntries(rhsloc); CHKERRQ(ierr);
222*1e284482Svaleriabarra   if (rp->memtyperequested == CEED_MEM_HOST) {
223*1e284482Svaleriabarra     ierr = VecGetArray(rhsloc, &r); CHKERRQ(ierr);
224*1e284482Svaleriabarra   } else {
225*1e284482Svaleriabarra     ierr = VecCUDAGetArray(rhsloc, &r); CHKERRQ(ierr);
226*1e284482Svaleriabarra   }
227*1e284482Svaleriabarra   CeedVectorCreate(ceed, xlsize, &rhsceed);
228*1e284482Svaleriabarra   CeedVectorSetArray(rhsceed, rp->memtyperequested, CEED_USE_POINTER, r);
229*1e284482Svaleriabarra 
230*1e284482Svaleriabarra   ierr = PetscMalloc1(1, &ceeddata); CHKERRQ(ierr);
231*1e284482Svaleriabarra   ierr = SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->qextra,
232*1e284482Svaleriabarra                               rp->ncompu, gsize, xlsize, rp->bpchoice, ceeddata,
233*1e284482Svaleriabarra                               true, rhsceed, &target); CHKERRQ(ierr);
234*1e284482Svaleriabarra 
235*1e284482Svaleriabarra   // Gather RHS
236*1e284482Svaleriabarra   CeedVectorSyncArray(rhsceed, rp->memtyperequested);
237*1e284482Svaleriabarra   if (rp->memtyperequested == CEED_MEM_HOST) {
238*1e284482Svaleriabarra     ierr = VecRestoreArray(rhsloc, &r); CHKERRQ(ierr);
239*1e284482Svaleriabarra   } else {
240*1e284482Svaleriabarra     ierr = VecCUDARestoreArray(rhsloc, &r); CHKERRQ(ierr);
241*1e284482Svaleriabarra   }
242*1e284482Svaleriabarra   ierr = VecZeroEntries(rhs); CHKERRQ(ierr);
243*1e284482Svaleriabarra   ierr = DMLocalToGlobalBegin(dm, rhsloc, ADD_VALUES, rhs); CHKERRQ(ierr);
244*1e284482Svaleriabarra   ierr = DMLocalToGlobalEnd(dm, rhsloc, ADD_VALUES, rhs); CHKERRQ(ierr);
245*1e284482Svaleriabarra   CeedVectorDestroy(&rhsceed);
246*1e284482Svaleriabarra 
247*1e284482Svaleriabarra   // Create the error Q-function
248*1e284482Svaleriabarra   CeedQFunctionCreateInterior(ceed, 1, bpOptions[rp->bpchoice].error,
249*1e284482Svaleriabarra                               bpOptions[rp->bpchoice].errorfname, &qferror);
250*1e284482Svaleriabarra   CeedQFunctionAddInput(qferror, "u", rp->ncompu, CEED_EVAL_INTERP);
251*1e284482Svaleriabarra   CeedQFunctionAddInput(qferror, "true_soln", rp->ncompu, CEED_EVAL_NONE);
252*1e284482Svaleriabarra   CeedQFunctionAddOutput(qferror, "error", rp->ncompu, CEED_EVAL_NONE);
253*1e284482Svaleriabarra 
254*1e284482Svaleriabarra   // Create the error operator
255*1e284482Svaleriabarra   CeedOperatorCreate(ceed, qferror, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE,
256*1e284482Svaleriabarra                      &operror);
257*1e284482Svaleriabarra   CeedOperatorSetField(operror, "u", ceeddata->Erestrictu,
258*1e284482Svaleriabarra                        ceeddata->basisu, CEED_VECTOR_ACTIVE);
259*1e284482Svaleriabarra   CeedOperatorSetField(operror, "true_soln", ceeddata->Erestrictui,
260*1e284482Svaleriabarra                        CEED_BASIS_COLLOCATED, target);
261*1e284482Svaleriabarra   CeedOperatorSetField(operror, "error", ceeddata->Erestrictui,
262*1e284482Svaleriabarra                        CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE);
263*1e284482Svaleriabarra 
264*1e284482Svaleriabarra   // Set up Mat
265*1e284482Svaleriabarra   userO->comm = rp->comm;
266*1e284482Svaleriabarra   userO->dm = dm;
267*1e284482Svaleriabarra   userO->Xloc = Xloc;
268*1e284482Svaleriabarra   ierr = VecDuplicate(Xloc, &userO->Yloc); CHKERRQ(ierr);
269*1e284482Svaleriabarra   userO->xceed = ceeddata->xceed;
270*1e284482Svaleriabarra   userO->yceed = ceeddata->yceed;
271*1e284482Svaleriabarra   userO->op = ceeddata->opapply;
272*1e284482Svaleriabarra   userO->ceed = ceed;
273*1e284482Svaleriabarra   userO->memtype = rp->memtyperequested;
274*1e284482Svaleriabarra   if (rp->memtyperequested == CEED_MEM_HOST) {
275*1e284482Svaleriabarra     userO->VecGetArray = VecGetArray;
276*1e284482Svaleriabarra     userO->VecGetArrayRead = VecGetArrayRead;
277*1e284482Svaleriabarra     userO->VecRestoreArray = VecRestoreArray;
278*1e284482Svaleriabarra     userO->VecRestoreArrayRead = VecRestoreArrayRead;
279*1e284482Svaleriabarra   } else {
280*1e284482Svaleriabarra     userO->VecGetArray = VecCUDAGetArray;
281*1e284482Svaleriabarra     userO->VecGetArrayRead = VecCUDAGetArrayRead;
282*1e284482Svaleriabarra     userO->VecRestoreArray = VecCUDARestoreArray;
283*1e284482Svaleriabarra     userO->VecRestoreArrayRead = VecCUDARestoreArrayRead;
284*1e284482Svaleriabarra   }
285*1e284482Svaleriabarra 
286*1e284482Svaleriabarra   ierr = KSPCreate(rp->comm, &ksp); CHKERRQ(ierr);
287*1e284482Svaleriabarra   {
288*1e284482Svaleriabarra     PC pc;
289*1e284482Svaleriabarra     ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr);
290*1e284482Svaleriabarra     if (rp->bpchoice == CEED_BP1 || rp->bpchoice == CEED_BP2) {
291*1e284482Svaleriabarra       ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr);
292*1e284482Svaleriabarra       ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr);
293*1e284482Svaleriabarra     } else {
294*1e284482Svaleriabarra       ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr);
295*1e284482Svaleriabarra     }
296*1e284482Svaleriabarra     ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr);
297*1e284482Svaleriabarra     ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr);
298*1e284482Svaleriabarra     ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT,
299*1e284482Svaleriabarra                             PETSC_DEFAULT); CHKERRQ(ierr);
300*1e284482Svaleriabarra   }
301*1e284482Svaleriabarra   ierr = KSPSetOperators(ksp, matO, matO); CHKERRQ(ierr);
302*1e284482Svaleriabarra 
303*1e284482Svaleriabarra   // First run, if benchmarking
304*1e284482Svaleriabarra   if (rp->benchmark_mode) {
305*1e284482Svaleriabarra     ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1);
306*1e284482Svaleriabarra     CHKERRQ(ierr);
307*1e284482Svaleriabarra     my_rt_start = MPI_Wtime();
308*1e284482Svaleriabarra     ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr);
309*1e284482Svaleriabarra     my_rt = MPI_Wtime() - my_rt_start;
310*1e284482Svaleriabarra     ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, rp->comm);
311*1e284482Svaleriabarra     CHKERRQ(ierr);
312*1e284482Svaleriabarra     // Set maxits based on first iteration timing
313*1e284482Svaleriabarra     if (my_rt > 0.02) {
314*1e284482Svaleriabarra       ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 5);
315*1e284482Svaleriabarra       CHKERRQ(ierr);
316*1e284482Svaleriabarra     } else {
317*1e284482Svaleriabarra       ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20);
318*1e284482Svaleriabarra       CHKERRQ(ierr);
319*1e284482Svaleriabarra     }
320*1e284482Svaleriabarra   }
321*1e284482Svaleriabarra   ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr);
322*1e284482Svaleriabarra 
323*1e284482Svaleriabarra   // Timed solve
324*1e284482Svaleriabarra   ierr = VecZeroEntries(X); CHKERRQ(ierr);
325*1e284482Svaleriabarra   ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr);
326*1e284482Svaleriabarra 
327*1e284482Svaleriabarra   // -- Performance logging
328*1e284482Svaleriabarra   ierr = PetscLogStagePush(rp->solvestage); CHKERRQ(ierr);
329*1e284482Svaleriabarra 
330*1e284482Svaleriabarra   // -- Solve
331*1e284482Svaleriabarra   my_rt_start = MPI_Wtime();
332*1e284482Svaleriabarra   ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr);
333*1e284482Svaleriabarra   my_rt = MPI_Wtime() - my_rt_start;
334*1e284482Svaleriabarra 
335*1e284482Svaleriabarra   // -- Performance logging
336*1e284482Svaleriabarra   ierr = PetscLogStagePop();
337*1e284482Svaleriabarra 
338*1e284482Svaleriabarra   // Output results
339*1e284482Svaleriabarra   {
340*1e284482Svaleriabarra     KSPType ksptype;
341*1e284482Svaleriabarra     KSPConvergedReason reason;
342*1e284482Svaleriabarra     PetscReal rnorm;
343*1e284482Svaleriabarra     PetscInt its;
344*1e284482Svaleriabarra     ierr = KSPGetType(ksp, &ksptype); CHKERRQ(ierr);
345*1e284482Svaleriabarra     ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr);
346*1e284482Svaleriabarra     ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr);
347*1e284482Svaleriabarra     ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr);
348*1e284482Svaleriabarra     if (!rp->test_mode || reason < 0 || rnorm > 1e-8) {
349*1e284482Svaleriabarra       ierr = PetscPrintf(rp->comm,
350*1e284482Svaleriabarra                          "  KSP:\n"
351*1e284482Svaleriabarra                          "    KSP Type                           : %s\n"
352*1e284482Svaleriabarra                          "    KSP Convergence                    : %s\n"
353*1e284482Svaleriabarra                          "    Total KSP Iterations               : %D\n"
354*1e284482Svaleriabarra                          "    Final rnorm                        : %e\n",
355*1e284482Svaleriabarra                          ksptype, KSPConvergedReasons[reason], its,
356*1e284482Svaleriabarra                          (double)rnorm); CHKERRQ(ierr);
357*1e284482Svaleriabarra     }
358*1e284482Svaleriabarra     if (!rp->test_mode) {
359*1e284482Svaleriabarra       ierr = PetscPrintf(rp->comm,"  Performance:\n"); CHKERRQ(ierr);
360*1e284482Svaleriabarra     }
361*1e284482Svaleriabarra     {
362*1e284482Svaleriabarra       PetscReal maxerror;
363*1e284482Svaleriabarra       ierr = ComputeErrorMax(userO, operror, X, target, &maxerror);
364*1e284482Svaleriabarra       CHKERRQ(ierr);
365*1e284482Svaleriabarra       PetscReal tol = 5e-2;
366*1e284482Svaleriabarra       if (!rp->test_mode || maxerror > tol) {
367*1e284482Svaleriabarra         ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm);
368*1e284482Svaleriabarra         CHKERRQ(ierr);
369*1e284482Svaleriabarra         ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm);
370*1e284482Svaleriabarra         CHKERRQ(ierr);
371*1e284482Svaleriabarra         ierr = PetscPrintf(rp->comm,
372*1e284482Svaleriabarra                            "    Pointwise Error (max)              : %e\n"
373*1e284482Svaleriabarra                            "    CG Solve Time                      : %g (%g) sec\n",
374*1e284482Svaleriabarra                            (double)maxerror, rt_max, rt_min); CHKERRQ(ierr);
375*1e284482Svaleriabarra       }
376*1e284482Svaleriabarra     }
377*1e284482Svaleriabarra     if (rp->benchmark_mode && (!rp->test_mode)) {
378*1e284482Svaleriabarra       ierr = PetscPrintf(rp->comm,
379*1e284482Svaleriabarra                          "    DoFs/Sec in CG                     : %g (%g) million\n",
380*1e284482Svaleriabarra                          1e-6*gsize*its/rt_max,
381*1e284482Svaleriabarra                          1e-6*gsize*its/rt_min); CHKERRQ(ierr);
382*1e284482Svaleriabarra     }
383*1e284482Svaleriabarra   }
384*1e284482Svaleriabarra 
385*1e284482Svaleriabarra   if (rp->write_solution) {
386*1e284482Svaleriabarra     PetscViewer vtkviewersoln;
387*1e284482Svaleriabarra 
388*1e284482Svaleriabarra     ierr = PetscViewerCreate(rp->comm, &vtkviewersoln); CHKERRQ(ierr);
389*1e284482Svaleriabarra     ierr = PetscViewerSetType(vtkviewersoln, PETSCVIEWERVTK); CHKERRQ(ierr);
390*1e284482Svaleriabarra     ierr = PetscViewerFileSetName(vtkviewersoln, "solution.vtk"); CHKERRQ(ierr);
391*1e284482Svaleriabarra     ierr = VecView(X, vtkviewersoln); CHKERRQ(ierr);
392*1e284482Svaleriabarra     ierr = PetscViewerDestroy(&vtkviewersoln); CHKERRQ(ierr);
393*1e284482Svaleriabarra   }
394*1e284482Svaleriabarra 
395*1e284482Svaleriabarra   // Cleanup
396*1e284482Svaleriabarra   ierr = VecDestroy(&X); CHKERRQ(ierr);
397*1e284482Svaleriabarra   ierr = VecDestroy(&Xloc); CHKERRQ(ierr);
398*1e284482Svaleriabarra   ierr = VecDestroy(&userO->Yloc); CHKERRQ(ierr);
399*1e284482Svaleriabarra   ierr = MatDestroy(&matO); CHKERRQ(ierr);
400*1e284482Svaleriabarra   ierr = PetscFree(userO); CHKERRQ(ierr);
401*1e284482Svaleriabarra   ierr = CeedDataDestroy(0, ceeddata); CHKERRQ(ierr);
402*1e284482Svaleriabarra   ierr = DMDestroy(&dm); CHKERRQ(ierr);
403*1e284482Svaleriabarra 
404*1e284482Svaleriabarra   ierr = VecDestroy(&rhs); CHKERRQ(ierr);
405*1e284482Svaleriabarra   ierr = VecDestroy(&rhsloc); CHKERRQ(ierr);
406*1e284482Svaleriabarra   ierr = KSPDestroy(&ksp); CHKERRQ(ierr);
407*1e284482Svaleriabarra   CeedVectorDestroy(&target);
408*1e284482Svaleriabarra   CeedQFunctionDestroy(&qferror);
409*1e284482Svaleriabarra   CeedOperatorDestroy(&operror);
410*1e284482Svaleriabarra   CeedDestroy(&ceed);
411*1e284482Svaleriabarra   PetscFunctionReturn(0);
412*1e284482Svaleriabarra }
413*1e284482Svaleriabarra 
414*1e284482Svaleriabarra int main(int argc, char **argv) {
415*1e284482Svaleriabarra   PetscInt ierr, commsize;
416*1e284482Svaleriabarra   RunParams rp;
417*1e284482Svaleriabarra   MPI_Comm comm, splitcomm;
418*1e284482Svaleriabarra   PetscLogStage solvestage;
419*1e284482Svaleriabarra   char filename[PETSC_MAX_PATH_LEN],
420*1e284482Svaleriabarra        ceedresource[PETSC_MAX_PATH_LEN] = "/cpu/self";
421*1e284482Svaleriabarra 
422*1e284482Svaleriabarra   PetscInt degree = 3, qextra, dim = 3, melem[3] = {3, 3, 3},
423*1e284482Svaleriabarra            ncompu = 1,  localnodes, maxdegree = 8, totproc, procpercompnode, maxdofsnode = 3 * PetscPowInt(2, 20);
424*1e284482Svaleriabarra   PetscBool test_mode, benchmark_mode, read_mesh, write_solution,
425*1e284482Svaleriabarra             userlnodes = PETSC_FALSE;
4269396343dSjeremylt   CeedMemType memtyperequested;
427199551f5Sjeremylt   bpType bpchoice;
4280c59ef15SJeremy L Thompson 
4299396343dSjeremylt   // Check PETSc CUDA support
4309396343dSjeremylt   PetscBool petschavecuda, setmemtyperequest = PETSC_FALSE;
4319396343dSjeremylt   // *INDENT-OFF*
4329396343dSjeremylt   #ifdef PETSC_HAVE_CUDA
4339396343dSjeremylt   petschavecuda = PETSC_TRUE;
4349396343dSjeremylt   #else
4359396343dSjeremylt   petschavecuda = PETSC_FALSE;
4369396343dSjeremylt   #endif
4379396343dSjeremylt   // *INDENT-ON*
4389396343dSjeremylt 
439*1e284482Svaleriabarra   // Initialize PETSc
4400c59ef15SJeremy L Thompson   ierr = PetscInitialize(&argc, &argv, NULL, help);
4410c59ef15SJeremy L Thompson   if (ierr) return ierr;
4420c59ef15SJeremy L Thompson   comm = PETSC_COMM_WORLD;
443*1e284482Svaleriabarra   ierr = MPI_Comm_size(comm, &commsize);
444*1e284482Svaleriabarra   if (ierr != MPI_SUCCESS) return ierr;
445*1e284482Svaleriabarra   // Determine number of processes per compute node
446*1e284482Svaleriabarra   ierr = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
447*1e284482Svaleriabarra                              &splitcomm);
448*1e284482Svaleriabarra   if (ierr != MPI_SUCCESS) return ierr;
449*1e284482Svaleriabarra   ierr = MPI_Comm_size(splitcomm, &procpercompnode);
450*1e284482Svaleriabarra   if (ierr != MPI_SUCCESS) return ierr;
451cb32e2e7SValeria Barra 
45232d2ee49SValeria Barra   // Read command line options
453*1e284482Svaleriabarra   ierr = PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL);
454*1e284482Svaleriabarra   CHKERRQ(ierr);
455199551f5Sjeremylt   bpchoice = CEED_BP1;
4560c59ef15SJeremy L Thompson   ierr = PetscOptionsEnum("-problem",
4570c59ef15SJeremy L Thompson                           "CEED benchmark problem to solve", NULL,
458199551f5Sjeremylt                           bpTypes, (PetscEnum)bpchoice, (PetscEnum *)&bpchoice,
4590c59ef15SJeremy L Thompson                           NULL); CHKERRQ(ierr);
460199551f5Sjeremylt   ncompu = bpOptions[bpchoice].ncompu;
4610c59ef15SJeremy L Thompson   test_mode = PETSC_FALSE;
4620c59ef15SJeremy L Thompson   ierr = PetscOptionsBool("-test",
4630c59ef15SJeremy L Thompson                           "Testing mode (do not print unless error is large)",
4640c59ef15SJeremy L Thompson                           NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr);
4650c59ef15SJeremy L Thompson   benchmark_mode = PETSC_FALSE;
4660c59ef15SJeremy L Thompson   ierr = PetscOptionsBool("-benchmark",
4670c59ef15SJeremy L Thompson                           "Benchmarking mode (prints benchmark statistics)",
4680c59ef15SJeremy L Thompson                           NULL, benchmark_mode, &benchmark_mode, NULL);
4690c59ef15SJeremy L Thompson   CHKERRQ(ierr);
4706bd9afcaSjeremylt   write_solution = PETSC_FALSE;
4716bd9afcaSjeremylt   ierr = PetscOptionsBool("-write_solution",
4726bd9afcaSjeremylt                           "Write solution for visualization",
4736bd9afcaSjeremylt                           NULL, write_solution, &write_solution, NULL);
4746bd9afcaSjeremylt   CHKERRQ(ierr);
475cb32e2e7SValeria Barra   degree = test_mode ? 3 : 2;
4760c59ef15SJeremy L Thompson   ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis",
4770c59ef15SJeremy L Thompson                          NULL, degree, &degree, NULL); CHKERRQ(ierr);
478199551f5Sjeremylt   qextra = bpOptions[bpchoice].qextra;
4790c59ef15SJeremy L Thompson   ierr = PetscOptionsInt("-qextra", "Number of extra quadrature points",
4800c59ef15SJeremy L Thompson                          NULL, qextra, &qextra, NULL); CHKERRQ(ierr);
4810c59ef15SJeremy L Thompson   ierr = PetscOptionsString("-ceed", "CEED resource specifier",
4820c59ef15SJeremy L Thompson                             NULL, ceedresource, ceedresource,
4830c59ef15SJeremy L Thompson                             sizeof(ceedresource), NULL); CHKERRQ(ierr);
484cb32e2e7SValeria Barra   read_mesh = PETSC_FALSE;
485cb32e2e7SValeria Barra   ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL,
486cb32e2e7SValeria Barra                             filename, filename, sizeof(filename), &read_mesh);
4870c59ef15SJeremy L Thompson   CHKERRQ(ierr);
488cb32e2e7SValeria Barra   if (!read_mesh) {
489cb32e2e7SValeria Barra     PetscInt tmp = dim;
490cb32e2e7SValeria Barra     ierr = PetscOptionsIntArray("-cells", "Number of cells per dimension", NULL,
491cb32e2e7SValeria Barra                                 melem, &tmp, NULL); CHKERRQ(ierr);
492cb32e2e7SValeria Barra   }
4939396343dSjeremylt   memtyperequested = petschavecuda ? CEED_MEM_DEVICE : CEED_MEM_HOST;
4949396343dSjeremylt   ierr = PetscOptionsEnum("-memtype",
4959396343dSjeremylt                           "CEED MemType requested", NULL,
4969396343dSjeremylt                           memTypes, (PetscEnum)memtyperequested,
4979396343dSjeremylt                           (PetscEnum *)&memtyperequested, &setmemtyperequest);
4989396343dSjeremylt   CHKERRQ(ierr);
499153bcb04Svaleriabarra   localnodes = 1000;
500153bcb04Svaleriabarra   ierr = PetscOptionsInt("-local_nodes",
501153bcb04Svaleriabarra                          "Target number of locally owned nodes per process",
502153bcb04Svaleriabarra                          NULL, localnodes, &localnodes, &userlnodes);
503153bcb04Svaleriabarra   CHKERRQ(ierr);
504*1e284482Svaleriabarra   ierr = PetscOptionsInt("-max_degree",
505*1e284482Svaleriabarra                          "Max degree to run in benchmark mode",
506*1e284482Svaleriabarra                          NULL, maxdegree, &maxdegree, NULL); CHKERRQ(ierr);
507*1e284482Svaleriabarra   totproc = commsize;
508*1e284482Svaleriabarra   ierr = PetscOptionsInt("-n",
509*1e284482Svaleriabarra                          "Total number of processors across all compute nodes",
510*1e284482Svaleriabarra                          NULL, totproc, &totproc, NULL); CHKERRQ(ierr);
511*1e284482Svaleriabarra   ierr = PetscOptionsInt("-p",
512*1e284482Svaleriabarra                          "Number of processors per node",
513*1e284482Svaleriabarra                          NULL, procpercompnode, &procpercompnode, NULL);
514*1e284482Svaleriabarra   CHKERRQ(ierr);
5159396343dSjeremylt 
516cb32e2e7SValeria Barra   ierr = PetscOptionsEnd(); CHKERRQ(ierr);
517cb32e2e7SValeria Barra 
518*1e284482Svaleriabarra   // Register PETSc logging stage
51909a940d7Sjeremylt   ierr = PetscLogStageRegister("Solve Stage", &solvestage); CHKERRQ(ierr);
52009a940d7Sjeremylt 
521*1e284482Svaleriabarra   // Setup all parameters needed in Run()
522*1e284482Svaleriabarra   ierr = PetscMalloc1(1, &rp); CHKERRQ(ierr);
523*1e284482Svaleriabarra   rp->comm = comm;
524*1e284482Svaleriabarra   rp->test_mode = test_mode;
525*1e284482Svaleriabarra   rp->benchmark_mode = benchmark_mode;
526*1e284482Svaleriabarra   rp->read_mesh = read_mesh;
527*1e284482Svaleriabarra   rp->userlnodes = userlnodes;
528*1e284482Svaleriabarra   rp->setmemtyperequest = setmemtyperequest;
529*1e284482Svaleriabarra   rp->petschavecuda = petschavecuda;
530*1e284482Svaleriabarra   rp->write_solution = write_solution;
531*1e284482Svaleriabarra   rp->filename = filename;
532*1e284482Svaleriabarra   rp->ceedresource = ceedresource;
533*1e284482Svaleriabarra   rp->localnodes = localnodes;
534*1e284482Svaleriabarra   rp->degree = degree;
535*1e284482Svaleriabarra   rp->qextra = qextra;
536*1e284482Svaleriabarra   rp->dim = dim;
537*1e284482Svaleriabarra   rp->ncompu = ncompu;
538*1e284482Svaleriabarra   rp->melem = melem;
539*1e284482Svaleriabarra   rp->bpchoice = bpchoice;
540*1e284482Svaleriabarra   rp->memtyperequested = memtyperequested;
541*1e284482Svaleriabarra   rp->solvestage = solvestage;
542cb32e2e7SValeria Barra 
543*1e284482Svaleriabarra   if (benchmark_mode) {
544*1e284482Svaleriabarra     PetscInt maxlocnodes = maxdofsnode / procpercompnode;
545*1e284482Svaleriabarra     // In benchmark_mode, call the main body of the program in a loop
546*1e284482Svaleriabarra     for (PetscInt p = 1; p <= maxdegree; p++) {
547*1e284482Svaleriabarra       for (PetscInt e = 1; e * PetscPowInt(degree, 3) <= maxlocnodes; e *= 2) {
548*1e284482Svaleriabarra         rp->degree = p;
549*1e284482Svaleriabarra         rp->localnodes = e * PetscPowInt(degree, 3);
550*1e284482Svaleriabarra         ierr = Run(rp); CHKERRQ(ierr);
5510c59ef15SJeremy L Thompson       }
5520c59ef15SJeremy L Thompson     }
5530c59ef15SJeremy L Thompson   }
554*1e284482Svaleriabarra   else { // Othersise run program only once
555*1e284482Svaleriabarra     ierr = Run(rp); CHKERRQ(ierr);
5560c59ef15SJeremy L Thompson   }
5570c59ef15SJeremy L Thompson 
558*1e284482Svaleriabarra   // Clear memory
559*1e284482Svaleriabarra   ierr = PetscFree(rp); CHKERRQ(ierr);
5606bd9afcaSjeremylt 
5610c59ef15SJeremy L Thompson   return PetscFinalize();
5620c59ef15SJeremy L Thompson }
563