// solgmrpetsc.c (phasta/phSolver/compressible, revision 50a6f6340c649c1738186cf6fd8c42a882135a5f)
#ifdef HAVE_PETSC
#include <stdio.h>
#include <stdlib.h> /* malloc/free used below */
#include <stdint.h> /* uint64_t for the cycle timers below */

#include "petscsys.h"
#include "petscvec.h"
#include "petscmat.h"
#include "petscpc.h"
#include "petscksp.h"

#include <assert.h>
#include "common_c.h"

#include <FCMangle.h>
#define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
#define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
#define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
#define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
#define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
#define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))
#define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
#define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
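
/* The FortranCInterface_GLOBAL_ wrappers bind these C entry points to the
   Fortran-mangled symbols the rest of PHASTA calls; with the common GNU
   scheme, for example, SolGMRp would resolve to solgmrp_ (the actual
   mangling is detected at configure time by FCMangle). */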

typedef long long int gcorp_t;

void get_time(uint64_t* rv, uint64_t* cycle);
void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);

//#include "auxmpi.h"
//      static PetscOffset poff;

      static Mat lhsP;
      static PC pc;
      static KSP ksp;
      static Vec DyP, resP, DyPLocal, resPGlobal;
      static PetscErrorCode ierr;
      static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
      static IS LocalIndexSet, GlobalIndexSet;
      static ISLocalToGlobalMapping VectorMapping;
      static ISLocalToGlobalMapping GblVectorMapping;
      static VecScatter scatter7;
      static int firstpetsccall = 1;

      static Mat lhsPs;
      static PC pcs;
      static KSP ksps;
      static Vec DyPs, resPs, DyPLocals, resPGlobals;
      static IS LocalIndexSets, GlobalIndexSets;
      static ISLocalToGlobalMapping VectorMappings;
      static ISLocalToGlobalMapping GblVectorMappings;
      static VecScatter scatter7s;
      static int firstpetsccalls = 1;

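/* The unsuffixed objects above serve the nflow-variable flow solve
   (SolGMRp); the "s"-suffixed set is the independent state for the scalar
   solve (SolGMRpSclr). */

/* Illustrative sketch only (nothing below calls it): the ownership test
   implied by the [minowned, maxowned] range used throughout. fncorp holds
   1-based global node numbers. */
static inline int gcorpOwned(gcorp_t g, gcorp_t mbeg, gcorp_t mend) {
  return (g >= mbeg) && (g <= mend);
}
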
void     SolGMRp(double* y,         double* ac,        double* yold,
     	double* x,         int* iBC,       double* BC,
     	int* col,       int* row,       double* lhsk,
     	double* res,       double* BDiag,
     	int* iper,
     	int* ilwork,    double* shp,       double* shgl,      double* shpb,
     	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
{
//
// ----------------------------------------------------------------------
//
//  This is the preconditioned GMRES driver routine.
//
// input:
//  y      (nshg,ndof)           : Y-variables at n+alpha_v
//  ac     (nshg,ndof)           : primitive-variable acceleration at n+alpha_m
//  yold   (nshg,ndof)           : Y-variables at beginning of step
//  acold  (nshg,ndof)           : primitive-variable acceleration at beginning of step
//  x      (numnp,nsd)           : node coordinates
//  iBC    (nshg)                : BC codes
//  BC     (nshg,ndofBC)         : BC constraint parameters
//  shp(b) (nen,maxsh,melCat)    : element shape functions (boundary)
//  shgl(b)(nsd,nen,maxsh,melCat): local gradients of shape functions
//
// output:
//  res    (nshg,nflow)          : preconditioned residual
//  BDiag  (nshg,nflow,nflow)    : block-diagonal preconditioner
//
//
// Zdenek Johan,  Winter 1991.  (Fortran 90)
// ----------------------------------------------------------------------
//


// Get variables from common_c.h
      int nshg, nflow, nsd, iownnodes;
      nshg  = conpar.nshg;
      nflow = conpar.nflow;
      nsd = NSD;
      iownnodes = conpar.iownnodes;

      gcorp_t nshgt;
      gcorp_t mbeg;
      gcorp_t mend;
      nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
      mbeg = (gcorp_t) newdim.minowned;
      mend = (gcorp_t) newdim.maxowned;


      int node, element, var, eqn;
      double valtoinsert;
      int nenl, iel, lelCat, lcsyst, iorder, nshl;
      int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
      double DyFlat[nshg*nflow];
      double DyFlatPhasta[nshg*nflow];
      double rmes[nshg*nflow];
// DEBUG
      int i,j,k,l,m;
      //int indexsetary[nflow*nshg];
      //int gblindexsetary[nflow*nshg];
      PetscInt indexsetary[nflow*nshg];
      PetscInt gblindexsetary[nflow*nshg];
      PetscInt nodetoinsert;

      // FIXME: PetscScalar
      double  real_rtol, real_abstol, real_dtol;
// /DEBUG
      //double parray[1]; // Should be a PetscScalar
      double *parray; // Should be a PetscScalar
      PetscInt petsc_bs, petsc_m, petsc_M, petsc_PA;
      PetscInt petsc_n;
      PetscOne = 1;
      uint64_t duration[4];

//      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));

       if(firstpetsccall == 1) {
/*         call get_time(duration(1),duration(2));
         call system('sleep 10');
         call get_time(duration(3),duration(4));
         call get_max_time_diff(duration(1), duration(3),
                              duration(2), duration(4),
                              "TenSecondSleep " // char(0))
         if(myrank .eq. 0) {
           !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
         }
*/
       }
//
//
// .... *******************>> Element Data Formation <<******************
//
//
// .... set the parameters for flux and surface tension calculations
//
//
      genpar.idflx = 0 ;
      if(genpar.idiff >= 1)  genpar.idflx = genpar.idflx + (nflow-1) * nsd;
      if (genpar.isurf == 1) genpar.idflx = genpar.idflx + nsd;


      PetscInt* idiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
      PetscInt* iodiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
      gcorp_t glbNZ;

      if(firstpetsccall == 1) {
        for(i=0;i<iownnodes;i++) {
             idiagnz[i]=0;
             iodiagnz[i]=0;
        }
        i=0;
        for(k=0;k<nshg;k++) {
          if((fncorp[k] < mbeg) || (fncorp[k] > mend)){
// this node is not owned by this rank so we skip
          } else {
           for(j=col[i]-1;j<col[i+1]-1;j++) {
             glbNZ=fncorp[row[j]];
             if((glbNZ < mbeg) || (glbNZ > mend)) {
                iodiagnz[i]++;
             } else {
                idiagnz[i]++;
             }
           }
           i++;
          }
        }
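// Each owned node now carries a count of same-rank (diagonal-block) and
// other-rank (off-diagonal-block) column entries, i.e. the d_nnz/o_nnz
// arrays PETSc preallocation expects. Note that col[] is indexed with the
// owned-node counter i rather than k, which assumes the locally owned
// nodes come first in the local numbering.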
        gcorp_t mind=1000;
        gcorp_t mino=1000;
        gcorp_t maxd=0;
        gcorp_t maxo=0;
        for(i=0;i<iownnodes;i++) {
           mind=min(mind,idiagnz[i]);
           mino=min(mino,iodiagnz[i]);
           maxd=max(maxd,idiagnz[i]);
           maxo=max(maxo,iodiagnz[i]);
//           iodiagnz[i]=max(iodiagnz[i],10);
//           idiagnz[i]=max(idiagnz[i],10);
//           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
//           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
        }
// The exact per-row counts above were accurate, but the constant fill below
// assembles faster at a modest memory cost. Once rows are padded to a
// constant anyway, the scalar fill arguments of MatCreateBAIJ would do, but
// the per-row arrays are kept alive for potential later optimization.

        for(i=0;i<iownnodes;i++) {
           iodiagnz[i]=1.3*maxd;
           idiagnz[i]=1.3*maxd;
        }
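// For instance, if the densest owned row touches maxd = 60 same-rank block
// columns, every row is preallocated 78 diagonal-block and 78
// off-diagonal-block entries (1.3*maxd, with the double-to-PetscInt
// conversion truncating).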


        if(workfc.numpe < 200){
          printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
          printf("myrank,mind,maxd,mino,maxo %d %lld %lld %lld %lld \n",workfc.myrank,mind,maxd,mino,maxo);
        }
        // Print debug info
        if(nshgt < 200){
          int irank;
          for(irank=0;irank<workfc.numpe;irank++) {
            if(irank == workfc.myrank){
              printf("mbeg,mend,myrank %lld %lld %d \n",mbeg,mend,workfc.myrank);
              for(i=0;i<iownnodes;i++) {
                printf("%d %lld %lld \n",workfc.myrank,(long long)idiagnz[i],(long long)iodiagnz[i]);
              }
            }
            MPI_Barrier(MPI_COMM_WORLD);
          }
        }
       }
       if(firstpetsccall == 1) {

        petsc_bs = (PetscInt) nflow;
        petsc_m  = (PetscInt) nflow * (PetscInt) iownnodes;
        petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
        petsc_PA = (PetscInt) 40;
        // TODO: fix nshgt for 64-bit integers!!!!!

        //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
        //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,

// no preallocation: ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
//                                        0, NULL, 0, NULL, &lhsP);

// the next call does preallocate

//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before MatCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);

        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
                             0, idiagnz, 0, iodiagnz, &lhsP);

//                           petsc_PA, NULL, petsc_PA, NULL, &lhsP);
//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before MatSetOption 1  \n");
        ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);

// Next is Jed Brown's improvement: imprint the assembly to make that stage scalable after the first call
#ifdef JEDBROWN
        ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
#endif
        ierr = MatSetUp(lhsP);

        PetscInt myMatStart, myMatEnd;
        ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
      }
      // the preallocation hints were copied by MatCreateBAIJ; release them
      free(idiagnz);
      free(iodiagnz);
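
// With block size nflow, PETSc stores a dense nflow-by-nflow block per pair
// of adjacent nodes, so idiagnz/iodiagnz above count node-adjacency blocks
// per owned block row, not scalar nonzeros.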
//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
      if(genpar.lhs == 1) ierr = MatZeroEntries(lhsP);

      get_time(duration, (duration+1));

//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before elmgmr  \n");
//      for (i=0; i<nshg ; i++) {
//            assert(fncorp[i]>0);
//      }

      ElmGMRPETSc(y,             ac,            x,
                  shp,           shgl,          iBC,
                  BC,            shpb,
                  shglb,         res,
                  rmes,          iper,
                  ilwork,        rerr,          &lhsP);
      get_time((duration+2), (duration+3));
      get_max_time_diff((duration), (duration+2),
                        (duration+1), (duration+3),
                        "ElmGMRPETSc \0");  // char(0))
//       printf("after ElmGMRPETSc\n");
       if(firstpetsccall == 1) {
      // Set up the index sets. For now, we mimic the vector insertion
      // procedure; since we always reference by global indices this only
      // matters for cache performance.
      // TODO: Better arrangement?
        nodetoinsert = 0;
        k=0;
        if(workfc.numpe > 1) {
          for (i=0; i<nshg ; i++) {
            nodetoinsert = fncorp[i]-1;
//            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
//            assert(fncorp[i]>=0);
            for (j=1; j<=nflow; j++) {
              indexsetary[k] = nodetoinsert*nflow+(j-1);
              assert(indexsetary[k]>=0);
//              assert(fncorp[i]>=0);
              k = k+1;
            }
          }
        }
        else {
          for (i=0; i<nshg ; i++) {
            nodetoinsert = i;
            for (j=1; j<=nflow; j++) {
              indexsetary[k] = nodetoinsert*nflow+(j-1);
              k = k+1;
            }
          }
        }

        for (i=0; i<nshg*nflow; i++) {
          gblindexsetary[i] = i; //TODO: better option for performance?
        }
//  Create Vector Index Maps
        petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
        //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
        //       PETSC_COPY_VALUES, &LocalIndexSet);
        ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
               PETSC_COPY_VALUES, &LocalIndexSet);
//        printf("after 1st ISCreateGeneral %d\n", ierr);
        //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
        //       PETSC_COPY_VALUES, &GlobalIndexSet);
        ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
               PETSC_COPY_VALUES, &GlobalIndexSet);
//        printf("after 2nd ISCreateGeneral %d\n", ierr);
        ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
//        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
        ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
//        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
      }
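// LocalIndexSet holds, for each of this rank's nshg nodes, the global PETSc
// row of each flow variable; GlobalIndexSet is just 0..nshg*nflow-1. The
// scatter built from this pair later pulls the rank's (owned and shared)
// solution entries out of the distributed vector into sequential storage.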
      if(genpar.lhs == 1) {
        get_time((duration), (duration+1));
        ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
        ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
        get_time((duration+2),(duration+3));
        get_max_time_diff((duration), (duration+2),
                          (duration+1), (duration+3),
                          "MatAssembly \0"); // char(0))
        get_time(duration, (duration+1));
      }
      if(firstpetsccall == 1) {
        ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
//  TODO: should this be LocalRow or LocalCol, or is LocalRow always == LocalCol?
        //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
        //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
      }
      ierr = VecZeroEntries(resP);
      ierr = VecZeroEntries(DyP);
      if(firstpetsccall == 1) {
        //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
        //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
        ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
      }
      ierr = VecZeroEntries(resPGlobal);
//         call VecZeroEntries(DyPLocal, ierr)

      PetscRow=0;
      k = 0;
      int index;
      for (i=0; i<nshg; i++ ){
        for (j = 1; j<=nflow; j++){
          index = i + (j-1)*nshg;  // PHASTA stores res column-major: res(node,var)
          valtoinsert = res[index];
          if(workfc.numpe > 1) {
            PetscRow = (fncorp[i]-1)*nflow+(j-1);
            assert(fncorp[i]<=nshgt);
            assert(fncorp[i]>0);
          }
          else {
            PetscRow = i*nflow+(j-1);
          }
          assert(PetscRow>=0);
          assert(PetscRow<nshgt*nflow);
          ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
        }
      }
//      printf("after VecSetValue loop %d\n",ierr);
      ierr = VecAssemblyBegin(resP);
      ierr = VecAssemblyEnd(resP);
      get_time((duration+2), (duration+3));
      get_max_time_diff((duration), (duration+2),
                        (duration+1), (duration+3),
                        "VectorWorkPre \0"); // char(0))

      get_time((duration),(duration+1));

      if(firstpetsccall == 1) {
        ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
        ierr = KSPSetOperators(ksp, lhsP, lhsP);
// PETSc 3.4: ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
        ierr = KSPGetPC(ksp, &pc);
        ierr = PCSetType(pc, PCPBJACOBI);
        PetscInt maxits;
        maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace;
        //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
        ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
        ierr = KSPSetFromOptions(ksp);
      }
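// PCPBJACOBI inverts the dense nflow-by-nflow diagonal blocks, matching the
// BAIJ layout. Since KSPSetFromOptions runs last, the hard-wired choices can
// be overridden at run time; roughly equivalent options would be, e.g.,
//   -ksp_type gmres -pc_type pbjacobi -ksp_rtol <etol> -ksp_max_it <nGMRES*Kspace>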
      ierr = KSPSolve(ksp, resP, DyP);
      get_time((duration+2),(duration+3));
      get_max_time_diff((duration), (duration+2),
                        (duration+1), (duration+3),
                        "KSPSolve \0"); // char(0))
     //if(myrank .eq. 0) then
     //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
     //end if
      get_time((duration),(duration+1));
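
// Scatter the distributed solution into this rank's sequential DyPLocal,
// then unflatten it into PHASTA's column-major Dy(node,var) layout below.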
      if(firstpetsccall == 1) {
        ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
      }
      ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
      ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
      ierr = VecGetArray(DyPLocal, &parray);
      PetscRow = 0;
      for (node = 0; node < nshg; node++) {
        for (var = 1; var <= nflow; var++) {
          index = node + (var-1)*nshg;
          Dy[index] = parray[PetscRow];
          PetscRow = PetscRow+1;
        }
      }
      ierr = VecRestoreArray(DyPLocal, &parray);

      firstpetsccall = 0;
      // later timer ('Solver  ');

// .... output the statistics
//
      itrpar.iKs=0; // see rstat()
      PetscInt its;
      //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
      ierr = KSPGetIterationNumber(ksp, &its);
      itrpar.iKs = (int) its;
      get_time((duration+2),(duration+3));
      get_max_time_diff((duration), (duration+2),
                        (duration+1), (duration+3),
                        "solWork \0"); // char(0))
      itrpar.ntotGM += (int) its;
      rstat (res, ilwork);
//
// .... stop the timer
//
// 3002 continue                  ! no solve just res.
      // later call timer ('Back    ')
//
// .... end
//
}

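/* Illustrative sketch only (nothing calls it): the flattening used in both
   directions above. PHASTA indexes res/Dy column-major as (node,var); the
   PETSc vectors interleave the nflow variables of each node, numbered by
   the 1-based global node ids in fncorp. */
static inline PetscInt phastaToPetscRow(const gcorp_t* fncorp_, int node,
                                        int var1 /* 1-based */, int nflow_) {
  return ((PetscInt)(fncorp_[node] - 1)) * nflow_ + (var1 - 1);
}
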
void     SolGMRpSclr(double* y,         double* ac,
     	double* x,      double* elDw,   int* iBC,       double* BC,
     	int* col,       int* row,
     	int* iper,
     	int* ilwork,    double* shp,       double* shgl,      double* shpb,
     	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
{
//
// ----------------------------------------------------------------------
//
//  This is the preconditioned GMRES driver routine for the scalar solve.
//
// input:
//  y      (nshg,ndof)           : Y-variables at n+alpha_v
//  ac     (nshg,ndof)           : primitive-variable acceleration at n+alpha_m
//  yold   (nshg,ndof)           : Y-variables at beginning of step
//  acold  (nshg,ndof)           : primitive-variable acceleration at beginning of step
//  x      (numnp,nsd)           : node coordinates
//  iBC    (nshg)                : BC codes
//  BC     (nshg,ndofBC)         : BC constraint parameters
//  shp(b) (nen,maxsh,melCat)    : element shape functions (boundary)
//  shgl(b)(nsd,nen,maxsh,melCat): local gradients of shape functions
//
// ----------------------------------------------------------------------
//


// Get variables from common_c.h
      int nshg, nflow, nsd, iownnodes;
      nshg  = conpar.nshg;
      nflow = conpar.nflow; // needed by the idflx computation below
      nsd = NSD;
      iownnodes = conpar.iownnodes;

      gcorp_t nshgt;
      gcorp_t mbeg;
      gcorp_t mend;
      nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
      mbeg = (gcorp_t) newdim.minowned;
      mend = (gcorp_t) newdim.maxowned;

      int node, element, var, eqn;
      double valtoinsert;
      int nenl, iel, lelCat, lcsyst, iorder, nshl;
      int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
      double DyFlats[nshg];
      double DyFlatPhastas[nshg];
      double rmes[nshg];
// DEBUG
      int i,j,k,l,m;
      PetscInt indexsetarys[nshg];
      PetscInt gblindexsetarys[nshg];
      PetscInt nodetoinsert;

      double  real_rtol, real_abstol, real_dtol;
      double *parray; // Should be a PetscScalar
      PetscInt petsc_bs, petsc_m, petsc_M, petsc_PA;
      PetscInt petsc_n;
      PetscOne = 1;
      uint64_t duration[4];

//
//
// .... *******************>> Element Data Formation <<******************
//
//
// .... set the parameters for flux and surface tension calculations
//
//
      genpar.idflx = 0 ;
      if(genpar.idiff >= 1)  genpar.idflx = genpar.idflx + (nflow-1) * nsd;
      if (genpar.isurf == 1) genpar.idflx = genpar.idflx + nsd;


      PetscInt* idiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
      PetscInt* iodiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
      gcorp_t glbNZ;

      if(firstpetsccalls == 1) {
        for(i=0;i<iownnodes;i++) {
             idiagnz[i]=0;
             iodiagnz[i]=0;
        }
        i=0;
        for(k=0;k<nshg;k++) {
          if((fncorp[k] < mbeg) || (fncorp[k] > mend)){
// this node is not owned by this rank so we skip
          } else {
           for(j=col[i]-1;j<col[i+1]-1;j++) {
             glbNZ=fncorp[row[j]];
             if((glbNZ < mbeg) || (glbNZ > mend)) {
                iodiagnz[i]++;
             } else {
                idiagnz[i]++;
             }
           }
           i++;
          }
        }
        gcorp_t mind=1000;
        gcorp_t mino=1000;
        gcorp_t maxd=0;
        gcorp_t maxo=0;
        for(i=0;i<iownnodes;i++) {
           mind=min(mind,idiagnz[i]);
           mino=min(mino,iodiagnz[i]);
           maxd=max(maxd,idiagnz[i]);
           maxo=max(maxo,iodiagnz[i]);
        }

        for(i=0;i<iownnodes;i++) {
           iodiagnz[i]=1.3*maxd;
           idiagnz[i]=1.3*maxd;
        }

       }
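// Same preallocation estimate as the flow solve above, but counted in
// scalar entries (one unknown per node) rather than nflow-wide blocks.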
       if(firstpetsccalls == 1) {

        petsc_m  = (PetscInt) iownnodes;
        petsc_M  = (PetscInt) nshgt;
        petsc_PA = (PetscInt) 40;

        ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
                            0, idiagnz, 0, iodiagnz, &lhsPs);

        ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);

// Next is Jed Brown's improvement: imprint the assembly to make that stage scalable after the first call
#ifdef JEDBROWN
        ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
#endif
        ierr = MatSetUp(lhsPs);

        PetscInt myMatStart, myMatEnd;
        ierr = MatGetOwnershipRange(lhsPs, &myMatStart, &myMatEnd);
      }
      // the preallocation hints were copied by MatCreateAIJ; release them
      free(idiagnz);
      free(iodiagnz);
//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
      if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);

      get_time(duration, (duration+1));

//      MPI_Barrier(MPI_COMM_WORLD);
//      if(workfc.myrank ==0) printf("Before elmgmr  \n");
//      for (i=0; i<nshg ; i++) {
//            assert(fncorp[i]>0);
//      }

      ElmGMRPETScSclr(y,             ac,            x, elDw,
                  shp,           shgl,          iBC,
                  BC,            shpb,
                  shglb,         res,
                  rmes,          iper,
                  ilwork,        &lhsPs);
       if(firstpetsccalls == 1) {
      // Set up the index sets. For now, we mimic the vector insertion
      // procedure; since we always reference by global indices this only
      // matters for cache performance.
      // TODO: Better arrangement?
        nodetoinsert = 0;
        k=0;
        if(workfc.numpe > 1) {
          for (i=0; i<nshg ; i++) {
            nodetoinsert = fncorp[i]-1;
            indexsetarys[k] = nodetoinsert;
            k = k+1;
          }
        }
        else {
          for (i=0; i<nshg ; i++) {
            nodetoinsert = i;
            indexsetarys[k] = nodetoinsert;
            k = k+1;
          }
        }

        for (i=0; i<nshg; i++) {
          gblindexsetarys[i] = i; //TODO: better option for performance?
        }
//  Create Vector Index Maps
        petsc_n  = (PetscInt) nshg;
        ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
               PETSC_COPY_VALUES, &LocalIndexSets);
        ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
               PETSC_COPY_VALUES, &GlobalIndexSets);
        ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
        ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
      }
      if(genpar.lhs == 1) {
        ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
        ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
      }
      if(firstpetsccalls == 1) {
        ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
      }
      ierr = VecZeroEntries(resPs);
      ierr = VecZeroEntries(DyPs);
      if(firstpetsccalls == 1) {
        ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
        ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
      }
      ierr = VecZeroEntries(resPGlobals);

      PetscRow=0;
      k = 0;
      int index;
      for (i=0; i<nshg; i++ ){
          valtoinsert = res[i];
          if(workfc.numpe > 1) {
            PetscRow = (fncorp[i]-1);
            assert(fncorp[i]<=nshgt);
            assert(fncorp[i]>0);
          }
          else {
            PetscRow = i;  // 0-based row for the serial case
          }
          assert(PetscRow>=0);
          assert(PetscRow<nshgt);
          ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
      }
//      printf("after VecSetValue loop %d\n",ierr);
      ierr = VecAssemblyBegin(resPs);
      ierr = VecAssemblyEnd(resPs);

      if(firstpetsccalls == 1) {
        ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
        ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
// PETSc 3.4: ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
        ierr = KSPGetPC(ksps, &pcs);
        ierr = PCSetType(pcs, PCPBJACOBI);
        PetscInt maxits;
        maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace;
        //ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
        ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
        ierr = KSPSetFromOptions(ksps);
      }
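// On the AIJ matrix the point-block size is 1, so PCPBJACOBI here reduces
// to plain (point) Jacobi.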
      ierr = KSPSolve(ksps, resPs, DyPs);
      if(firstpetsccalls == 1) {
        ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
      }
      ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
      ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
      ierr = VecGetArray(DyPLocals, &parray);
      PetscRow = 0;
      for (node = 0; node < nshg; node++) {
          index = node;
          Dy[index] = parray[PetscRow];
          PetscRow = PetscRow+1;
      }
      ierr = VecRestoreArray(DyPLocals, &parray);

      firstpetsccalls = 0;

// .... output the statistics
//
//      itrpar.iKss=0; // see rstatSclr()
      PetscInt its;
      ierr = KSPGetIterationNumber(ksps, &its);
      itrpar.iKss = (int) its;
      itrpar.ntotGMs += (int) its;
      rstatSclr (res, ilwork);
//
// .... end
//
}
#endif