xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision 79f1763e38fe03466335328e9c0a1a0a45544a96)
1 #ifdef HAVE_PETSC
2 #include <stdio.h>
3 
4 #include "petscsys.h"
5 #include "petscvec.h"
6 #include "petscmat.h"
7 #include "petscpc.h"
8 #include "petscksp.h"
9 
10 #include "assert.h"
11 #include "common_c.h"
12 
13 #include <FCMangle.h>
14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
20 #define min(a,b) (((a) < (b)) ? (a) : (b))
21 #define max(a,b) (((a) > (b)) ? (a) : (b))
22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
24 
25 typedef long long int gcorp_t;
26 
27 void get_time(uint64_t* rv, uint64_t* cycle);
28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
29 
30 //#include "auxmpi.h"
31 //      static PetscOffset poff;
32 
33       static Mat lhsP;
34       static PC pc;
35       static KSP ksp;
36       static Vec DyP, resP, DyPLocal, resPGlobal;
37       static PetscErrorCode ierr;
38       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
39       static IS LocalIndexSet, GlobalIndexSet;
40       static ISLocalToGlobalMapping VectorMapping;
41       static ISLocalToGlobalMapping GblVectorMapping;
42       static  VecScatter scatter7;
43       static int firstpetsccall = 1;
44 
45       static Mat lhsPs;
46       static PC pcs;
47       static KSP ksps;
48       static Vec DyPs, resPs, DyPLocals, resPGlobals;
49       static IS LocalIndexSets, GlobalIndexSets;
50       static ISLocalToGlobalMapping VectorMappings;
51       static ISLocalToGlobalMapping GblVectorMappings;
52       static VecScatter scatter7s;
53       static int firstpetsccalls = 1;
54 
55 void     SolGMRp(double* y,         double* ac,        double* yold,
56      	double* x,         int* iBC,       double* BC,
57      	int* col,       int* row,       double* lhsk,
58      	double* res,       double* BDiag,
59      	int* iper,
60      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
61      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
62 {
63 //
64 // ----------------------------------------------------------------------
65 //
66 //  This is the preconditioned GMRES driver routine.
67 //
68 // input:
69 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
70 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
71 //  yold   (nshg,ndof)           : Y-variables at beginning of step
72 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
73 //  x      (numnp,nsd)            : node coordinates
74 //  iBC    (nshg)                : BC codes
75 //  BC     (nshg,ndofBC)         : BC constraint parameters
76 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
77 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
78 //
79 // output:
80 //  res    (nshg,nflow)           : preconditioned residual
81 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
82 //
83 //
84 // Zdenek Johan,  Winter 1991.  (Fortran 90)
85 // ----------------------------------------------------------------------
86 //
87 
88 
89 // Get variables from common_c.h
90       int nshg, nflow, nsd, iownnodes;
91       nshg  = conpar.nshg;
92       nflow = conpar.nflow;
93       nsd = NSD;
94       iownnodes = conpar.iownnodes;
95 
96       gcorp_t nshgt;
97       gcorp_t mbeg;
98       gcorp_t mend;
99       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
100       mbeg = (gcorp_t) newdim.minowned;
101       mend = (gcorp_t) newdim.maxowned;
102 
103 
104       int node, element, var, eqn;
105       double valtoinsert;
106       int nenl, iel, lelCat, lcsyst, iorder, nshl;
107       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
108       double DyFlat[nshg*nflow];
109       double DyFlatPhasta[nshg*nflow];
110       double rmes[nshg*nflow];
111 // DEBUG
112       int i,j,k,l,m;
113       //int indexsetary[nflow*nshg];
114       //int gblindexsetary[nflow*nshg];
115       PetscInt indexsetary[nflow*nshg];
116       PetscInt gblindexsetary[nflow*nshg];
117       PetscInt nodetoinsert;
118 
119       // FIXME: PetscScalar
120       double  real_rtol, real_abstol, real_dtol;
121 // /DEBUG
122       //double parray[1]; // Should be a PetscScalar
123       double *parray; // Should be a PetscScalar
124       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
125       PetscInt petsc_n;
126       PetscOne = 1;
127       uint64_t duration[4];
128 
129 //      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));
130 
131        if(firstpetsccall == 1) {
132 /*         call get_time(duration(1),duration(2));
133          call system('sleep 10');
134          call get_time(duration(3),duration(4));
135          call get_max_time_diff(duration(1), duration(3),
136                               duration(2), duration(4),
137                               "TenSecondSleep " // char(0))
138          if(myrank .eq. 0) {
139            !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
140          }
141 */
142        }
143 //
144 //
145 // .... *******************>> Element Data Formation <<******************
146 //
147 //
148 // .... set the parameters for flux and surface tension calculations
149 //
150 //
151       genpar.idflx = 0 ;
152       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
153       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
154 
155 
156 
157       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
158       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
159       gcorp_t glbNZ;
160 
161       if(firstpetsccall == 1) {
162         for(i=0;i<iownnodes;i++) {
163              idiagnz[i]=0;
164              iodiagnz[i]=0;
165         }
166         i=0;
167         for(k=0;k<nshg;k++) {
168           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
169 // this node is not owned by this rank so we skip
170           } else {
171            for(j=col[i]-1;j<col[i+1]-1;j++) {
172 //          assert(row[j]<=nshg);
173 //          assert(fncorp[row[j]-1]<=nshgt);
174              glbNZ=fncorp[row[j]-1];
175              if((glbNZ < mbeg) || (glbNZ > mend)) {
176                 iodiagnz[i]++;
177              } else {
178                 idiagnz[i]++;
179              }
180            }
181            i++;
182           }
183         }
184         gcorp_t mind=1000;
185         gcorp_t mino=1000;
186         gcorp_t maxd=0;
187         gcorp_t maxo=0;
188         for(i=0;i<iownnodes;i++) {
189            mind=min(mind,idiagnz[i]);
190            mino=min(mino,iodiagnz[i]);
191            maxd=max(maxd,idiagnz[i]);
192            maxo=max(maxo,iodiagnz[i]);
193 //           iodiagnz[i]=max(iodiagnz[i],10);
194 //           idiagnz[i]=max(idiagnz[i],10);
195 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
196 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
197         }
198 // the above was pretty good but below is faster and not too much more memory...of course once you do this
199 // could just use the constant fill parameters in create but keep it alive for potential later optimization
200 
201         for(i=0;i<iownnodes;i++) {
202            iodiagnz[i]=1.3*maxd;
203            idiagnz[i]=1.3*maxd;
204         }
205 
206 
207 
208         if(workfc.numpe < 200){
209           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
210           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
211         }
212         // Print debug info
213         if(nshgt < 200){
214         int irank;
215         for(irank=0;irank<workfc.numpe;irank++) {
216           if(irank == workfc.myrank){
217             printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
218             for(i=0;i<iownnodes;i++) {
219               printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
220             }
221           }
222          MPI_Barrier(MPI_COMM_WORLD);
223         }
224         }
225        }
226        if(firstpetsccall == 1) {
227 
228         petsc_bs = (PetscInt) nflow;
229         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
230         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
231         petsc_PA  = (PetscInt) 40;
232         // TODO: fixe nshgt for 64 bit integer!!!!!
233 
234         //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
235         //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,
236 
237 // this has no pre-allocate        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
238 // this has no pre-allocate                             0, NULL, 0, NULL, &lhsP);
239 
240 // next 2 do pre-allocate
241 
242 //      MPI_Barrier(MPI_COMM_WORLD);
243 //      if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);
244 
245         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
246                              0, idiagnz, 0, iodiagnz, &lhsP);
247 
248 //                              petsc_PA, NULL, petsc_PA, NULL, &lhsP);
249 //      MPI_Barrier(MPI_COMM_WORLD);
250 //      if(workfc.myrank ==0) printf("Before matSetOoption 1  \n");
251         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
252 
253 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
254 #ifdef JEDBROWN
255         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
256 #endif
257         ierr = MatSetUp(lhsP);
258 
259       PetscInt myMatStart, myMatEnd;
260       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
261       }
262 //      MPI_Barrier(MPI_COMM_WORLD);
263  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
264       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
265 
266       get_time(duration, (duration+1));
267 
268 //      MPI_Barrier(MPI_COMM_WORLD);
269  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
270 //      for (i=0; i<nshg ; i++) {
271 //            assert(fncorp[i]>0);
272 //      }
273 
274       ElmGMRPETSc(y,             ac,            x,
275                   shp,           shgl,          iBC,
276                   BC,            shpb,
277                   shglb,         res,
278                   rmes,                   iper,
279                   ilwork,        rerr ,         &lhsP);
280       get_time((duration+2), (duration+3));
281       get_max_time_diff((duration), (duration+2),
282                         (duration+1), (duration+3),
283                         "ElmGMRPETSc \0");  // char(0))
284 //       printf("after ElmGMRPETSc\n");
285        if(firstpetsccall == 1) {
286       // Setup IndexSet. For now, we mimic vector insertion procedure
287       // Since we always reference by global indexes this doesn't matter
288       // except for cache performance)
289       // TODO: Better arrangment?
290         nodetoinsert = 0;
291         k=0;
292         if(workfc.numpe > 1) {
293           for (i=0; i<nshg ; i++) {
294             nodetoinsert = fncorp[i]-1;
295 //            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
296 //            assert(fncorp[i]>=0);
297             for (j=1; j<=nflow; j++) {
298               indexsetary[k] = nodetoinsert*nflow+(j-1);
299               assert(indexsetary[k]>=0);
300 //              assert(fncorp[i]>=0);
301               k = k+1;
302             }
303           }
304         }
305         else {
306           for (i=0; i<nshg ; i++) {
307             nodetoinsert = i;
308             for (j=1; j<=nflow; j++) {
309               indexsetary[k] = nodetoinsert*nflow+(j-1);
310               k = k+1;
311             }
312           }
313         }
314 
315         for (i=0; i<nshg*nflow; i++) {
316           gblindexsetary[i] = i ; //TODO: better option for performance?
317         }
318 //  Create Vector Index Maps
319         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
320         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
321      	//       PETSC_COPY_VALUES, &LocalIndexSet);
322         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
323      	       PETSC_COPY_VALUES, &LocalIndexSet);
324 //        printf("after 1st ISCreateGeneral %d\n", ierr);
325         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
326         //       PETSC_COPY_VALUES, &GlobalIndexSet);
327         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
328                PETSC_COPY_VALUES, &GlobalIndexSet);
329 //        printf("after 2nd ISCreateGeneral %d\n", ierr);
330         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
331 //        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
332         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
333 //        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
334       }
335       if(genpar.lhs == 1) {
336       get_time((duration), (duration+1));
337       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
338       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
339       get_time((duration+2),(duration+3));
340       get_max_time_diff((duration), (duration+2),
341                         (duration+1), (duration+3),
342                         "MatAssembly \0"); // char(0))
343       get_time(duration, (duration+1));
344       }
345       if(firstpetsccall == 1) {
346         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
347 //  TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol
348         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
349         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
350         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
351         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
352       }
353       ierr = VecZeroEntries(resP);
354       ierr = VecZeroEntries(DyP);
355       if(firstpetsccall == 1) {
356         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
357         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
358         //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
359         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
360       }
361         ierr = VecZeroEntries(resPGlobal);
362 //         call VecZeroEntries(DyPLocal, ierr)
363 
364       PetscRow=0;
365       k = 0;
366       int index;
367       for (i=0; i<nshg; i++ ){
368         for (j = 1; j<=nflow; j++){
369           index = i + (j-1)*nshg;
370           valtoinsert = res[index];
371           if(workfc.numpe > 1) {
372             PetscRow = (fncorp[i]-1)*nflow+(j-1);
373           }
374           else {
375             PetscRow = i*nflow+(j-1);
376           }
377           assert(fncorp[i]<=nshgt);
378           assert(fncorp[i]>0);
379           assert(PetscRow>=0);
380           assert(PetscRow<=nshgt*nflow);
381           ierr =  VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
382         }
383       }
384 //      printf("after VecSetValue loop %d\n",ierr);
385       ierr = VecAssemblyBegin(resP);
386       ierr = VecAssemblyEnd(resP);
387       get_time((duration+2), (duration+3));
388       get_max_time_diff((duration), (duration+2),
389                         (duration+1), (duration+3),
390                         "VectorWorkPre \0"); // char(0))
391 
392       get_time((duration),(duration+1));
393 
394       if(firstpetsccall == 1) {
395         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
396         ierr = KSPSetOperators(ksp, lhsP, lhsP);
397 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
398         ierr = KSPGetPC(ksp, &pc);
399         ierr = PCSetType(pc, PCPBJACOBI);
400         PetscInt maxits;
401         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
402         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
403         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
404         ierr = KSPSetFromOptions(ksp);
405       }
406       ierr = KSPSolve(ksp, resP, DyP);
407       get_time((duration+2),(duration+3));
408       get_max_time_diff((duration), (duration+2),
409                         (duration+1), (duration+3),
410                         "KSPSolve \0"); // char(0))
411      //if(myrank .eq. 0) then
412      //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
413      //end if
414       get_time((duration),(duration+1));
415       if(firstpetsccall == 1) {
416       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
417       }
418       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
419       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
420       ierr = VecGetArray(DyPLocal, &parray);
421       PetscRow = 0;
422       for ( node = 0; node< nshg; node++) {
423         for (var = 1; var<= nflow; var++) {
424           index = node + (var-1)*nshg;
425           Dy[index] = parray[PetscRow];
426           PetscRow = PetscRow+1;
427         }
428       }
429       ierr = VecRestoreArray(DyPLocal, &parray);
430 
431       firstpetsccall = 0;
432       // later timer ('Solver  ');
433 
434 // .... output the statistics
435 //
436       itrpar.iKs=0; // see rstat()
437       PetscInt its;
438       //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
439       ierr = KSPGetIterationNumber(ksp, &its);
440       itrpar.iKs = (int) its;
441       get_time((duration+2),(duration+3));
442       get_max_time_diff((duration), (duration+2),
443                         (duration+1), (duration+3),
444                         "solWork \0"); // char(0))
445       itrpar.ntotGM += (int) its;
446       rstat (res, ilwork);
447 //
448 // .... stop the timer
449 //
450 // 3002 continue                  ! no solve just res.
451       // later call timer ('Back    ')
452 //
453 // .... end
454 //
455 }
456 
457 void     SolGMRpSclr(double* y,         double* ac,
458      	double* x,      double* elDw,   int* iBC,       double* BC,
459      	int* col,       int* row,
460      	int* iper,
461      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
462      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
463 {
464 //
465 // ----------------------------------------------------------------------
466 //
467 //  This is the preconditioned GMRES driver routine.
468 //
469 // input:
470 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
471 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
472 //  yold   (nshg,ndof)           : Y-variables at beginning of step
473 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
474 //  x      (numnp,nsd)            : node coordinates
475 //  iBC    (nshg)                : BC codes
476 //  BC     (nshg,ndofBC)         : BC constraint parameters
477 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
478 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
479 //
480 // ----------------------------------------------------------------------
481 //
482 
483 
484 // Get variables from common_c.h
485       int nshg, nflow, nsd, iownnodes;
486       nshg  = conpar.nshg;
487       nsd = NSD;
488       iownnodes = conpar.iownnodes;
489 
490       gcorp_t nshgt;
491       gcorp_t mbeg;
492       gcorp_t mend;
493       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
494       mbeg = (gcorp_t) newdim.minowned;
495       mend = (gcorp_t) newdim.maxowned;
496 
497 
498       int node, element, var, eqn;
499       double valtoinsert;
500       int nenl, iel, lelCat, lcsyst, iorder, nshl;
501       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
502       double DyFlats[nshg];
503       double DyFlatPhastas[nshg];
504       double rmes[nshg];
505 // DEBUG
506       int i,j,k,l,m;
507       PetscInt indexsetarys[nshg];
508       PetscInt gblindexsetarys[nshg];
509       PetscInt nodetoinsert;
510 
511       double  real_rtol, real_abstol, real_dtol;
512       double *parray;
513       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
514       PetscInt petsc_n;
515       PetscOne = 1;
516       uint64_t duration[4];
517 
518 
519 //
520 //
521 // .... *******************>> Element Data Formation <<******************
522 //
523 //
524 // .... set the parameters for flux and surface tension calculations
525 //
526 //
527       genpar.idflx = 0 ;
528       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
529       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
530 
531 
532 
533       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
534       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
535       gcorp_t glbNZ;
536 
537       if(firstpetsccalls == 1) {
538         for(i=0;i<iownnodes;i++) {
539              idiagnz[i]=0;
540              iodiagnz[i]=0;
541         }
542         i=0;
543         for(k=0;k<nshg;k++) {
544           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
545 // this node is not owned by this rank so we skip
546           } else {
547            for(j=col[i]-1;j<col[i+1]-1;j++) {
548              glbNZ=fncorp[row[j]];
549              if((glbNZ < mbeg) || (glbNZ > mend)) {
550                 iodiagnz[i]++;
551              } else {
552                 idiagnz[i]++;
553              }
554            }
555            i++;
556           }
557         }
558         gcorp_t mind=1000;
559         gcorp_t mino=1000;
560         gcorp_t maxd=0;
561         gcorp_t maxo=0;
562         for(i=0;i<iownnodes;i++) {
563            mind=min(mind,idiagnz[i]);
564            mino=min(mino,iodiagnz[i]);
565            maxd=max(maxd,idiagnz[i]);
566            maxo=max(maxo,iodiagnz[i]);
567         }
568 
569         for(i=0;i<iownnodes;i++) {
570            iodiagnz[i]=1.3*maxd;
571            idiagnz[i]=1.3*maxd;
572         }
573 
574 
575 
576        }
577        if(firstpetsccalls == 1) {
578 
579         petsc_m  = (PetscInt) iownnodes;
580         petsc_M  = (PetscInt) nshgt;
581         petsc_PA  = (PetscInt) 40;
582 
583         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
584                              0, idiagnz, 0, iodiagnz, &lhsPs);
585 
586         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
587 
588 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
589 #ifdef JEDBROWN
590         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
591 #endif
592         ierr = MatSetUp(lhsPs);
593 
594       PetscInt myMatStart, myMatEnd;
595       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
596       }
597 //      MPI_Barrier(MPI_COMM_WORLD);
598  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
599       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
600 
601       get_time(duration, (duration+1));
602 
603 //      MPI_Barrier(MPI_COMM_WORLD);
604  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
605 //      for (i=0; i<nshg ; i++) {
606 //            assert(fncorp[i]>0);
607 //      }
608 
609       ElmGMRPETScSclr(y,             ac,            x, elDw,
610                   shp,           shgl,          iBC,
611                   BC,            shpb,
612                   shglb,         res,
613                   rmes,                   iper,
614                   ilwork,        &lhsPs);
615        if(firstpetsccalls == 1) {
616       // Setup IndexSet. For now, we mimic vector insertion procedure
617       // Since we always reference by global indexes this doesn't matter
618       // except for cache performance)
619       // TODO: Better arrangment?
620         nodetoinsert = 0;
621         k=0;
622         if(workfc.numpe > 1) {
623           for (i=0; i<nshg ; i++) {
624             nodetoinsert = fncorp[i]-1;
625             indexsetarys[k] = nodetoinsert;
626             k = k+1;
627           }
628         }
629         else {
630           for (i=0; i<nshg ; i++) {
631             nodetoinsert = i;
632             indexsetarys[k] = nodetoinsert;
633             k = k+1;
634           }
635         }
636 
637         for (i=0; i<nshg; i++) {
638           gblindexsetarys[i] = i ; //TODO: better option for performance?
639         }
640 //  Create Vector Index Maps
641         petsc_n  = (PetscInt) nshg;
642         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
643      	       PETSC_COPY_VALUES, &LocalIndexSets);
644         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
645                PETSC_COPY_VALUES, &GlobalIndexSets);
646         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
647         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
648       }
649       if(genpar.lhs ==1) {
650       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
651       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
652       }
653       if(firstpetsccalls == 1) {
654         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
655         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
656         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
657       }
658       ierr = VecZeroEntries(resPs);
659       ierr = VecZeroEntries(DyPs);
660       if(firstpetsccalls == 1) {
661         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
662         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
663       }
664         ierr = VecZeroEntries(resPGlobals);
665 
666       PetscRow=0;
667       k = 0;
668       int index;
669       for (i=0; i<nshg; i++ ){
670           valtoinsert = res[i];
671           if(workfc.numpe > 1) {
672             PetscRow = (fncorp[i]-1);
673           }
674           else {
675             PetscRow = i-1;
676           }
677           assert(fncorp[i]<=nshgt);
678           assert(fncorp[i]>0);
679           assert(PetscRow>=0);
680           assert(PetscRow<=nshgt);
681           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
682       }
683 //      printf("after VecSetValue loop %d\n",ierr);
684       ierr = VecAssemblyBegin(resPs);
685       ierr = VecAssemblyEnd(resPs);
686 
687       if(firstpetsccalls == 1) {
688         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
689         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
690 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
691         ierr = KSPGetPC(ksps, &pcs);
692         ierr = PCSetType(pcs, PCPBJACOBI);
693         PetscInt maxits;
694         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
695         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
696         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
697         ierr = KSPSetFromOptions(ksps);
698       }
699       ierr = KSPSolve(ksps, resPs, DyPs);
700       if(firstpetsccalls == 1) {
701       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
702       }
703       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
704       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
705       ierr = VecGetArray(DyPLocals, &parray);
706       PetscRow = 0;
707       for ( node = 0; node< nshg; node++) {
708           index = node;
709           Dy[index] = parray[PetscRow];
710           PetscRow = PetscRow+1;
711       }
712       ierr = VecRestoreArray(DyPLocals, &parray);
713 
714       firstpetsccalls = 0;
715 
716 // .... output the statistics
717 //
718 //      itrpar.iKss=0; // see rstat()
719       PetscInt its;
720       ierr = KSPGetIterationNumber(ksps, &its);
721       itrpar.iKss = (int) its;
722       itrpar.ntotGMs += (int) its;
723       rstatSclr (res, ilwork);
724 //
725 // .... end
726 //
727 }
728 #endif
729