xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision 956f980fcd26e82f186af0835a4a92fafc91dc7e)
1 #ifdef HAVE_PETSC
2 #include <stdio.h>
3 
4 #include "petscsys.h"
5 #include "petscvec.h"
6 #include "petscmat.h"
7 #include "petscpc.h"
8 #include "petscksp.h"
9 
10 #include "assert.h"
11 #include "common_c.h"
12 
13 #include <FCMangle.h>
14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
20 #define min(a,b) (((a) < (b)) ? (a) : (b))
21 #define max(a,b) (((a) > (b)) ? (a) : (b))
22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
24 
25 typedef long long int gcorp_t;
26 
27 void get_time(uint64_t* rv, uint64_t* cycle);
28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
29 
30 //#include "auxmpi.h"
31 //      static PetscOffset poff;
32 
33       static Mat lhsP;
34       static PC pc;
35       static KSP ksp;
36       static Vec DyP, resP, DyPLocal, resPGlobal;
37       static PetscErrorCode ierr;
38       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
39       static IS LocalIndexSet, GlobalIndexSet;
40       static ISLocalToGlobalMapping VectorMapping;
41       static ISLocalToGlobalMapping GblVectorMapping;
42       static  VecScatter scatter7;
43       static int firstpetsccall = 1;
44 
45       static Mat lhsPs;
46       static PC pcs;
47       static KSP ksps;
48       static Vec DyPs, resPs, DyPLocals, resPGlobals;
49       static IS LocalIndexSets, GlobalIndexSets;
50       static ISLocalToGlobalMapping VectorMappings;
51       static ISLocalToGlobalMapping GblVectorMappings;
52       static VecScatter scatter7s;
53       static int firstpetsccalls = 1;
54 
55 void     SolGMRp(double* y,         double* ac,        double* yold,
56      	double* x,         int* iBC,       double* BC,
57      	int* col,       int* row,       double* lhsk,
58      	double* res,       double* BDiag,
59      	int* iper,
60      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
61      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
62 {
63 //
64 // ----------------------------------------------------------------------
65 //
66 //  This is the preconditioned GMRES driver routine.
67 //
68 // input:
69 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
70 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
71 //  yold   (nshg,ndof)           : Y-variables at beginning of step
72 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
73 //  x      (numnp,nsd)            : node coordinates
74 //  iBC    (nshg)                : BC codes
75 //  BC     (nshg,ndofBC)         : BC constraint parameters
76 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
77 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
78 //
79 // output:
80 //  res    (nshg,nflow)           : preconditioned residual
81 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
82 //
83 //
84 // Zdenek Johan,  Winter 1991.  (Fortran 90)
85 // ----------------------------------------------------------------------
86 //
87 
88 
89 // Get variables from common_c.h
90       int nshg, nflow, nsd, iownnodes;
91       nshg  = conpar.nshg;
92       nflow = conpar.nflow;
93       nsd = NSD;
94       iownnodes = conpar.iownnodes;
95 
96       gcorp_t nshgt;
97       gcorp_t mbeg;
98       gcorp_t mend;
99       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
100       mbeg = (gcorp_t) newdim.minowned;
101       mend = (gcorp_t) newdim.maxowned;
102 
103 
104       int node, element, var, eqn;
105       double valtoinsert;
106       int nenl, iel, lelCat, lcsyst, iorder, nshl;
107       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
108       double DyFlat[nshg*nflow];
109       double DyFlatPhasta[nshg*nflow];
110       double rmes[nshg*nflow];
111 // DEBUG
112       int i,j,k,l,m;
113 
114       // FIXME: PetscScalar
115       double  real_rtol, real_abstol, real_dtol;
116 // /DEBUG
117       //double parray[1]; // Should be a PetscScalar
118       double *parray; // Should be a PetscScalar
119       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
120       PetscInt petsc_n;
121       PetscOne = 1;
122       uint64_t duration[4];
123 
124 //      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));
125 
126        if(firstpetsccall == 1) {
127 /*         call get_time(duration(1),duration(2));
128          call system('sleep 10');
129          call get_time(duration(3),duration(4));
130          call get_max_time_diff(duration(1), duration(3),
131                               duration(2), duration(4),
132                               "TenSecondSleep " // char(0))
133          if(myrank .eq. 0) {
134            !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
135          }
136 */
137        }
138 //
139 //
140 // .... *******************>> Element Data Formation <<******************
141 //
142 //
143 // .... set the parameters for flux and surface tension calculations
144 //
145 //
146       genpar.idflx = 0 ;
147       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
148       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
149 
150 
151 
152       gcorp_t glbNZ;
153 
154       if(firstpetsccall == 1) {
155         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
156         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
157         for(i=0;i<iownnodes;i++) {
158              idiagnz[i]=0;
159              iodiagnz[i]=0;
160         }
161         i=0;
162         for(k=0;k<nshg;k++) {
163           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
164 // this node is not owned by this rank so we skip
165           } else {
166            for(j=col[i]-1;j<col[i+1]-1;j++) {
167 //          assert(row[j]<=nshg);
168 //          assert(fncorp[row[j]-1]<=nshgt);
169              glbNZ=fncorp[row[j]-1];
170              if((glbNZ < mbeg) || (glbNZ > mend)) {
171                 iodiagnz[i]++;
172              } else {
173                 idiagnz[i]++;
174              }
175            }
176            i++;
177           }
178         }
179         gcorp_t mind=1000;
180         gcorp_t mino=1000;
181         gcorp_t maxd=0;
182         gcorp_t maxo=0;
183         for(i=0;i<iownnodes;i++) {
184            mind=min(mind,idiagnz[i]);
185            mino=min(mino,iodiagnz[i]);
186            maxd=max(maxd,idiagnz[i]);
187            maxo=max(maxo,iodiagnz[i]);
188 //           iodiagnz[i]=max(iodiagnz[i],10);
189 //           idiagnz[i]=max(idiagnz[i],10);
190 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
191 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
192         }
193 // the above was pretty good but below is faster and not too much more memory...of course once you do this
194 // could just use the constant fill parameters in create but keep it alive for potential later optimization
195 
196         for(i=0;i<iownnodes;i++) {
197            iodiagnz[i]=1.3*maxd;
198            idiagnz[i]=1.3*maxd;
199         }
200 
201 
202 
203         if(workfc.numpe < 200){
204           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
205           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
206         }
207         // Print debug info
208         if(nshgt < 200){
209         int irank;
210         for(irank=0;irank<workfc.numpe;irank++) {
211           if(irank == workfc.myrank){
212             printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
213             for(i=0;i<iownnodes;i++) {
214               printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
215             }
216           }
217          MPI_Barrier(MPI_COMM_WORLD);
218         }
219         }
220 //       }
221 //       if(firstpetsccall == 1) {
222 
223         petsc_bs = (PetscInt) nflow;
224         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
225         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
226         petsc_PA  = (PetscInt) 40;
227         // TODO: fixe nshgt for 64 bit integer!!!!!
228 
229         //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
230         //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,
231 
232 // this has no pre-allocate        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
233 // this has no pre-allocate                             0, NULL, 0, NULL, &lhsP);
234 
235 // next 2 do pre-allocate
236 
237 //      MPI_Barrier(MPI_COMM_WORLD);
238 //      if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);
239 
240         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
241                              0, idiagnz, 0, iodiagnz, &lhsP);
242 
243 //                              petsc_PA, NULL, petsc_PA, NULL, &lhsP);
244 //      MPI_Barrier(MPI_COMM_WORLD);
245 //      if(workfc.myrank ==0) printf("Before matSetOoption 1  \n");
246         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
247 
248 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
249 #ifdef JEDBROWN
250         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
251 #endif
252         ierr = MatSetUp(lhsP);
253 
254       PetscInt myMatStart, myMatEnd;
255       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
256       }
257 //      MPI_Barrier(MPI_COMM_WORLD);
258  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
259       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
260 
261       get_time(duration, (duration+1));
262 
263 //      MPI_Barrier(MPI_COMM_WORLD);
264  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
265 //      for (i=0; i<nshg ; i++) {
266 //            assert(fncorp[i]>0);
267 //      }
268 
269       ElmGMRPETSc(y,             ac,            x,
270                   shp,           shgl,          iBC,
271                   BC,            shpb,
272                   shglb,         res,
273                   rmes,                   iper,
274                   ilwork,        rerr ,         &lhsP);
275       get_time((duration+2), (duration+3));
276       get_max_time_diff((duration), (duration+2),
277                         (duration+1), (duration+3),
278                         "ElmGMRPETSc \0");  // char(0))
279 //       printf("after ElmGMRPETSc\n");
280        if(firstpetsccall == 1) {
281       // Setup IndexSet. For now, we mimic vector insertion procedure
282       // Since we always reference by global indexes this doesn't matter
283       // except for cache performance)
284       // TODO: Better arrangment?
285       //PetscInt indexsetary[nflow*nshg];
286       PetscInt* indexsetary = malloc(sizeof(PetscInt)*nflow*nshg);
287       PetscInt* gblindexsetary = malloc(sizeof(PetscInt)*nflow*nshg);
288       PetscInt nodetoinsert;
289         nodetoinsert = 0;
290         k=0;
291         if(workfc.numpe > 1) {
292           for (i=0; i<nshg ; i++) {
293             nodetoinsert = fncorp[i]-1;
294 //            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
295 //            assert(fncorp[i]>=0);
296             for (j=1; j<=nflow; j++) {
297               indexsetary[k] = nodetoinsert*nflow+(j-1);
298               assert(indexsetary[k]>=0);
299 //              assert(fncorp[i]>=0);
300               k = k+1;
301             }
302           }
303         }
304         else {
305           for (i=0; i<nshg ; i++) {
306             nodetoinsert = i;
307             for (j=1; j<=nflow; j++) {
308               indexsetary[k] = nodetoinsert*nflow+(j-1);
309               k = k+1;
310             }
311           }
312         }
313 
314         for (i=0; i<nshg*nflow; i++) {
315           gblindexsetary[i] = i ; //TODO: better option for performance?
316         }
317 //  Create Vector Index Maps
318         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
319         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
320      	//       PETSC_COPY_VALUES, &LocalIndexSet);
321         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
322      	       PETSC_COPY_VALUES, &LocalIndexSet);
323 //        printf("after 1st ISCreateGeneral %d\n", ierr);
324         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
325         //       PETSC_COPY_VALUES, &GlobalIndexSet);
326         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
327                PETSC_COPY_VALUES, &GlobalIndexSet);
328 //        printf("after 2nd ISCreateGeneral %d\n", ierr);
329         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
330 //        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
331         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
332 //        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
333       free(indexsetary);
334       free(gblindexsetary);
335       }
336       if(genpar.lhs == 1) {
337       get_time((duration), (duration+1));
338       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
339       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
340       get_time((duration+2),(duration+3));
341       get_max_time_diff((duration), (duration+2),
342                         (duration+1), (duration+3),
343                         "MatAssembly \0"); // char(0))
344       get_time(duration, (duration+1));
345       }
346       if(firstpetsccall == 1) {
347         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
348 //  TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol
349         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
350         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
351         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
352         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
353       }
354       ierr = VecZeroEntries(resP);
355       ierr = VecZeroEntries(DyP);
356       if(firstpetsccall == 1) {
357         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
358         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
359         //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
360         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
361       }
362         ierr = VecZeroEntries(resPGlobal);
363 //         call VecZeroEntries(DyPLocal, ierr)
364 
365       PetscRow=0;
366       k = 0;
367       int index;
368       for (i=0; i<nshg; i++ ){
369         for (j = 1; j<=nflow; j++){
370           index = i + (j-1)*nshg;
371           valtoinsert = res[index];
372           if(workfc.numpe > 1) {
373             PetscRow = (fncorp[i]-1)*nflow+(j-1);
374           }
375           else {
376             PetscRow = i*nflow+(j-1);
377           }
378           assert(fncorp[i]<=nshgt);
379           assert(fncorp[i]>0);
380           assert(PetscRow>=0);
381           assert(PetscRow<=nshgt*nflow);
382           ierr =  VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
383         }
384       }
385 //      printf("after VecSetValue loop %d\n",ierr);
386       ierr = VecAssemblyBegin(resP);
387       ierr = VecAssemblyEnd(resP);
388       get_time((duration+2), (duration+3));
389       get_max_time_diff((duration), (duration+2),
390                         (duration+1), (duration+3),
391                         "VectorWorkPre \0"); // char(0))
392 
393       get_time((duration),(duration+1));
394 
395       if(firstpetsccall == 1) {
396         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
397         ierr = KSPSetOperators(ksp, lhsP, lhsP);
398 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
399         ierr = KSPGetPC(ksp, &pc);
400         ierr = PCSetType(pc, PCPBJACOBI);
401         PetscInt maxits;
402         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
403         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
404         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
405         ierr = KSPSetFromOptions(ksp);
406       }
407       ierr = KSPSolve(ksp, resP, DyP);
408       get_time((duration+2),(duration+3));
409       get_max_time_diff((duration), (duration+2),
410                         (duration+1), (duration+3),
411                         "KSPSolve \0"); // char(0))
412      //if(myrank .eq. 0) then
413      //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
414      //end if
415       get_time((duration),(duration+1));
416       if(firstpetsccall == 1) {
417       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
418       }
419       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
420       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
421       ierr = VecGetArray(DyPLocal, &parray);
422       PetscRow = 0;
423       for ( node = 0; node< nshg; node++) {
424         for (var = 1; var<= nflow; var++) {
425           index = node + (var-1)*nshg;
426           Dy[index] = parray[PetscRow];
427           PetscRow = PetscRow+1;
428         }
429       }
430       ierr = VecRestoreArray(DyPLocal, &parray);
431 
432       firstpetsccall = 0;
433       // later timer ('Solver  ');
434 
435 
436 // .... output the statistics
437 //
438       itrpar.iKs=0; // see rstat()
439       PetscInt its;
440       //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
441       ierr = KSPGetIterationNumber(ksp, &its);
442       itrpar.iKs = (int) its;
443       get_time((duration+2),(duration+3));
444       get_max_time_diff((duration), (duration+2),
445                         (duration+1), (duration+3),
446                         "solWork \0"); // char(0))
447       itrpar.ntotGM += (int) its;
448       rstat (res, ilwork);
449 //
450 // .... stop the timer
451 //
452 // 3002 continue                  ! no solve just res.
453       // later call timer ('Back    ')
454 //
455 // .... end
456 //
457 }
458 
459 void     SolGMRpSclr(double* y,         double* ac,
460      	double* x,      double* elDw,   int* iBC,       double* BC,
461      	int* col,       int* row,
462      	int* iper,
463      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
464      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
465 {
466 //
467 // ----------------------------------------------------------------------
468 //
469 //  This is the preconditioned GMRES driver routine.
470 //
471 // input:
472 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
473 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
474 //  yold   (nshg,ndof)           : Y-variables at beginning of step
475 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
476 //  x      (numnp,nsd)            : node coordinates
477 //  iBC    (nshg)                : BC codes
478 //  BC     (nshg,ndofBC)         : BC constraint parameters
479 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
480 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
481 //
482 // ----------------------------------------------------------------------
483 //
484 
485 
486 // Get variables from common_c.h
487       int nshg, nflow, nsd, iownnodes;
488       nshg  = conpar.nshg;
489       nsd = NSD;
490       iownnodes = conpar.iownnodes;
491 
492       gcorp_t nshgt;
493       gcorp_t mbeg;
494       gcorp_t mend;
495       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
496       mbeg = (gcorp_t) newdim.minowned;
497       mend = (gcorp_t) newdim.maxowned;
498 
499 
500       int node, element, var, eqn;
501       double valtoinsert;
502       int nenl, iel, lelCat, lcsyst, iorder, nshl;
503       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
504       double DyFlats[nshg];
505       double DyFlatPhastas[nshg];
506       double rmes[nshg];
507 // DEBUG
508       int i,j,k,l,m;
509 
510       double  real_rtol, real_abstol, real_dtol;
511       double *parray;
512       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
513       PetscInt petsc_n;
514       PetscOne = 1;
515       uint64_t duration[4];
516 
517 
518 //
519 //
520 // .... *******************>> Element Data Formation <<******************
521 //
522 //
523 // .... set the parameters for flux and surface tension calculations
524 //
525 //
526       genpar.idflx = 0 ;
527       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
528       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
529 
530 
531 
532       gcorp_t glbNZ;
533 
534       if(firstpetsccalls == 1) {
535         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
536         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
537         for(i=0;i<iownnodes;i++) {
538              idiagnz[i]=0;
539              iodiagnz[i]=0;
540         }
541         i=0;
542         for(k=0;k<nshg;k++) {
543           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
544 // this node is not owned by this rank so we skip
545           } else {
546            for(j=col[i]-1;j<col[i+1]-1;j++) {
547              glbNZ=fncorp[row[j]-1];
548              if((glbNZ < mbeg) || (glbNZ > mend)) {
549                 iodiagnz[i]++;
550              } else {
551                 idiagnz[i]++;
552              }
553            }
554            i++;
555           }
556         }
557         gcorp_t mind=1000;
558         gcorp_t mino=1000;
559         gcorp_t maxd=0;
560         gcorp_t maxo=0;
561         for(i=0;i<iownnodes;i++) {
562            mind=min(mind,idiagnz[i]);
563            mino=min(mino,iodiagnz[i]);
564            maxd=max(maxd,idiagnz[i]);
565            maxo=max(maxo,iodiagnz[i]);
566         }
567 
568         for(i=0;i<iownnodes;i++) {
569            iodiagnz[i]=1.3*maxd;
570            idiagnz[i]=1.3*maxd;
571         }
572 
573 
574 
575 //       }
576 //       if(firstpetsccalls == 1) {
577 
578         petsc_m  = (PetscInt) iownnodes;
579         petsc_M  = (PetscInt) nshgt;
580         petsc_PA  = (PetscInt) 40;
581 
582         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
583                              0, idiagnz, 0, iodiagnz, &lhsPs);
584 
585         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
586 
587 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
588 #ifdef JEDBROWN
589         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
590 #endif
591         ierr = MatSetUp(lhsPs);
592 
593       PetscInt myMatStart, myMatEnd;
594       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
595       }
596 //      MPI_Barrier(MPI_COMM_WORLD);
597  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
598       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
599 
600       get_time(duration, (duration+1));
601 
602 //      MPI_Barrier(MPI_COMM_WORLD);
603  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
604 //      for (i=0; i<nshg ; i++) {
605 //            assert(fncorp[i]>0);
606 //      }
607 
608       ElmGMRPETScSclr(y,             ac,            x, elDw,
609                   shp,           shgl,          iBC,
610                   BC,            shpb,
611                   shglb,         res,
612                   rmes,                   iper,
613                   ilwork,        &lhsPs);
614        if(firstpetsccalls == 1) {
615       // Setup IndexSet. For now, we mimic vector insertion procedure
616       // Since we always reference by global indexes this doesn't matter
617       // except for cache performance)
618       // TODO: Better arrangment?
619 
620       PetscInt* indexsetarys = malloc(sizeof(PetscInt)*nshg);
621       PetscInt* gblindexsetarys = malloc(sizeof(PetscInt)*nshg);
622       PetscInt nodetoinsert;
623 
624         nodetoinsert = 0;
625         k=0;
626         if(workfc.numpe > 1) {
627           for (i=0; i<nshg ; i++) {
628             nodetoinsert = fncorp[i]-1;
629             indexsetarys[k] = nodetoinsert;
630             k = k+1;
631           }
632         }
633         else {
634           for (i=0; i<nshg ; i++) {
635             nodetoinsert = i;
636             indexsetarys[k] = nodetoinsert;
637             k = k+1;
638           }
639         }
640 
641         for (i=0; i<nshg; i++) {
642           gblindexsetarys[i] = i ; //TODO: better option for performance?
643         }
644 //  Create Vector Index Maps
645         petsc_n  = (PetscInt) nshg;
646         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
647      	       PETSC_COPY_VALUES, &LocalIndexSets);
648         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
649                PETSC_COPY_VALUES, &GlobalIndexSets);
650         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
651         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
652       free(indexsetarys);
653       free(gblindexsetarys);
654       }
655       if(genpar.lhs ==1) {
656       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
657       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
658       }
659       if(firstpetsccalls == 1) {
660         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
661         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
662         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
663       }
664       ierr = VecZeroEntries(resPs);
665       ierr = VecZeroEntries(DyPs);
666       if(firstpetsccalls == 1) {
667         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
668         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
669       }
670         ierr = VecZeroEntries(resPGlobals);
671 
672       PetscRow=0;
673       k = 0;
674       int index;
675       for (i=0; i<nshg; i++ ){
676           valtoinsert = res[i];
677           if(workfc.numpe > 1) {
678             PetscRow = (fncorp[i]-1);
679           }
680           else {
681             PetscRow = i-1;
682           }
683           assert(fncorp[i]<=nshgt);
684           assert(fncorp[i]>0);
685           assert(PetscRow>=0);
686           assert(PetscRow<=nshgt);
687           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
688       }
689 //      printf("after VecSetValue loop %d\n",ierr);
690       ierr = VecAssemblyBegin(resPs);
691       ierr = VecAssemblyEnd(resPs);
692 
693       if(firstpetsccalls == 1) {
694         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
695         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
696 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
697         ierr = KSPGetPC(ksps, &pcs);
698         ierr = PCSetType(pcs, PCPBJACOBI);
699         PetscInt maxits;
700         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
701         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
702         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
703         ierr = KSPSetFromOptions(ksps);
704       }
705       ierr = KSPSolve(ksps, resPs, DyPs);
706       if(firstpetsccalls == 1) {
707       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
708       }
709       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
710       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
711       ierr = VecGetArray(DyPLocals, &parray);
712       PetscRow = 0;
713       for ( node = 0; node< nshg; node++) {
714           index = node;
715           Dy[index] = parray[PetscRow];
716           PetscRow = PetscRow+1;
717       }
718       ierr = VecRestoreArray(DyPLocals, &parray);
719 
720       firstpetsccalls = 0;
721 
722 // .... output the statistics
723 //
724 //      itrpar.iKss=0; // see rstat()
725       PetscInt its;
726       ierr = KSPGetIterationNumber(ksps, &its);
727       itrpar.iKss = (int) its;
728       itrpar.ntotGMs += (int) its;
729       rstatSclr (res, ilwork);
730 //
731 // .... end
732 //
733 }
734 #endif
735