xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision 0be30ed5f961ad1a1373ad7d2e756d471b0c0231)
1 #ifdef HAVE_PETSC
2 #include <stdio.h>
3 
4 #include "petscsys.h"
5 #include "petscvec.h"
6 #include "petscmat.h"
7 #include "petscpc.h"
8 #include "petscksp.h"
9 
10 #include "assert.h"
11 #include "common_c.h"
12 
13 #include <FCMangle.h>
14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
20 #define min(a,b) (((a) < (b)) ? (a) : (b))
21 #define max(a,b) (((a) > (b)) ? (a) : (b))
22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
24 
25 typedef long long int gcorp_t;
26 
27 void get_time(uint64_t* rv, uint64_t* cycle);
28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
29 
30 //#include "auxmpi.h"
31 //      static PetscOffset poff;
32 
33       static Mat lhsP;
34       static PC pc;
35       static KSP ksp;
36       static Vec DyP, resP, DyPLocal;
37       static PetscErrorCode ierr;
38       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
39       static IS LocalIndexSet, GlobalIndexSet;
40       static ISLocalToGlobalMapping VectorMapping;
41       static ISLocalToGlobalMapping GblVectorMapping;
42       static  VecScatter scatter7;
43       static int firstpetsccall = 1;
44 
45       static Mat lhsPs;
46       static PC pcs;
47       static KSP ksps;
48       static Vec DyPs, resPs, DyPLocals;
49       static IS LocalIndexSets, GlobalIndexSets;
50       static ISLocalToGlobalMapping VectorMappings;
51       static ISLocalToGlobalMapping GblVectorMappings;
52       static VecScatter scatter7s;
53       static int firstpetsccalls = 1;
54 
55       static int rankdump=-1;  // 8121 was the problem rank with 3.5.3
56 
57 
58 void     SolGMRp(double* y,         double* ac,        double* yold,
59      	double* x,         int* iBC,       double* BC,
60      	int* col,       int* row,       double* lhsk,
61      	double* res,       double* BDiag,
62      	int* iper,
63      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
64      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
65 {
66 //
67 // ----------------------------------------------------------------------
68 //
69 //  This is the preconditioned GMRES driver routine.
70 //
71 // input:
72 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
73 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
74 //  yold   (nshg,ndof)           : Y-variables at beginning of step
75 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
76 //  x      (numnp,nsd)            : node coordinates
77 //  iBC    (nshg)                : BC codes
78 //  BC     (nshg,ndofBC)         : BC constraint parameters
79 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
80 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
81 //
82 // output:
83 //  res    (nshg,nflow)           : preconditioned residual
84 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
85 //
86 //
87 // Zdenek Johan,  Winter 1991.  (Fortran 90)
88 // ----------------------------------------------------------------------
89 //
90 
91 
92 // Get variables from common_c.h
93       int nshg, nflow, nsd, iownnodes;
94       nshg  = conpar.nshg;
95       nflow = conpar.nflow;
96       nsd = NSD;
97       iownnodes = conpar.iownnodes;
98 
99       gcorp_t nshgt;
100       gcorp_t mbeg;
101       gcorp_t mend;
102       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
103       mbeg = (gcorp_t) newdim.minowned;
104       mend = (gcorp_t) newdim.maxowned;
105 
106 
107       int node, element, var, eqn;
108       double valtoinsert;
109       int nenl, iel, lelCat, lcsyst, iorder, nshl;
110       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
111       double DyFlat[nshg*nflow];
112       double DyFlatPhasta[nshg*nflow];
113       double rmes[nshg*nflow];
114 // DEBUG
115       int i,j,k,l,m;
116 
117       // FIXME: PetscScalar
118       double  real_rtol, real_abstol, real_dtol;
119 // /DEBUG
120       //double parray[1]; // Should be a PetscScalar
121       double *parray; // Should be a PetscScalar
122       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
123       PetscInt petsc_n;
124       PetscOne = 1;
125       uint64_t duration[4];
126 
127 //      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));
128 
129        if(firstpetsccall == 1) {
130 /*         call get_time(duration(1),duration(2));
131          call system('sleep 10');
132          call get_time(duration(3),duration(4));
133          call get_max_time_diff(duration(1), duration(3),
134                               duration(2), duration(4),
135                               "TenSecondSleep " // char(0))
136          if(myrank .eq. 0) {
137            !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
138          }
139 */
140        }
141 //
142 //
143 // .... *******************>> Element Data Formation <<******************
144 //
145 //
146 // .... set the parameters for flux and surface tension calculations
147 //
148 //
149       genpar.idflx = 0 ;
150       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
151       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
152 
153 
154 
155       gcorp_t glbNZ;
156 
157       if(firstpetsccall == 1) {
158         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
159         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
160         for(i=0;i<iownnodes;i++) {
161              idiagnz[i]=0;
162              iodiagnz[i]=0;
163         }
164         i=0;
165         for(k=0;k<nshg;k++) {
166           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
167 // this node is not owned by this rank so we skip
168           } else {
169            for(j=col[i]-1;j<col[i+1]-1;j++) {
170 //          assert(row[j]<=nshg);
171 //          assert(fncorp[row[j]-1]<=nshgt);
172              glbNZ=fncorp[row[j]-1];
173              if((glbNZ < mbeg) || (glbNZ > mend)) {
174                 iodiagnz[i]++;
175              } else {
176                 idiagnz[i]++;
177              }
178            }
179            i++;
180           }
181         }
182         gcorp_t mind=1000;
183         gcorp_t mino=1000;
184         gcorp_t maxd=0;
185         gcorp_t maxo=0;
186         for(i=0;i<iownnodes;i++) {
187            mind=min(mind,idiagnz[i]);
188            mino=min(mino,iodiagnz[i]);
189            maxd=max(maxd,idiagnz[i]);
190            maxo=max(maxo,iodiagnz[i]);
191 //           iodiagnz[i]=max(iodiagnz[i],10);
192 //           idiagnz[i]=max(idiagnz[i],10);
193 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
194 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
195         }
196 // the above was pretty good but below is faster and not too much more memory...of course once you do this
197 // could just use the constant fill parameters in create but keep it alive for potential later optimization
198 
199         for(i=0;i<iownnodes;i++) {
200            iodiagnz[i]=1.3*maxd;
201            idiagnz[i]=1.3*maxd;
202         }
203 
204 
205 
206         if(workfc.numpe < 200){
207           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
208           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
209         }
210         // Print debug info
211         if(nshgt < 200){
212         int irank;
213         for(irank=0;irank<workfc.numpe;irank++) {
214           if(irank == workfc.myrank){
215             printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
216             for(i=0;i<iownnodes;i++) {
217               printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
218             }
219           }
220          MPI_Barrier(MPI_COMM_WORLD);
221         }
222         }
223 //       }
224 //       if(firstpetsccall == 1) {
225 
226         petsc_bs = (PetscInt) nflow;
227         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
228         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
229         petsc_PA  = (PetscInt) 40;
230         // TODO: fixe nshgt for 64 bit integer!!!!!
231 
232         //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
233         //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,
234 
235 // this has no pre-allocate        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
236 // this has no pre-allocate                             0, NULL, 0, NULL, &lhsP);
237 
238 // next 2 do pre-allocate
239 
240 //      MPI_Barrier(MPI_COMM_WORLD);
241 //      if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);
242 
243         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
244                              0, idiagnz, 0, iodiagnz, &lhsP);
245 
246 //                              petsc_PA, NULL, petsc_PA, NULL, &lhsP);
247 //      MPI_Barrier(MPI_COMM_WORLD);
248 //      if(workfc.myrank ==0) printf("Before matSetOoption 1  \n");
249         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
250 
251 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
252 #ifdef HIDEJEDBROWN
253         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
254 #endif
255         ierr = MatSetUp(lhsP);
256 
257       PetscInt myMatStart, myMatEnd;
258       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
259 //debug
260       if(workfc.myrank == rankdump) printf("Flow myrank,myMatStart,myMatEnd %d,%d,%d, \n", workfc.myrank,myMatStart,myMatEnd);
261       }
262 //      MPI_Barrier(MPI_COMM_WORLD);
263  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
264       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
265 
266       get_time(duration, (duration+1));
267 
268 //      MPI_Barrier(MPI_COMM_WORLD);
269  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
270 //      for (i=0; i<nshg ; i++) {
271 //            assert(fncorp[i]>0);
272 //      }
273 
274       ElmGMRPETSc(y,             ac,            x,
275                   shp,           shgl,          iBC,
276                   BC,            shpb,
277                   shglb,         res,
278                   rmes,                   iper,
279                   ilwork,        rerr ,         &lhsP);
280 //  Below was just to confirm that what is in rstat is the res or b vector given to PETSc
281 //      rstat (res, ilwork);
282 //      if(workfc.myrank ==0) printf("residual after ElmGMRPETSc\n");
283       get_time((duration+2), (duration+3));
284       get_max_time_diff((duration), (duration+2),
285                         (duration+1), (duration+3),
286                         "ElmGMRPETSc \0");  // char(0))
287 //       printf("after ElmGMRPETSc\n");
288        if(firstpetsccall == 1) {
289       // Setup IndexSet. For now, we mimic vector insertion procedure
290       // Since we always reference by global indexes this doesn't matter
291       // except for cache performance)
292       // TODO: Better arrangment?
293       //PetscInt indexsetary[nflow*nshg];
294       PetscInt* indexsetary = malloc(sizeof(PetscInt)*nflow*nshg);
295       PetscInt* gblindexsetary = malloc(sizeof(PetscInt)*nflow*nshg);
296       PetscInt nodetoinsert;
297         nodetoinsert = 0;
298         k=0;
299          if(workfc.myrank == rankdump) {
300              printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
301              printf("myrank,mbeg,mend %d %d %d \n",workfc.myrank,mbeg,mend);
302            }
303         if(workfc.numpe > 1) {
304           for (i=0; i<nshg ; i++) {
305             nodetoinsert = fncorp[i]-1;
306 
307 //debug
308          if(workfc.myrank == rankdump) {
309              printf("myrank,i,nodetoinsert %d %d %d \n",workfc.myrank,i,nodetoinsert);
310          }
311 
312 //            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
313 //            assert(fncorp[i]>=0);
314             for (j=1; j<=nflow; j++) {
315               indexsetary[k] = nodetoinsert*nflow+(j-1);
316               assert(indexsetary[k]>=0);
317 //              assert(fncorp[i]>=0);
318               k = k+1;
319             }
320           }
321         }
322         else {
323           for (i=0; i<nshg ; i++) {
324             nodetoinsert = i;
325             for (j=1; j<=nflow; j++) {
326               indexsetary[k] = nodetoinsert*nflow+(j-1);
327               k = k+1;
328             }
329           }
330         }
331 
332         for (i=0; i<nshg*nflow; i++) {
333           gblindexsetary[i] = i ; //TODO: better option for performance?
334 //debug
335          if(workfc.myrank == rankdump) {
336              printf("myrank,i,glbindexsetary %d %d %d \n",workfc.myrank,i,gblindexsetary[i]);
337          }
338         }
339 //  Create Vector Index Maps
340         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
341         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
342      	//       PETSC_COPY_VALUES, &LocalIndexSet);
343         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
344      	       PETSC_COPY_VALUES, &LocalIndexSet);
345 //        printf("after 1st ISCreateGeneral %d\n", ierr);
346         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
347         //       PETSC_COPY_VALUES, &GlobalIndexSet);
348         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
349                PETSC_COPY_VALUES, &GlobalIndexSet);
350 //        printf("after 2nd ISCreateGeneral %d\n", ierr);
351         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
352 //        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
353         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
354 //        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
355       free(indexsetary);
356       free(gblindexsetary);
357       }
358       if(genpar.lhs == 1) {
359       get_time((duration), (duration+1));
360       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
361       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
362       get_time((duration+2),(duration+3));
363       get_max_time_diff((duration), (duration+2),
364                         (duration+1), (duration+3),
365                         "MatAssembly \0"); // char(0))
366       get_time(duration, (duration+1));
367       }
368       if(firstpetsccall == 1) {
369         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
370 //  TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol
371         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
372         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
373         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
374         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
375       }
376       ierr = VecZeroEntries(resP);
377       ierr = VecZeroEntries(DyP);
378       if(firstpetsccall == 1) {
379         //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
380         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
381       }
382 //         call VecZeroEntries(DyPLocal, ierr)
383 
384       PetscRow=0;
385       k = 0;
386       int index;
387       for (i=0; i<nshg; i++ ){
388         for (j = 1; j<=nflow; j++){
389           index = i + (j-1)*nshg;
390           valtoinsert = res[index];
391           if(workfc.numpe > 1) {
392             PetscRow = (fncorp[i]-1)*nflow+(j-1);
393           }
394           else {
395             PetscRow = i*nflow+(j-1);
396           }
397           assert(fncorp[i]<=nshgt);
398           assert(fncorp[i]>0);
399           assert(PetscRow>=0);
400           assert(PetscRow<=nshgt*nflow);
401           ierr =  VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
402         }
403       }
404 //      printf("after VecSetValue loop %d\n",ierr);
405       ierr = VecAssemblyBegin(resP);
406       ierr = VecAssemblyEnd(resP);
407       get_time((duration+2), (duration+3));
408       get_max_time_diff((duration), (duration+2),
409                         (duration+1), (duration+3),
410                         "VectorWorkPre \0"); // char(0))
411 
412       get_time((duration),(duration+1));
413 
414       if(firstpetsccall == 1) {
415         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
416         ierr = KSPSetOperators(ksp, lhsP, lhsP);
417 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
418         ierr = KSPGetPC(ksp, &pc);
419         ierr = PCSetType(pc, PCPBJACOBI);
420         PetscInt maxits;
421         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
422         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
423         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
424         ierr = KSPSetFromOptions(ksp);
425       }
426       ierr = KSPSolve(ksp, resP, DyP);
427       get_time((duration+2),(duration+3));
428       get_max_time_diff((duration), (duration+2),
429                         (duration+1), (duration+3),
430                         "KSPSolve \0"); // char(0))
431      //if(myrank .eq. 0) then
432      //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
433      //end if
434       get_time((duration),(duration+1));
435       if(firstpetsccall == 1) {
436       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
437       }
438       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
439       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
440       ierr = VecGetArray(DyPLocal, &parray);
441       PetscRow = 0;
442       for ( node = 0; node< nshg; node++) {
443         for (var = 1; var<= nflow; var++) {
444           index = node + (var-1)*nshg;
445           Dy[index] = parray[PetscRow];
446           PetscRow = PetscRow+1;
447         }
448       }
449       ierr = VecRestoreArray(DyPLocal, &parray);
450 
451       firstpetsccall = 0;
452       // later timer ('Solver  ');
453 
454 
455 // .... output the statistics
456 //
457       itrpar.iKs=0; // see rstat()
458       PetscInt its;
459       //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
460       ierr = KSPGetIterationNumber(ksp, &its);
461       itrpar.iKs = (int) its;
462       get_time((duration+2),(duration+3));
463       get_max_time_diff((duration), (duration+2),
464                         (duration+1), (duration+3),
465                         "solWork \0"); // char(0))
466       itrpar.ntotGM += (int) its;
467       rstat (res, ilwork);
468 //
469 // .... stop the timer
470 //
471 // 3002 continue                  ! no solve just res.
472       // later call timer ('Back    ')
473 //
474 // .... end
475 //
476 }
477 
478 void     SolGMRpSclr(double* y,         double* ac,
479      	double* x,      double* elDw,   int* iBC,       double* BC,
480      	int* col,       int* row,
481      	int* iper,
482      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
483      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
484 {
485 //
486 // ----------------------------------------------------------------------
487 //
488 //  This is the preconditioned GMRES driver routine.
489 //
490 // input:
491 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
492 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
493 //  yold   (nshg,ndof)           : Y-variables at beginning of step
494 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
495 //  x      (numnp,nsd)            : node coordinates
496 //  iBC    (nshg)                : BC codes
497 //  BC     (nshg,ndofBC)         : BC constraint parameters
498 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
499 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
500 //
501 // ----------------------------------------------------------------------
502 //
503 
504 
505 // Get variables from common_c.h
506       int nshg, nflow, nsd, iownnodes;
507       nshg  = conpar.nshg;
508       nsd = NSD;
509       iownnodes = conpar.iownnodes;
510 
511       gcorp_t nshgt;
512       gcorp_t mbeg;
513       gcorp_t mend;
514       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
515       mbeg = (gcorp_t) newdim.minowned;
516       mend = (gcorp_t) newdim.maxowned;
517 
518 
519       int node, element, var, eqn;
520       double valtoinsert;
521       int nenl, iel, lelCat, lcsyst, iorder, nshl;
522       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
523       double DyFlats[nshg];
524       double DyFlatPhastas[nshg];
525       double rmes[nshg];
526 // DEBUG
527       int i,j,k,l,m;
528 
529       double  real_rtol, real_abstol, real_dtol;
530       double *parray;
531       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
532       PetscInt petsc_n;
533       PetscOne = 1;
534       uint64_t duration[4];
535 
536 
537 //
538 //
539 // .... *******************>> Element Data Formation <<******************
540 //
541 //
542 // .... set the parameters for flux and surface tension calculations
543 //
544 //
545       genpar.idflx = 0 ;
546       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
547       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
548 
549 
550 
551       gcorp_t glbNZ;
552 
553       if(firstpetsccalls == 1) {
554         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
555         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
556         for(i=0;i<iownnodes;i++) {
557              idiagnz[i]=0;
558              iodiagnz[i]=0;
559         }
560         i=0;
561         for(k=0;k<nshg;k++) {
562           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
563 // this node is not owned by this rank so we skip
564           } else {
565            for(j=col[i]-1;j<col[i+1]-1;j++) {
566              glbNZ=fncorp[row[j]-1];
567              if((glbNZ < mbeg) || (glbNZ > mend)) {
568                 iodiagnz[i]++;
569              } else {
570                 idiagnz[i]++;
571              }
572            }
573            i++;
574           }
575         }
576         gcorp_t mind=1000;
577         gcorp_t mino=1000;
578         gcorp_t maxd=0;
579         gcorp_t maxo=0;
580         for(i=0;i<iownnodes;i++) {
581            mind=min(mind,idiagnz[i]);
582            mino=min(mino,iodiagnz[i]);
583            maxd=max(maxd,idiagnz[i]);
584            maxo=max(maxo,iodiagnz[i]);
585         }
586 
587         for(i=0;i<iownnodes;i++) {
588            iodiagnz[i]=1.3*maxd;
589            idiagnz[i]=1.3*maxd;
590         }
591 
592 
593 
594 //       }
595 //       if(firstpetsccalls == 1) {
596 
597         petsc_m  = (PetscInt) iownnodes;
598         petsc_M  = (PetscInt) nshgt;
599         petsc_PA  = (PetscInt) 40;
600 
601         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
602                             0, idiagnz, 0, iodiagnz, &lhsPs);
603 
604         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
605 
606 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
607 #ifdef HIDEJEDBROWN
608         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
609 #endif
610         ierr = MatSetUp(lhsPs);
611 
612       PetscInt myMatStart, myMatEnd;
613       ierr = MatGetOwnershipRange(lhsPs, &myMatStart, &myMatEnd);
614       if(workfc.myrank ==rankdump) printf("Sclr myrank,myMatStart,myMatEnd %d,%d,%d, \n", workfc.myrank,myMatStart,myMatEnd);
615       }
616 //      MPI_Barrier(MPI_COMM_WORLD);
617  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
618       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
619 
620       get_time(duration, (duration+1));
621 
622 //      MPI_Barrier(MPI_COMM_WORLD);
623  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
624 //      for (i=0; i<nshg ; i++) {
625 //            assert(fncorp[i]>0);
626 //      }
627 
628       ElmGMRPETScSclr(y,             ac,            x, elDw,
629                   shp,           shgl,          iBC,
630                   BC,            shpb,
631                   shglb,         res,
632                   rmes,                   iper,
633                   ilwork,        &lhsPs);
634        if(firstpetsccalls == 1) {
635       // Setup IndexSet. For now, we mimic vector insertion procedure
636       // Since we always reference by global indexes this doesn't matter
637       // except for cache performance)
638       // TODO: Better arrangment?
639 
640       PetscInt* indexsetarys = malloc(sizeof(PetscInt)*nshg);
641       PetscInt* gblindexsetarys = malloc(sizeof(PetscInt)*nshg);
642       PetscInt nodetoinsert;
643 
644         nodetoinsert = 0;
645         k=0;
646 
647 //debug
648          if(workfc.myrank == rankdump) {
649              printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
650              printf("myrank,mbeg,mend %d %d %d \n",workfc.myrank,mbeg,mend);
651            }
652 
653         if(workfc.numpe > 1) {
654           for (i=0; i<nshg ; i++) {
655             nodetoinsert = fncorp[i]-1;
656 //debug
657          if(workfc.myrank == rankdump) {
658              printf("myrank,i,nodetoinsert %d %d %d \n",workfc.myrank,i,nodetoinsert);
659          }
660 
661             indexsetarys[k] = nodetoinsert;
662             k = k+1;
663           }
664         }
665         else {
666           for (i=0; i<nshg ; i++) {
667             nodetoinsert = i;
668             indexsetarys[k] = nodetoinsert;
669             k = k+1;
670           }
671         }
672 
673         for (i=0; i<nshg; i++) {
674           gblindexsetarys[i] = i ; //TODO: better option for performance?
675 //debug
676          if(workfc.myrank == rankdump) {
677              printf("myrank,i,glbindexsetarys %d %d %d \n",workfc.myrank,i,gblindexsetarys[i]);
678          }
679         }
680 //  Create Vector Index Maps
681         petsc_n  = (PetscInt) nshg;
682         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
683      	       PETSC_COPY_VALUES, &LocalIndexSets);
684         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
685                PETSC_COPY_VALUES, &GlobalIndexSets);
686         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
687         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
688       free(indexsetarys);
689       free(gblindexsetarys);
690       }
691       if(genpar.lhs ==1) {
692       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
693       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
694       }
695       if(firstpetsccalls == 1) {
696         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
697         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
698         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
699       }
700       ierr = VecZeroEntries(resPs);
701       ierr = VecZeroEntries(DyPs);
702       if(firstpetsccalls == 1) {
703         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
704       }
705 
706       PetscRow=0;
707       k = 0;
708       int index;
709       for (i=0; i<nshg; i++ ){
710           valtoinsert = res[i];
711           if(workfc.numpe > 1) {
712             PetscRow = (fncorp[i]-1);
713           }
714           else {
715             PetscRow = i-1;
716           }
717           assert(fncorp[i]<=nshgt);
718           assert(fncorp[i]>0);
719           assert(PetscRow>=0);
720           assert(PetscRow<=nshgt);
721           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
722       }
723 //      printf("after VecSetValue loop %d\n",ierr);
724       ierr = VecAssemblyBegin(resPs);
725       ierr = VecAssemblyEnd(resPs);
726 
727       if(firstpetsccalls == 1) {
728         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
729         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
730 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
731         ierr = KSPGetPC(ksps, &pcs);
732         ierr = PCSetType(pcs, PCPBJACOBI);
733         PetscInt maxits;
734         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
735         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
736         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
737         ierr = KSPSetFromOptions(ksps);
738       }
739       ierr = KSPSolve(ksps, resPs, DyPs);
740       if(firstpetsccalls == 1) {
741       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
742       }
743       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
744       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
745       ierr = VecGetArray(DyPLocals, &parray);
746       PetscRow = 0;
747       for ( node = 0; node< nshg; node++) {
748           index = node;
749           Dy[index] = parray[PetscRow];
750           PetscRow = PetscRow+1;
751       }
752       ierr = VecRestoreArray(DyPLocals, &parray);
753 
754       firstpetsccalls = 0;
755 
756 // .... output the statistics
757 //
758 //      itrpar.iKss=0; // see rstat()
759       PetscInt its;
760       ierr = KSPGetIterationNumber(ksps, &its);
761       itrpar.iKss = (int) its;
762       itrpar.ntotGMs += (int) its;
763       rstatSclr (res, ilwork);
764 //
765 // .... end
766 //
767 }
768 #endif
769