xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision 1016729149754f57cd03fe576ba6fd0f1723ab31)
1 #include <stdio.h>
2 
3 #include "petscsys.h"
4 #include "petscvec.h"
5 #include "petscmat.h"
6 #include "petscpc.h"
7 #include "petscksp.h"
8 
9 #include "assert.h"
10 #include "common_c.h"
11 
12 #include <FCMangle.h>
13 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
14 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
15 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
16 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
17 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
18 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
19 #define min(a,b) (((a) < (b)) ? (a) : (b))
20 #define max(a,b) (((a) > (b)) ? (a) : (b))
21 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
22 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
23 
24 typedef long long int gcorp_t;
25 
26 void get_time(uint64_t* rv, uint64_t* cycle);
27 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
28 
29 //#include "auxmpi.h"
30 //      static PetscOffset poff;
31 
32       static Mat lhsP;
33       static PC pc;
34       static KSP ksp;
35       static Vec DyP, resP, DyPLocal, resPGlobal;
36       static PetscErrorCode ierr;
37       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
38       static IS LocalIndexSet, GlobalIndexSet;
39       static ISLocalToGlobalMapping VectorMapping;
40       static ISLocalToGlobalMapping GblVectorMapping;
41       static  VecScatter scatter7;
42       static int firstpetsccall = 1;
43 
44       static Mat lhsPs;
45       static PC pcs;
46       static KSP ksps;
47       static Vec DyPs, resPs, DyPLocals, resPGlobals;
48       static IS LocalIndexSets, GlobalIndexSets;
49       static ISLocalToGlobalMapping VectorMappings;
50       static ISLocalToGlobalMapping GblVectorMappings;
51       static VecScatter scatter7s;
52       static int firstpetsccalls = 1;
53 
54 void     SolGMRp(double* y,         double* ac,        double* yold,
55      	double* x,         int* iBC,       double* BC,
56      	int* col,       int* row,       double* lhsk,
57      	double* res,       double* BDiag,
58      	int* iper,
59      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
60      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
61 {
62 //
63 // ----------------------------------------------------------------------
64 //
65 //  This is the preconditioned GMRES driver routine.
66 //
67 // input:
68 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
69 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
70 //  yold   (nshg,ndof)           : Y-variables at beginning of step
71 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
72 //  x      (numnp,nsd)            : node coordinates
73 //  iBC    (nshg)                : BC codes
74 //  BC     (nshg,ndofBC)         : BC constraint parameters
75 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
76 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
77 //
78 // output:
79 //  res    (nshg,nflow)           : preconditioned residual
80 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
81 //
82 //
83 // Zdenek Johan,  Winter 1991.  (Fortran 90)
84 // ----------------------------------------------------------------------
85 //
86 
87 
88 // Get variables from common_c.h
89       int nshg, nflow, nsd, iownnodes;
90       nshg  = conpar.nshg;
91       nflow = conpar.nflow;
92       nsd = NSD;
93       iownnodes = conpar.iownnodes;
94 
95       gcorp_t nshgt;
96       gcorp_t mbeg;
97       gcorp_t mend;
98       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
99       mbeg = (gcorp_t) newdim.minowned;
100       mend = (gcorp_t) newdim.maxowned;
101 
102 
103       int node, element, var, eqn;
104       double valtoinsert;
105       int nenl, iel, lelCat, lcsyst, iorder, nshl;
106       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
107       double DyFlat[nshg*nflow];
108       double DyFlatPhasta[nshg*nflow];
109       double rmes[nshg*nflow];
110 // DEBUG
111       int i,j,k,l,m;
112       //int indexsetary[nflow*nshg];
113       //int gblindexsetary[nflow*nshg];
114       PetscInt indexsetary[nflow*nshg];
115       PetscInt gblindexsetary[nflow*nshg];
116       PetscInt nodetoinsert;
117 
118       // FIXME: PetscScalar
119       double  real_rtol, real_abstol, real_dtol;
120 // /DEBUG
121       //double parray[1]; // Should be a PetscScalar
122       double *parray; // Should be a PetscScalar
123       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
124       PetscInt petsc_n;
125       PetscOne = 1;
126       uint64_t duration[4];
127 
128 //      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));
129 
130        if(firstpetsccall == 1) {
131 /*         call get_time(duration(1),duration(2));
132          call system('sleep 10');
133          call get_time(duration(3),duration(4));
134          call get_max_time_diff(duration(1), duration(3),
135                               duration(2), duration(4),
136                               "TenSecondSleep " // char(0))
137          if(myrank .eq. 0) {
138            !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
139          }
140 */
141        }
142 //
143 //
144 // .... *******************>> Element Data Formation <<******************
145 //
146 //
147 // .... set the parameters for flux and surface tension calculations
148 //
149 //
150       genpar.idflx = 0 ;
151       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
152       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
153 
154 
155 
156       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
157       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
158       gcorp_t glbNZ;
159 
160       if(firstpetsccall == 1) {
161         for(i=0;i<iownnodes;i++) {
162              idiagnz[i]=0;
163              iodiagnz[i]=0;
164         }
165         i=0;
166         for(k=0;k<nshg;k++) {
167           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
168 // this node is not owned by this rank so we skip
169           } else {
170            for(j=col[i]-1;j<col[i+1]-1;j++) {
171              glbNZ=fncorp[row[j]];
172              if((glbNZ < mbeg) || (glbNZ > mend)) {
173                 iodiagnz[i]++;
174              } else {
175                 idiagnz[i]++;
176              }
177            }
178            i++;
179           }
180         }
181         gcorp_t mind=1000;
182         gcorp_t mino=1000;
183         gcorp_t maxd=0;
184         gcorp_t maxo=0;
185         for(i=0;i<iownnodes;i++) {
186            mind=min(mind,idiagnz[i]);
187            mino=min(mino,iodiagnz[i]);
188            maxd=max(maxd,idiagnz[i]);
189            maxo=max(maxo,iodiagnz[i]);
190 //           iodiagnz[i]=max(iodiagnz[i],10);
191 //           idiagnz[i]=max(idiagnz[i],10);
192 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
193 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
194         }
195 // the above was pretty good but below is faster and not too much more memory...of course once you do this
196 // could just use the constant fill parameters in create but keep it alive for potential later optimization
197 
198         for(i=0;i<iownnodes;i++) {
199            iodiagnz[i]=1.3*maxd;
200            idiagnz[i]=1.3*maxd;
201         }
202 
203 
204 
205         if(workfc.numpe < 200){
206           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
207           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
208         }
209         // Print debug info
210         if(nshgt < 200){
211         int irank;
212         for(irank=0;irank<workfc.numpe;irank++) {
213           if(irank == workfc.myrank){
214             printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
215             for(i=0;i<iownnodes;i++) {
216               printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
217             }
218           }
219          MPI_Barrier(MPI_COMM_WORLD);
220         }
221         }
222        }
223        if(firstpetsccall == 1) {
224 
225         petsc_bs = (PetscInt) nflow;
226         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
227         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
228         petsc_PA  = (PetscInt) 40;
229         // TODO: fixe nshgt for 64 bit integer!!!!!
230 
231         //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
232         //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,
233 
234 // this has no pre-allocate        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
235 // this has no pre-allocate                             0, NULL, 0, NULL, &lhsP);
236 
237 // next 2 do pre-allocate
238 
239 //      MPI_Barrier(MPI_COMM_WORLD);
240 //      if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);
241 
242         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
243                              0, idiagnz, 0, iodiagnz, &lhsP);
244 
245 //                              petsc_PA, NULL, petsc_PA, NULL, &lhsP);
246 //      MPI_Barrier(MPI_COMM_WORLD);
247 //      if(workfc.myrank ==0) printf("Before matSetOoption 1  \n");
248         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
249 
250 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
251         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
252         ierr = MatSetUp(lhsP);
253 
254       PetscInt myMatStart, myMatEnd;
255       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
256       }
257 //      MPI_Barrier(MPI_COMM_WORLD);
258  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
259       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
260 
261       get_time(duration, (duration+1));
262 
263 //      MPI_Barrier(MPI_COMM_WORLD);
264  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
265 //      for (i=0; i<nshg ; i++) {
266 //            assert(fncorp[i]>0);
267 //      }
268 
269       ElmGMRPETSc(y,             ac,            x,
270                   shp,           shgl,          iBC,
271                   BC,            shpb,
272                   shglb,         res,
273                   rmes,                   iper,
274                   ilwork,        rerr ,         &lhsP);
275       get_time((duration+2), (duration+3));
276       get_max_time_diff((duration), (duration+2),
277                         (duration+1), (duration+3),
278                         "ElmGMRPETSc \0");  // char(0))
279 //       printf("after ElmGMRPETSc\n");
280        if(firstpetsccall == 1) {
281       // Setup IndexSet. For now, we mimic vector insertion procedure
282       // Since we always reference by global indexes this doesn't matter
283       // except for cache performance)
284       // TODO: Better arrangment?
285         nodetoinsert = 0;
286         k=0;
287         if(workfc.numpe > 1) {
288           for (i=0; i<nshg ; i++) {
289             nodetoinsert = fncorp[i]-1;
290 //            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
291 //            assert(fncorp[i]>=0);
292             for (j=1; j<=nflow; j++) {
293               indexsetary[k] = nodetoinsert*nflow+(j-1);
294               assert(indexsetary[k]>=0);
295 //              assert(fncorp[i]>=0);
296               k = k+1;
297             }
298           }
299         }
300         else {
301           for (i=0; i<nshg ; i++) {
302             nodetoinsert = i;
303             for (j=1; j<=nflow; j++) {
304               indexsetary[k] = nodetoinsert*nflow+(j-1);
305               k = k+1;
306             }
307           }
308         }
309 
310         for (i=0; i<nshg*nflow; i++) {
311           gblindexsetary[i] = i ; //TODO: better option for performance?
312         }
313 //  Create Vector Index Maps
314         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
315         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
316      	//       PETSC_COPY_VALUES, &LocalIndexSet);
317         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
318      	       PETSC_COPY_VALUES, &LocalIndexSet);
319 //        printf("after 1st ISCreateGeneral %d\n", ierr);
320         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
321         //       PETSC_COPY_VALUES, &GlobalIndexSet);
322         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
323                PETSC_COPY_VALUES, &GlobalIndexSet);
324 //        printf("after 2nd ISCreateGeneral %d\n", ierr);
325         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
326 //        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
327         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
328 //        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
329       }
330       if(genpar.lhs == 1) {
331       get_time((duration), (duration+1));
332       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
333       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
334       get_time((duration+2),(duration+3));
335       get_max_time_diff((duration), (duration+2),
336                         (duration+1), (duration+3),
337                         "MatAssembly \0"); // char(0))
338       get_time(duration, (duration+1));
339       }
340       if(firstpetsccall == 1) {
341         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
342 //  TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol
343         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
344         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
345         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
346         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
347       }
348       ierr = VecZeroEntries(resP);
349       ierr = VecZeroEntries(DyP);
350       if(firstpetsccall == 1) {
351         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
352         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
353         //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
354         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
355       }
356         ierr = VecZeroEntries(resPGlobal);
357 //         call VecZeroEntries(DyPLocal, ierr)
358 
359       PetscRow=0;
360       k = 0;
361       int index;
362       for (i=0; i<nshg; i++ ){
363         for (j = 1; j<=nflow; j++){
364           index = i + (j-1)*nshg;
365           valtoinsert = res[index];
366           if(workfc.numpe > 1) {
367             PetscRow = (fncorp[i]-1)*nflow+(j-1);
368           }
369           else {
370             PetscRow = i*nflow+(j-1);
371           }
372           assert(fncorp[i]<=nshgt);
373           assert(fncorp[i]>0);
374           assert(PetscRow>=0);
375           assert(PetscRow<=nshgt*nflow);
376           ierr =  VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
377         }
378       }
379 //      printf("after VecSetValue loop %d\n",ierr);
380       ierr = VecAssemblyBegin(resP);
381       ierr = VecAssemblyEnd(resP);
382       get_time((duration+2), (duration+3));
383       get_max_time_diff((duration), (duration+2),
384                         (duration+1), (duration+3),
385                         "VectorWorkPre \0"); // char(0))
386 
387       get_time((duration),(duration+1));
388 
389       if(firstpetsccall == 1) {
390         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
391         ierr = KSPSetOperators(ksp, lhsP, lhsP);
392 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
393         ierr = KSPGetPC(ksp, &pc);
394         ierr = PCSetType(pc, PCPBJACOBI);
395         PetscInt maxits;
396         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
397         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
398         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
399         ierr = KSPSetFromOptions(ksp);
400       }
401       ierr = KSPSolve(ksp, resP, DyP);
402       get_time((duration+2),(duration+3));
403       get_max_time_diff((duration), (duration+2),
404                         (duration+1), (duration+3),
405                         "KSPSolve \0"); // char(0))
406      //if(myrank .eq. 0) then
407      //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
408      //end if
409       get_time((duration),(duration+1));
410       if(firstpetsccall == 1) {
411       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
412       }
413       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
414       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
415       ierr = VecGetArray(DyPLocal, &parray);
416       PetscRow = 0;
417       for ( node = 0; node< nshg; node++) {
418         for (var = 1; var<= nflow; var++) {
419           index = node + (var-1)*nshg;
420           Dy[index] = parray[PetscRow];
421           PetscRow = PetscRow+1;
422         }
423       }
424       ierr = VecRestoreArray(DyPLocal, &parray);
425 
426       firstpetsccall = 0;
427       // later timer ('Solver  ');
428 
429 // .... output the statistics
430 //
431       itrpar.iKs=0; // see rstat()
432       PetscInt its;
433       //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
434       ierr = KSPGetIterationNumber(ksp, &its);
435       itrpar.iKs = (int) its;
436       get_time((duration+2),(duration+3));
437       get_max_time_diff((duration), (duration+2),
438                         (duration+1), (duration+3),
439                         "solWork \0"); // char(0))
440       itrpar.ntotGM += (int) its;
441       rstat (res, ilwork);
442 //
443 // .... stop the timer
444 //
445 // 3002 continue                  ! no solve just res.
446       // later call timer ('Back    ')
447 //
448 // .... end
449 //
450 }
451 
452 void     SolGMRpSclr(double* y,         double* ac,
453      	double* x,      double* elDw,   int* iBC,       double* BC,
454      	int* col,       int* row,
455      	int* iper,
456      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
457      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
458 {
459 //
460 // ----------------------------------------------------------------------
461 //
462 //  This is the preconditioned GMRES driver routine.
463 //
464 // input:
465 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
466 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
467 //  yold   (nshg,ndof)           : Y-variables at beginning of step
468 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
469 //  x      (numnp,nsd)            : node coordinates
470 //  iBC    (nshg)                : BC codes
471 //  BC     (nshg,ndofBC)         : BC constraint parameters
472 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
473 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
474 //
475 // ----------------------------------------------------------------------
476 //
477 
478 
479 // Get variables from common_c.h
480       int nshg, nflow, nsd, iownnodes;
481       nshg  = conpar.nshg;
482       nsd = NSD;
483       iownnodes = conpar.iownnodes;
484 
485       gcorp_t nshgt;
486       gcorp_t mbeg;
487       gcorp_t mend;
488       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
489       mbeg = (gcorp_t) newdim.minowned;
490       mend = (gcorp_t) newdim.maxowned;
491 
492 
493       int node, element, var, eqn;
494       double valtoinsert;
495       int nenl, iel, lelCat, lcsyst, iorder, nshl;
496       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
497       double DyFlats[nshg];
498       double DyFlatPhastas[nshg];
499       double rmes[nshg];
500 // DEBUG
501       int i,j,k,l,m;
502       PetscInt indexsetarys[nshg];
503       PetscInt gblindexsetarys[nshg];
504       PetscInt nodetoinsert;
505 
506       double  real_rtol, real_abstol, real_dtol;
507       double *parray;
508       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
509       PetscInt petsc_n;
510       PetscOne = 1;
511       uint64_t duration[4];
512 
513 
514 //
515 //
516 // .... *******************>> Element Data Formation <<******************
517 //
518 //
519 // .... set the parameters for flux and surface tension calculations
520 //
521 //
522       genpar.idflx = 0 ;
523       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
524       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
525 
526 
527 
528       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
529       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
530       gcorp_t glbNZ;
531 
532       if(firstpetsccalls == 1) {
533         for(i=0;i<iownnodes;i++) {
534              idiagnz[i]=0;
535              iodiagnz[i]=0;
536         }
537         i=0;
538         for(k=0;k<nshg;k++) {
539           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
540 // this node is not owned by this rank so we skip
541           } else {
542            for(j=col[i]-1;j<col[i+1]-1;j++) {
543              glbNZ=fncorp[row[j]];
544              if((glbNZ < mbeg) || (glbNZ > mend)) {
545                 iodiagnz[i]++;
546              } else {
547                 idiagnz[i]++;
548              }
549            }
550            i++;
551           }
552         }
553         gcorp_t mind=1000;
554         gcorp_t mino=1000;
555         gcorp_t maxd=0;
556         gcorp_t maxo=0;
557         for(i=0;i<iownnodes;i++) {
558            mind=min(mind,idiagnz[i]);
559            mino=min(mino,iodiagnz[i]);
560            maxd=max(maxd,idiagnz[i]);
561            maxo=max(maxo,iodiagnz[i]);
562         }
563 
564         for(i=0;i<iownnodes;i++) {
565            iodiagnz[i]=1.3*maxd;
566            idiagnz[i]=1.3*maxd;
567         }
568 
569 
570 
571        }
572        if(firstpetsccalls == 1) {
573 
574         petsc_m  = (PetscInt) iownnodes;
575         petsc_M  = (PetscInt) nshgt;
576         petsc_PA  = (PetscInt) 40;
577 
578         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
579                              0, idiagnz, 0, iodiagnz, &lhsPs);
580 
581         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
582 
583 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
584         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
585         ierr = MatSetUp(lhsPs);
586 
587       PetscInt myMatStart, myMatEnd;
588       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
589       }
590 //      MPI_Barrier(MPI_COMM_WORLD);
591  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
592       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
593 
594       get_time(duration, (duration+1));
595 
596 //      MPI_Barrier(MPI_COMM_WORLD);
597  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
598 //      for (i=0; i<nshg ; i++) {
599 //            assert(fncorp[i]>0);
600 //      }
601 
602       ElmGMRPETScSclr(y,             ac,            x, elDw,
603                   shp,           shgl,          iBC,
604                   BC,            shpb,
605                   shglb,         res,
606                   rmes,                   iper,
607                   ilwork,        &lhsPs);
608        if(firstpetsccalls == 1) {
609       // Setup IndexSet. For now, we mimic vector insertion procedure
610       // Since we always reference by global indexes this doesn't matter
611       // except for cache performance)
612       // TODO: Better arrangment?
613         nodetoinsert = 0;
614         k=0;
615         if(workfc.numpe > 1) {
616           for (i=0; i<nshg ; i++) {
617             nodetoinsert = fncorp[i]-1;
618             indexsetarys[k] = nodetoinsert;
619             k = k+1;
620           }
621         }
622         else {
623           for (i=0; i<nshg ; i++) {
624             nodetoinsert = i;
625             indexsetarys[k] = nodetoinsert;
626             k = k+1;
627           }
628         }
629 
630         for (i=0; i<nshg; i++) {
631           gblindexsetarys[i] = i ; //TODO: better option for performance?
632         }
633 //  Create Vector Index Maps
634         petsc_n  = (PetscInt) nshg;
635         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
636      	       PETSC_COPY_VALUES, &LocalIndexSets);
637         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
638                PETSC_COPY_VALUES, &GlobalIndexSets);
639         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
640         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
641       }
642       if(genpar.lhs ==1) {
643       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
644       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
645       }
646       if(firstpetsccalls == 1) {
647         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
648         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
649         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
650       }
651       ierr = VecZeroEntries(resPs);
652       ierr = VecZeroEntries(DyPs);
653       if(firstpetsccalls == 1) {
654         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
655         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
656       }
657         ierr = VecZeroEntries(resPGlobals);
658 
659       PetscRow=0;
660       k = 0;
661       int index;
662       for (i=0; i<nshg; i++ ){
663           valtoinsert = res[i];
664           if(workfc.numpe > 1) {
665             PetscRow = (fncorp[i]-1);
666           }
667           else {
668             PetscRow = i-1;
669           }
670           assert(fncorp[i]<=nshgt);
671           assert(fncorp[i]>0);
672           assert(PetscRow>=0);
673           assert(PetscRow<=nshgt);
674           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
675       }
676 //      printf("after VecSetValue loop %d\n",ierr);
677       ierr = VecAssemblyBegin(resPs);
678       ierr = VecAssemblyEnd(resPs);
679 
680       if(firstpetsccalls == 1) {
681         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
682         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
683 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
684         ierr = KSPGetPC(ksps, &pcs);
685         ierr = PCSetType(pcs, PCPBJACOBI);
686         PetscInt maxits;
687         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
688         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
689         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
690         ierr = KSPSetFromOptions(ksps);
691       }
692       ierr = KSPSolve(ksps, resPs, DyPs);
693       if(firstpetsccalls == 1) {
694       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
695       }
696       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
697       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
698       ierr = VecGetArray(DyPLocals, &parray);
699       PetscRow = 0;
700       for ( node = 0; node< nshg; node++) {
701           index = node;
702           Dy[index] = parray[PetscRow];
703           PetscRow = PetscRow+1;
704       }
705       ierr = VecRestoreArray(DyPLocals, &parray);
706 
707       firstpetsccalls = 0;
708 
709 // .... output the statistics
710 //
711 //      itrpar.iKss=0; // see rstat()
712       PetscInt its;
713       ierr = KSPGetIterationNumber(ksps, &its);
714       itrpar.iKss = (int) its;
715       itrpar.ntotGMs += (int) its;
716       rstatSclr (res, ilwork);
717 //
718 // .... end
719 //
720 }
721 
722