xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision 178603657237c122e977041f94e6448e6c2fef12)
1 #ifdef HAVE_PETSC
2 #include <stdio.h>
3 
4 #include "petscsys.h"
5 #include "petscvec.h"
6 #include "petscmat.h"
7 #include "petscpc.h"
8 #include "petscksp.h"
9 
10 #include "assert.h"
11 #include "common_c.h"
12 
13 #include <FCMangle.h>
14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT)
19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
20 #define min(a,b) (((a) < (b)) ? (a) : (b))
21 #define max(a,b) (((a) > (b)) ? (a) : (b))
22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
24 
25 typedef long long int gcorp_t;
26 
27 void get_time(uint64_t* rv, uint64_t* cycle);
28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
29 
30 //#include "auxmpi.h"
31 //      static PetscOffset poff;
32 
33       static Mat lhsP;
34       static PC pc;
35       static KSP ksp;
36       static Vec DyP, resP, DyPLocal, resPGlobal;
37       static PetscErrorCode ierr;
38       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
39       static IS LocalIndexSet, GlobalIndexSet;
40       static ISLocalToGlobalMapping VectorMapping;
41       static ISLocalToGlobalMapping GblVectorMapping;
42       static  VecScatter scatter7;
43       static int firstpetsccall = 1;
44 
45       static Mat lhsPs;
46       static PC pcs;
47       static KSP ksps;
48       static Vec DyPs, resPs, DyPLocals, resPGlobals;
49       static IS LocalIndexSets, GlobalIndexSets;
50       static ISLocalToGlobalMapping VectorMappings;
51       static ISLocalToGlobalMapping GblVectorMappings;
52       static VecScatter scatter7s;
53       static int firstpetsccalls = 1;
54 
55 void     SolGMRp(double* y,         double* ac,        double* yold,
56      	double* x,         int* iBC,       double* BC,
57      	int* col,       int* row,       double* lhsk,
58      	double* res,       double* BDiag,
59      	int* iper,
60      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
61      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
62 {
63 //
64 // ----------------------------------------------------------------------
65 //
66 //  This is the preconditioned GMRES driver routine.
67 //
68 // input:
69 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
70 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
71 //  yold   (nshg,ndof)           : Y-variables at beginning of step
72 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
73 //  x      (numnp,nsd)            : node coordinates
74 //  iBC    (nshg)                : BC codes
75 //  BC     (nshg,ndofBC)         : BC constraint parameters
76 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
77 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
78 //
79 // output:
80 //  res    (nshg,nflow)           : preconditioned residual
81 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
82 //
83 //
84 // Zdenek Johan,  Winter 1991.  (Fortran 90)
85 // ----------------------------------------------------------------------
86 //
87 
88 
89 // Get variables from common_c.h
90       int nshg, nflow, nsd, iownnodes;
91       nshg  = conpar.nshg;
92       nflow = conpar.nflow;
93       nsd = NSD;
94       iownnodes = conpar.iownnodes;
95 
96       gcorp_t nshgt;
97       gcorp_t mbeg;
98       gcorp_t mend;
99       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
100       mbeg = (gcorp_t) newdim.minowned;
101       mend = (gcorp_t) newdim.maxowned;
102 
103 
104       int node, element, var, eqn;
105       double valtoinsert;
106       int nenl, iel, lelCat, lcsyst, iorder, nshl;
107       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
108       double DyFlat[nshg*nflow];
109       double DyFlatPhasta[nshg*nflow];
110       double rmes[nshg*nflow];
111 // DEBUG
112       int i,j,k,l,m;
113       //int indexsetary[nflow*nshg];
114       //int gblindexsetary[nflow*nshg];
115       PetscInt indexsetary[nflow*nshg];
116       PetscInt gblindexsetary[nflow*nshg];
117       PetscInt nodetoinsert;
118 
119       // FIXME: PetscScalar
120       double  real_rtol, real_abstol, real_dtol;
121 // /DEBUG
122       //double parray[1]; // Should be a PetscScalar
123       double *parray; // Should be a PetscScalar
124       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
125       PetscInt petsc_n;
126       PetscOne = 1;
127       uint64_t duration[4];
128 
129 //      printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2));
130 
131        if(firstpetsccall == 1) {
132 /*         call get_time(duration(1),duration(2));
133          call system('sleep 10');
134          call get_time(duration(3),duration(4));
135          call get_max_time_diff(duration(1), duration(3),
136                               duration(2), duration(4),
137                               "TenSecondSleep " // char(0))
138          if(myrank .eq. 0) {
139            !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
140          }
141 */
142        }
143 //
144 //
145 // .... *******************>> Element Data Formation <<******************
146 //
147 //
148 // .... set the parameters for flux and surface tension calculations
149 //
150 //
151       genpar.idflx = 0 ;
152       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
153       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
154 
155 
156 
157       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
158       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
159       gcorp_t glbNZ;
160 
161       if(firstpetsccall == 1) {
162         for(i=0;i<iownnodes;i++) {
163              idiagnz[i]=0;
164              iodiagnz[i]=0;
165         }
166         i=0;
167         for(k=0;k<nshg;k++) {
168           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
169 // this node is not owned by this rank so we skip
170           } else {
171            for(j=col[i]-1;j<col[i+1]-1;j++) {
172              glbNZ=fncorp[row[j]];
173              if((glbNZ < mbeg) || (glbNZ > mend)) {
174                 iodiagnz[i]++;
175              } else {
176                 idiagnz[i]++;
177              }
178            }
179            i++;
180           }
181         }
182         gcorp_t mind=1000;
183         gcorp_t mino=1000;
184         gcorp_t maxd=0;
185         gcorp_t maxo=0;
186         for(i=0;i<iownnodes;i++) {
187            mind=min(mind,idiagnz[i]);
188            mino=min(mino,iodiagnz[i]);
189            maxd=max(maxd,idiagnz[i]);
190            maxo=max(maxo,iodiagnz[i]);
191 //           iodiagnz[i]=max(iodiagnz[i],10);
192 //           idiagnz[i]=max(idiagnz[i],10);
193 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
194 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
195         }
196 // the above was pretty good but below is faster and not too much more memory...of course once you do this
197 // could just use the constant fill parameters in create but keep it alive for potential later optimization
198 
199         for(i=0;i<iownnodes;i++) {
200            iodiagnz[i]=1.3*maxd;
201            idiagnz[i]=1.3*maxd;
202         }
203 
204 
205 
206         if(workfc.numpe < 200){
207           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
208           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
209         }
210         // Print debug info
211         if(nshgt < 200){
212         int irank;
213         for(irank=0;irank<workfc.numpe;irank++) {
214           if(irank == workfc.myrank){
215             printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
216             for(i=0;i<iownnodes;i++) {
217               printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
218             }
219           }
220          MPI_Barrier(MPI_COMM_WORLD);
221         }
222         }
223        }
224        if(firstpetsccall == 1) {
225 
226         petsc_bs = (PetscInt) nflow;
227         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
228         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
229         petsc_PA  = (PetscInt) 40;
230         // TODO: fixe nshgt for 64 bit integer!!!!!
231 
232         //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
233         //       nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,
234 
235 // this has no pre-allocate        ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
236 // this has no pre-allocate                             0, NULL, 0, NULL, &lhsP);
237 
238 // next 2 do pre-allocate
239 
240 //      MPI_Barrier(MPI_COMM_WORLD);
241 //      if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);
242 
243         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
244                              0, idiagnz, 0, iodiagnz, &lhsP);
245 
246 //                              petsc_PA, NULL, petsc_PA, NULL, &lhsP);
247 //      MPI_Barrier(MPI_COMM_WORLD);
248 //      if(workfc.myrank ==0) printf("Before matSetOoption 1  \n");
249         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
250 
251 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
252         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
253         ierr = MatSetUp(lhsP);
254 
255       PetscInt myMatStart, myMatEnd;
256       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
257       }
258 //      MPI_Barrier(MPI_COMM_WORLD);
259  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
260       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
261 
262       get_time(duration, (duration+1));
263 
264 //      MPI_Barrier(MPI_COMM_WORLD);
265  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
266 //      for (i=0; i<nshg ; i++) {
267 //            assert(fncorp[i]>0);
268 //      }
269 
270       ElmGMRPETSc(y,             ac,            x,
271                   shp,           shgl,          iBC,
272                   BC,            shpb,
273                   shglb,         res,
274                   rmes,                   iper,
275                   ilwork,        rerr ,         &lhsP);
276       get_time((duration+2), (duration+3));
277       get_max_time_diff((duration), (duration+2),
278                         (duration+1), (duration+3),
279                         "ElmGMRPETSc \0");  // char(0))
280 //       printf("after ElmGMRPETSc\n");
281        if(firstpetsccall == 1) {
282       // Setup IndexSet. For now, we mimic vector insertion procedure
283       // Since we always reference by global indexes this doesn't matter
284       // except for cache performance)
285       // TODO: Better arrangment?
286         nodetoinsert = 0;
287         k=0;
288         if(workfc.numpe > 1) {
289           for (i=0; i<nshg ; i++) {
290             nodetoinsert = fncorp[i]-1;
291 //            if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
292 //            assert(fncorp[i]>=0);
293             for (j=1; j<=nflow; j++) {
294               indexsetary[k] = nodetoinsert*nflow+(j-1);
295               assert(indexsetary[k]>=0);
296 //              assert(fncorp[i]>=0);
297               k = k+1;
298             }
299           }
300         }
301         else {
302           for (i=0; i<nshg ; i++) {
303             nodetoinsert = i;
304             for (j=1; j<=nflow; j++) {
305               indexsetary[k] = nodetoinsert*nflow+(j-1);
306               k = k+1;
307             }
308           }
309         }
310 
311         for (i=0; i<nshg*nflow; i++) {
312           gblindexsetary[i] = i ; //TODO: better option for performance?
313         }
314 //  Create Vector Index Maps
315         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
316         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
317      	//       PETSC_COPY_VALUES, &LocalIndexSet);
318         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
319      	       PETSC_COPY_VALUES, &LocalIndexSet);
320 //        printf("after 1st ISCreateGeneral %d\n", ierr);
321         //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
322         //       PETSC_COPY_VALUES, &GlobalIndexSet);
323         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
324                PETSC_COPY_VALUES, &GlobalIndexSet);
325 //        printf("after 2nd ISCreateGeneral %d\n", ierr);
326         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
327 //        printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
328         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
329 //        printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
330       }
331       if(genpar.lhs == 1) {
332       get_time((duration), (duration+1));
333       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
334       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
335       get_time((duration+2),(duration+3));
336       get_max_time_diff((duration), (duration+2),
337                         (duration+1), (duration+3),
338                         "MatAssembly \0"); // char(0))
339       get_time(duration, (duration+1));
340       }
341       if(firstpetsccall == 1) {
342         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
343 //  TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol
344         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
345         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
346         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
347         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
348       }
349       ierr = VecZeroEntries(resP);
350       ierr = VecZeroEntries(DyP);
351       if(firstpetsccall == 1) {
352         //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
353         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
354         //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
355         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
356       }
357         ierr = VecZeroEntries(resPGlobal);
358 //         call VecZeroEntries(DyPLocal, ierr)
359 
360       PetscRow=0;
361       k = 0;
362       int index;
363       for (i=0; i<nshg; i++ ){
364         for (j = 1; j<=nflow; j++){
365           index = i + (j-1)*nshg;
366           valtoinsert = res[index];
367           if(workfc.numpe > 1) {
368             PetscRow = (fncorp[i]-1)*nflow+(j-1);
369           }
370           else {
371             PetscRow = i*nflow+(j-1);
372           }
373           assert(fncorp[i]<=nshgt);
374           assert(fncorp[i]>0);
375           assert(PetscRow>=0);
376           assert(PetscRow<=nshgt*nflow);
377           ierr =  VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
378         }
379       }
380 //      printf("after VecSetValue loop %d\n",ierr);
381       ierr = VecAssemblyBegin(resP);
382       ierr = VecAssemblyEnd(resP);
383       get_time((duration+2), (duration+3));
384       get_max_time_diff((duration), (duration+2),
385                         (duration+1), (duration+3),
386                         "VectorWorkPre \0"); // char(0))
387 
388       get_time((duration),(duration+1));
389 
390       if(firstpetsccall == 1) {
391         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
392         ierr = KSPSetOperators(ksp, lhsP, lhsP);
393 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
394         ierr = KSPGetPC(ksp, &pc);
395         ierr = PCSetType(pc, PCPBJACOBI);
396         PetscInt maxits;
397         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
398         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
399         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
400         ierr = KSPSetFromOptions(ksp);
401       }
402       ierr = KSPSolve(ksp, resP, DyP);
403       get_time((duration+2),(duration+3));
404       get_max_time_diff((duration), (duration+2),
405                         (duration+1), (duration+3),
406                         "KSPSolve \0"); // char(0))
407      //if(myrank .eq. 0) then
408      //!write(*,"(A, E10.3)") "KSPSolve ", elapsed
409      //end if
410       get_time((duration),(duration+1));
411       if(firstpetsccall == 1) {
412       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
413       }
414       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
415       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
416       ierr = VecGetArray(DyPLocal, &parray);
417       PetscRow = 0;
418       for ( node = 0; node< nshg; node++) {
419         for (var = 1; var<= nflow; var++) {
420           index = node + (var-1)*nshg;
421           Dy[index] = parray[PetscRow];
422           PetscRow = PetscRow+1;
423         }
424       }
425       ierr = VecRestoreArray(DyPLocal, &parray);
426 
427       firstpetsccall = 0;
428       // later timer ('Solver  ');
429 
430 // .... output the statistics
431 //
432       itrpar.iKs=0; // see rstat()
433       PetscInt its;
434       //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
435       ierr = KSPGetIterationNumber(ksp, &its);
436       itrpar.iKs = (int) its;
437       get_time((duration+2),(duration+3));
438       get_max_time_diff((duration), (duration+2),
439                         (duration+1), (duration+3),
440                         "solWork \0"); // char(0))
441       itrpar.ntotGM += (int) its;
442       rstat (res, ilwork);
443 //
444 // .... stop the timer
445 //
446 // 3002 continue                  ! no solve just res.
447       // later call timer ('Back    ')
448 //
449 // .... end
450 //
451 }
452 
453 void     SolGMRpSclr(double* y,         double* ac,
454      	double* x,      double* elDw,   int* iBC,       double* BC,
455      	int* col,       int* row,
456      	int* iper,
457      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
458      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
459 {
460 //
461 // ----------------------------------------------------------------------
462 //
463 //  This is the preconditioned GMRES driver routine.
464 //
465 // input:
466 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
467 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
468 //  yold   (nshg,ndof)           : Y-variables at beginning of step
469 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
470 //  x      (numnp,nsd)            : node coordinates
471 //  iBC    (nshg)                : BC codes
472 //  BC     (nshg,ndofBC)         : BC constraint parameters
473 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
474 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
475 //
476 // ----------------------------------------------------------------------
477 //
478 
479 
480 // Get variables from common_c.h
481       int nshg, nflow, nsd, iownnodes;
482       nshg  = conpar.nshg;
483       nsd = NSD;
484       iownnodes = conpar.iownnodes;
485 
486       gcorp_t nshgt;
487       gcorp_t mbeg;
488       gcorp_t mend;
489       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
490       mbeg = (gcorp_t) newdim.minowned;
491       mend = (gcorp_t) newdim.maxowned;
492 
493 
494       int node, element, var, eqn;
495       double valtoinsert;
496       int nenl, iel, lelCat, lcsyst, iorder, nshl;
497       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
498       double DyFlats[nshg];
499       double DyFlatPhastas[nshg];
500       double rmes[nshg];
501 // DEBUG
502       int i,j,k,l,m;
503       PetscInt indexsetarys[nshg];
504       PetscInt gblindexsetarys[nshg];
505       PetscInt nodetoinsert;
506 
507       double  real_rtol, real_abstol, real_dtol;
508       double *parray;
509       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
510       PetscInt petsc_n;
511       PetscOne = 1;
512       uint64_t duration[4];
513 
514 
515 //
516 //
517 // .... *******************>> Element Data Formation <<******************
518 //
519 //
520 // .... set the parameters for flux and surface tension calculations
521 //
522 //
523       genpar.idflx = 0 ;
524       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
525       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
526 
527 
528 
529       PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
530       PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
531       gcorp_t glbNZ;
532 
533       if(firstpetsccalls == 1) {
534         for(i=0;i<iownnodes;i++) {
535              idiagnz[i]=0;
536              iodiagnz[i]=0;
537         }
538         i=0;
539         for(k=0;k<nshg;k++) {
540           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
541 // this node is not owned by this rank so we skip
542           } else {
543            for(j=col[i]-1;j<col[i+1]-1;j++) {
544              glbNZ=fncorp[row[j]];
545              if((glbNZ < mbeg) || (glbNZ > mend)) {
546                 iodiagnz[i]++;
547              } else {
548                 idiagnz[i]++;
549              }
550            }
551            i++;
552           }
553         }
554         gcorp_t mind=1000;
555         gcorp_t mino=1000;
556         gcorp_t maxd=0;
557         gcorp_t maxo=0;
558         for(i=0;i<iownnodes;i++) {
559            mind=min(mind,idiagnz[i]);
560            mino=min(mino,iodiagnz[i]);
561            maxd=max(maxd,idiagnz[i]);
562            maxo=max(maxo,iodiagnz[i]);
563         }
564 
565         for(i=0;i<iownnodes;i++) {
566            iodiagnz[i]=1.3*maxd;
567            idiagnz[i]=1.3*maxd;
568         }
569 
570 
571 
572        }
573        if(firstpetsccalls == 1) {
574 
575         petsc_m  = (PetscInt) iownnodes;
576         petsc_M  = (PetscInt) nshgt;
577         petsc_PA  = (PetscInt) 40;
578 
579         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
580                              0, idiagnz, 0, iodiagnz, &lhsPs);
581 
582         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
583 
584 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
585         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
586         ierr = MatSetUp(lhsPs);
587 
588       PetscInt myMatStart, myMatEnd;
589       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
590       }
591 //      MPI_Barrier(MPI_COMM_WORLD);
592  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
593       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
594 
595       get_time(duration, (duration+1));
596 
597 //      MPI_Barrier(MPI_COMM_WORLD);
598  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
599 //      for (i=0; i<nshg ; i++) {
600 //            assert(fncorp[i]>0);
601 //      }
602 
603       ElmGMRPETScSclr(y,             ac,            x, elDw,
604                   shp,           shgl,          iBC,
605                   BC,            shpb,
606                   shglb,         res,
607                   rmes,                   iper,
608                   ilwork,        &lhsPs);
609        if(firstpetsccalls == 1) {
610       // Setup IndexSet. For now, we mimic vector insertion procedure
611       // Since we always reference by global indexes this doesn't matter
612       // except for cache performance)
613       // TODO: Better arrangment?
614         nodetoinsert = 0;
615         k=0;
616         if(workfc.numpe > 1) {
617           for (i=0; i<nshg ; i++) {
618             nodetoinsert = fncorp[i]-1;
619             indexsetarys[k] = nodetoinsert;
620             k = k+1;
621           }
622         }
623         else {
624           for (i=0; i<nshg ; i++) {
625             nodetoinsert = i;
626             indexsetarys[k] = nodetoinsert;
627             k = k+1;
628           }
629         }
630 
631         for (i=0; i<nshg; i++) {
632           gblindexsetarys[i] = i ; //TODO: better option for performance?
633         }
634 //  Create Vector Index Maps
635         petsc_n  = (PetscInt) nshg;
636         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
637      	       PETSC_COPY_VALUES, &LocalIndexSets);
638         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
639                PETSC_COPY_VALUES, &GlobalIndexSets);
640         ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
641         ierr =  ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
642       }
643       if(genpar.lhs ==1) {
644       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
645       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
646       }
647       if(firstpetsccalls == 1) {
648         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
649         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
650         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
651       }
652       ierr = VecZeroEntries(resPs);
653       ierr = VecZeroEntries(DyPs);
654       if(firstpetsccalls == 1) {
655         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
656         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
657       }
658         ierr = VecZeroEntries(resPGlobals);
659 
660       PetscRow=0;
661       k = 0;
662       int index;
663       for (i=0; i<nshg; i++ ){
664           valtoinsert = res[i];
665           if(workfc.numpe > 1) {
666             PetscRow = (fncorp[i]-1);
667           }
668           else {
669             PetscRow = i-1;
670           }
671           assert(fncorp[i]<=nshgt);
672           assert(fncorp[i]>0);
673           assert(PetscRow>=0);
674           assert(PetscRow<=nshgt);
675           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
676       }
677 //      printf("after VecSetValue loop %d\n",ierr);
678       ierr = VecAssemblyBegin(resPs);
679       ierr = VecAssemblyEnd(resPs);
680 
681       if(firstpetsccalls == 1) {
682         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
683         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
684 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
685         ierr = KSPGetPC(ksps, &pcs);
686         ierr = PCSetType(pcs, PCPBJACOBI);
687         PetscInt maxits;
688         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
689         //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
690         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
691         ierr = KSPSetFromOptions(ksps);
692       }
693       ierr = KSPSolve(ksps, resPs, DyPs);
694       if(firstpetsccalls == 1) {
695       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
696       }
697       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
698       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
699       ierr = VecGetArray(DyPLocals, &parray);
700       PetscRow = 0;
701       for ( node = 0; node< nshg; node++) {
702           index = node;
703           Dy[index] = parray[PetscRow];
704           PetscRow = PetscRow+1;
705       }
706       ierr = VecRestoreArray(DyPLocals, &parray);
707 
708       firstpetsccalls = 0;
709 
710 // .... output the statistics
711 //
712 //      itrpar.iKss=0; // see rstat()
713       PetscInt its;
714       ierr = KSPGetIterationNumber(ksps, &its);
715       itrpar.iKss = (int) its;
716       itrpar.ntotGMs += (int) its;
717       rstatSclr (res, ilwork);
718 //
719 // .... end
720 //
721 }
722 #endif
723