1 #ifdef HAVE_PETSC 2 #include <stdio.h> 3 4 #include "petscsys.h" 5 #include "petscvec.h" 6 #include "petscmat.h" 7 #include "petscpc.h" 8 #include "petscksp.h" 9 10 #include "assert.h" 11 #include "common_c.h" 12 13 #include <FCMangle.h> 14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP) 15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR) 16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC) 17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR) 18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT) 19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR) 20 #define min(a,b) (((a) < (b)) ? (a) : (b)) 21 #define max(a,b) (((a) > (b)) ? (a) : (b)) 22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME) 23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF) 24 25 typedef long long int gcorp_t; 26 27 void get_time(uint64_t* rv, uint64_t* cycle); 28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl); 29 30 //#include "auxmpi.h" 31 // static PetscOffset poff; 32 33 static Mat lhsP; 34 static PC pc; 35 static KSP ksp; 36 static Vec DyP, resP, DyPLocal, resPGlobal; 37 static PetscErrorCode ierr; 38 static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol; 39 static IS LocalIndexSet, GlobalIndexSet; 40 static ISLocalToGlobalMapping VectorMapping; 41 static ISLocalToGlobalMapping GblVectorMapping; 42 static VecScatter scatter7; 43 static int firstpetsccall = 1; 44 45 static Mat lhsPs; 46 static PC pcs; 47 static KSP ksps; 48 static Vec DyPs, resPs, DyPLocals, resPGlobals; 49 static IS LocalIndexSets, GlobalIndexSets; 50 static ISLocalToGlobalMapping VectorMappings; 51 static ISLocalToGlobalMapping GblVectorMappings; 52 static VecScatter scatter7s; 53 static int firstpetsccalls = 1; 54 55 void SolGMRp(double* y, double* ac, double* yold, 56 double* x, int* iBC, double* BC, 57 int* col, int* row, double* lhsk, 58 double* res, double* BDiag, 59 int* iper, 60 int* ilwork, double* shp, double* shgl, double* shpb, 61 double* shglb, double* Dy, double* rerr, long long int* fncorp) 62 { 63 // 64 // ---------------------------------------------------------------------- 65 // 66 // This is the preconditioned GMRES driver routine. 67 // 68 // input: 69 // y (nshg,ndof) : Y-variables at n+alpha_v 70 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 71 // yold (nshg,ndof) : Y-variables at beginning of step 72 // acold (nshg,ndof) : Primvar. accel. variable at begng step 73 // x (numnp,nsd) : node coordinates 74 // iBC (nshg) : BC codes 75 // BC (nshg,ndofBC) : BC constraint parameters 76 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 77 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 78 // 79 // output: 80 // res (nshg,nflow) : preconditioned residual 81 // BDiag (nshg,nflow,nflow) : block-diagonal preconditioner 82 // 83 // 84 // Zdenek Johan, Winter 1991. (Fortran 90) 85 // ---------------------------------------------------------------------- 86 // 87 88 89 // Get variables from common_c.h 90 int nshg, nflow, nsd, iownnodes; 91 nshg = conpar.nshg; 92 nflow = conpar.nflow; 93 nsd = NSD; 94 iownnodes = conpar.iownnodes; 95 96 gcorp_t nshgt; 97 gcorp_t mbeg; 98 gcorp_t mend; 99 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 100 mbeg = (gcorp_t) newdim.minowned; 101 mend = (gcorp_t) newdim.maxowned; 102 103 104 int node, element, var, eqn; 105 double valtoinsert; 106 int nenl, iel, lelCat, lcsyst, iorder, nshl; 107 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 108 double DyFlat[nshg*nflow]; 109 double DyFlatPhasta[nshg*nflow]; 110 double rmes[nshg*nflow]; 111 // DEBUG 112 int i,j,k,l,m; 113 //int indexsetary[nflow*nshg]; 114 //int gblindexsetary[nflow*nshg]; 115 PetscInt indexsetary[nflow*nshg]; 116 PetscInt gblindexsetary[nflow*nshg]; 117 PetscInt nodetoinsert; 118 119 // FIXME: PetscScalar 120 double real_rtol, real_abstol, real_dtol; 121 // /DEBUG 122 //double parray[1]; // Should be a PetscScalar 123 double *parray; // Should be a PetscScalar 124 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 125 PetscInt petsc_n; 126 PetscOne = 1; 127 uint64_t duration[4]; 128 129 // printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2)); 130 131 if(firstpetsccall == 1) { 132 /* call get_time(duration(1),duration(2)); 133 call system('sleep 10'); 134 call get_time(duration(3),duration(4)); 135 call get_max_time_diff(duration(1), duration(3), 136 duration(2), duration(4), 137 "TenSecondSleep " // char(0)) 138 if(myrank .eq. 0) { 139 !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed 140 } 141 */ 142 } 143 // 144 // 145 // .... *******************>> Element Data Formation <<****************** 146 // 147 // 148 // .... set the parameters for flux and surface tension calculations 149 // 150 // 151 genpar.idflx = 0 ; 152 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 153 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 154 155 156 157 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 158 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 159 gcorp_t glbNZ; 160 161 if(firstpetsccall == 1) { 162 for(i=0;i<iownnodes;i++) { 163 idiagnz[i]=0; 164 iodiagnz[i]=0; 165 } 166 i=0; 167 for(k=0;k<nshg;k++) { 168 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 169 // this node is not owned by this rank so we skip 170 } else { 171 for(j=col[i]-1;j<col[i+1]-1;j++) { 172 glbNZ=fncorp[row[j]]; 173 if((glbNZ < mbeg) || (glbNZ > mend)) { 174 iodiagnz[i]++; 175 } else { 176 idiagnz[i]++; 177 } 178 } 179 i++; 180 } 181 } 182 gcorp_t mind=1000; 183 gcorp_t mino=1000; 184 gcorp_t maxd=0; 185 gcorp_t maxo=0; 186 for(i=0;i<iownnodes;i++) { 187 mind=min(mind,idiagnz[i]); 188 mino=min(mino,iodiagnz[i]); 189 maxd=max(maxd,idiagnz[i]); 190 maxo=max(maxo,iodiagnz[i]); 191 // iodiagnz[i]=max(iodiagnz[i],10); 192 // idiagnz[i]=max(idiagnz[i],10); 193 // iodiagnz[i]=2*iodiagnz[i]; // estimate a bit higher for off-part interactions 194 // idiagnz[i]=2*idiagnz[i]; // estimate a bit higher for off-part interactions 195 } 196 // the above was pretty good but below is faster and not too much more memory...of course once you do this 197 // could just use the constant fill parameters in create but keep it alive for potential later optimization 198 199 for(i=0;i<iownnodes;i++) { 200 iodiagnz[i]=1.3*maxd; 201 idiagnz[i]=1.3*maxd; 202 } 203 204 205 206 if(workfc.numpe < 200){ 207 printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg); 208 printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo); 209 } 210 // Print debug info 211 if(nshgt < 200){ 212 int irank; 213 for(irank=0;irank<workfc.numpe;irank++) { 214 if(irank == workfc.myrank){ 215 printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank); 216 for(i=0;i<iownnodes;i++) { 217 printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]); 218 } 219 } 220 MPI_Barrier(MPI_COMM_WORLD); 221 } 222 } 223 } 224 if(firstpetsccall == 1) { 225 226 petsc_bs = (PetscInt) nflow; 227 petsc_m = (PetscInt) nflow* (PetscInt) iownnodes; 228 petsc_M = (PetscInt) nshgt * (PetscInt) nflow; 229 petsc_PA = (PetscInt) 40; 230 // TODO: fixe nshgt for 64 bit integer!!!!! 231 232 //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes, 233 // nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0, 234 235 // this has no pre-allocate ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 236 // this has no pre-allocate 0, NULL, 0, NULL, &lhsP); 237 238 // next 2 do pre-allocate 239 240 // MPI_Barrier(MPI_COMM_WORLD); 241 // if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA); 242 243 ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 244 0, idiagnz, 0, iodiagnz, &lhsP); 245 246 // petsc_PA, NULL, petsc_PA, NULL, &lhsP); 247 // MPI_Barrier(MPI_COMM_WORLD); 248 // if(workfc.myrank ==0) printf("Before matSetOoption 1 \n"); 249 ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 250 251 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 252 ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 253 ierr = MatSetUp(lhsP); 254 255 PetscInt myMatStart, myMatEnd; 256 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 257 } 258 // MPI_Barrier(MPI_COMM_WORLD); 259 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 260 if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP); 261 262 get_time(duration, (duration+1)); 263 264 // MPI_Barrier(MPI_COMM_WORLD); 265 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 266 // for (i=0; i<nshg ; i++) { 267 // assert(fncorp[i]>0); 268 // } 269 270 ElmGMRPETSc(y, ac, x, 271 shp, shgl, iBC, 272 BC, shpb, 273 shglb, res, 274 rmes, iper, 275 ilwork, rerr , &lhsP); 276 get_time((duration+2), (duration+3)); 277 get_max_time_diff((duration), (duration+2), 278 (duration+1), (duration+3), 279 "ElmGMRPETSc \0"); // char(0)) 280 // printf("after ElmGMRPETSc\n"); 281 if(firstpetsccall == 1) { 282 // Setup IndexSet. For now, we mimic vector insertion procedure 283 // Since we always reference by global indexes this doesn't matter 284 // except for cache performance) 285 // TODO: Better arrangment? 286 nodetoinsert = 0; 287 k=0; 288 if(workfc.numpe > 1) { 289 for (i=0; i<nshg ; i++) { 290 nodetoinsert = fncorp[i]-1; 291 // if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert); 292 // assert(fncorp[i]>=0); 293 for (j=1; j<=nflow; j++) { 294 indexsetary[k] = nodetoinsert*nflow+(j-1); 295 assert(indexsetary[k]>=0); 296 // assert(fncorp[i]>=0); 297 k = k+1; 298 } 299 } 300 } 301 else { 302 for (i=0; i<nshg ; i++) { 303 nodetoinsert = i; 304 for (j=1; j<=nflow; j++) { 305 indexsetary[k] = nodetoinsert*nflow+(j-1); 306 k = k+1; 307 } 308 } 309 } 310 311 for (i=0; i<nshg*nflow; i++) { 312 gblindexsetary[i] = i ; //TODO: better option for performance? 313 } 314 // Create Vector Index Maps 315 petsc_n = (PetscInt) nshg * (PetscInt) nflow; 316 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary, 317 // PETSC_COPY_VALUES, &LocalIndexSet); 318 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary, 319 PETSC_COPY_VALUES, &LocalIndexSet); 320 // printf("after 1st ISCreateGeneral %d\n", ierr); 321 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary, 322 // PETSC_COPY_VALUES, &GlobalIndexSet); 323 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary, 324 PETSC_COPY_VALUES, &GlobalIndexSet); 325 // printf("after 2nd ISCreateGeneral %d\n", ierr); 326 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping); 327 // printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr); 328 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping); 329 // printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr); 330 } 331 if(genpar.lhs == 1) { 332 get_time((duration), (duration+1)); 333 ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY); 334 ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY); 335 get_time((duration+2),(duration+3)); 336 get_max_time_diff((duration), (duration+2), 337 (duration+1), (duration+3), 338 "MatAssembly \0"); // char(0)) 339 get_time(duration, (duration+1)); 340 } 341 if(firstpetsccall == 1) { 342 ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol); 343 // TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol 344 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP); 345 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP); 346 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP); 347 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP); 348 } 349 ierr = VecZeroEntries(resP); 350 ierr = VecZeroEntries(DyP); 351 if(firstpetsccall == 1) { 352 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal); 353 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal); 354 //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal); 355 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal); 356 } 357 ierr = VecZeroEntries(resPGlobal); 358 // call VecZeroEntries(DyPLocal, ierr) 359 360 PetscRow=0; 361 k = 0; 362 int index; 363 for (i=0; i<nshg; i++ ){ 364 for (j = 1; j<=nflow; j++){ 365 index = i + (j-1)*nshg; 366 valtoinsert = res[index]; 367 if(workfc.numpe > 1) { 368 PetscRow = (fncorp[i]-1)*nflow+(j-1); 369 } 370 else { 371 PetscRow = i*nflow+(j-1); 372 } 373 assert(fncorp[i]<=nshgt); 374 assert(fncorp[i]>0); 375 assert(PetscRow>=0); 376 assert(PetscRow<=nshgt*nflow); 377 ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES); 378 } 379 } 380 // printf("after VecSetValue loop %d\n",ierr); 381 ierr = VecAssemblyBegin(resP); 382 ierr = VecAssemblyEnd(resP); 383 get_time((duration+2), (duration+3)); 384 get_max_time_diff((duration), (duration+2), 385 (duration+1), (duration+3), 386 "VectorWorkPre \0"); // char(0)) 387 388 get_time((duration),(duration+1)); 389 390 if(firstpetsccall == 1) { 391 ierr = KSPCreate(PETSC_COMM_WORLD, &ksp); 392 ierr = KSPSetOperators(ksp, lhsP, lhsP); 393 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN); 394 ierr = KSPGetPC(ksp, &pc); 395 ierr = PCSetType(pc, PCPBJACOBI); 396 PetscInt maxits; 397 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 398 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 399 ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 400 ierr = KSPSetFromOptions(ksp); 401 } 402 ierr = KSPSolve(ksp, resP, DyP); 403 get_time((duration+2),(duration+3)); 404 get_max_time_diff((duration), (duration+2), 405 (duration+1), (duration+3), 406 "KSPSolve \0"); // char(0)) 407 //if(myrank .eq. 0) then 408 //!write(*,"(A, E10.3)") "KSPSolve ", elapsed 409 //end if 410 get_time((duration),(duration+1)); 411 if(firstpetsccall == 1) { 412 ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7); 413 } 414 ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 415 ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 416 ierr = VecGetArray(DyPLocal, &parray); 417 PetscRow = 0; 418 for ( node = 0; node< nshg; node++) { 419 for (var = 1; var<= nflow; var++) { 420 index = node + (var-1)*nshg; 421 Dy[index] = parray[PetscRow]; 422 PetscRow = PetscRow+1; 423 } 424 } 425 ierr = VecRestoreArray(DyPLocal, &parray); 426 427 firstpetsccall = 0; 428 // later timer ('Solver '); 429 430 // .... output the statistics 431 // 432 itrpar.iKs=0; // see rstat() 433 PetscInt its; 434 //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs); 435 ierr = KSPGetIterationNumber(ksp, &its); 436 itrpar.iKs = (int) its; 437 get_time((duration+2),(duration+3)); 438 get_max_time_diff((duration), (duration+2), 439 (duration+1), (duration+3), 440 "solWork \0"); // char(0)) 441 itrpar.ntotGM += (int) its; 442 rstat (res, ilwork); 443 // 444 // .... stop the timer 445 // 446 // 3002 continue ! no solve just res. 447 // later call timer ('Back ') 448 // 449 // .... end 450 // 451 } 452 453 void SolGMRpSclr(double* y, double* ac, 454 double* x, double* elDw, int* iBC, double* BC, 455 int* col, int* row, 456 int* iper, 457 int* ilwork, double* shp, double* shgl, double* shpb, 458 double* shglb, double* res, double* Dy, long long int* fncorp) 459 { 460 // 461 // ---------------------------------------------------------------------- 462 // 463 // This is the preconditioned GMRES driver routine. 464 // 465 // input: 466 // y (nshg,ndof) : Y-variables at n+alpha_v 467 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 468 // yold (nshg,ndof) : Y-variables at beginning of step 469 // acold (nshg,ndof) : Primvar. accel. variable at begng step 470 // x (numnp,nsd) : node coordinates 471 // iBC (nshg) : BC codes 472 // BC (nshg,ndofBC) : BC constraint parameters 473 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 474 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 475 // 476 // ---------------------------------------------------------------------- 477 // 478 479 480 // Get variables from common_c.h 481 int nshg, nflow, nsd, iownnodes; 482 nshg = conpar.nshg; 483 nsd = NSD; 484 iownnodes = conpar.iownnodes; 485 486 gcorp_t nshgt; 487 gcorp_t mbeg; 488 gcorp_t mend; 489 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 490 mbeg = (gcorp_t) newdim.minowned; 491 mend = (gcorp_t) newdim.maxowned; 492 493 494 int node, element, var, eqn; 495 double valtoinsert; 496 int nenl, iel, lelCat, lcsyst, iorder, nshl; 497 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 498 double DyFlats[nshg]; 499 double DyFlatPhastas[nshg]; 500 double rmes[nshg]; 501 // DEBUG 502 int i,j,k,l,m; 503 PetscInt indexsetarys[nshg]; 504 PetscInt gblindexsetarys[nshg]; 505 PetscInt nodetoinsert; 506 507 double real_rtol, real_abstol, real_dtol; 508 double *parray; 509 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 510 PetscInt petsc_n; 511 PetscOne = 1; 512 uint64_t duration[4]; 513 514 515 // 516 // 517 // .... *******************>> Element Data Formation <<****************** 518 // 519 // 520 // .... set the parameters for flux and surface tension calculations 521 // 522 // 523 genpar.idflx = 0 ; 524 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 525 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 526 527 528 529 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 530 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 531 gcorp_t glbNZ; 532 533 if(firstpetsccalls == 1) { 534 for(i=0;i<iownnodes;i++) { 535 idiagnz[i]=0; 536 iodiagnz[i]=0; 537 } 538 i=0; 539 for(k=0;k<nshg;k++) { 540 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 541 // this node is not owned by this rank so we skip 542 } else { 543 for(j=col[i]-1;j<col[i+1]-1;j++) { 544 glbNZ=fncorp[row[j]]; 545 if((glbNZ < mbeg) || (glbNZ > mend)) { 546 iodiagnz[i]++; 547 } else { 548 idiagnz[i]++; 549 } 550 } 551 i++; 552 } 553 } 554 gcorp_t mind=1000; 555 gcorp_t mino=1000; 556 gcorp_t maxd=0; 557 gcorp_t maxo=0; 558 for(i=0;i<iownnodes;i++) { 559 mind=min(mind,idiagnz[i]); 560 mino=min(mino,iodiagnz[i]); 561 maxd=max(maxd,idiagnz[i]); 562 maxo=max(maxo,iodiagnz[i]); 563 } 564 565 for(i=0;i<iownnodes;i++) { 566 iodiagnz[i]=1.3*maxd; 567 idiagnz[i]=1.3*maxd; 568 } 569 570 571 572 } 573 if(firstpetsccalls == 1) { 574 575 petsc_m = (PetscInt) iownnodes; 576 petsc_M = (PetscInt) nshgt; 577 petsc_PA = (PetscInt) 40; 578 579 ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M, 580 0, idiagnz, 0, iodiagnz, &lhsPs); 581 582 ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 583 584 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 585 ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 586 ierr = MatSetUp(lhsPs); 587 588 PetscInt myMatStart, myMatEnd; 589 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 590 } 591 // MPI_Barrier(MPI_COMM_WORLD); 592 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 593 if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs); 594 595 get_time(duration, (duration+1)); 596 597 // MPI_Barrier(MPI_COMM_WORLD); 598 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 599 // for (i=0; i<nshg ; i++) { 600 // assert(fncorp[i]>0); 601 // } 602 603 ElmGMRPETScSclr(y, ac, x, elDw, 604 shp, shgl, iBC, 605 BC, shpb, 606 shglb, res, 607 rmes, iper, 608 ilwork, &lhsPs); 609 if(firstpetsccalls == 1) { 610 // Setup IndexSet. For now, we mimic vector insertion procedure 611 // Since we always reference by global indexes this doesn't matter 612 // except for cache performance) 613 // TODO: Better arrangment? 614 nodetoinsert = 0; 615 k=0; 616 if(workfc.numpe > 1) { 617 for (i=0; i<nshg ; i++) { 618 nodetoinsert = fncorp[i]-1; 619 indexsetarys[k] = nodetoinsert; 620 k = k+1; 621 } 622 } 623 else { 624 for (i=0; i<nshg ; i++) { 625 nodetoinsert = i; 626 indexsetarys[k] = nodetoinsert; 627 k = k+1; 628 } 629 } 630 631 for (i=0; i<nshg; i++) { 632 gblindexsetarys[i] = i ; //TODO: better option for performance? 633 } 634 // Create Vector Index Maps 635 petsc_n = (PetscInt) nshg; 636 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys, 637 PETSC_COPY_VALUES, &LocalIndexSets); 638 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys, 639 PETSC_COPY_VALUES, &GlobalIndexSets); 640 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings); 641 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings); 642 } 643 if(genpar.lhs ==1) { 644 ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY); 645 ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY); 646 } 647 if(firstpetsccalls == 1) { 648 ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol); 649 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs); 650 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs); 651 } 652 ierr = VecZeroEntries(resPs); 653 ierr = VecZeroEntries(DyPs); 654 if(firstpetsccalls == 1) { 655 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals); 656 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals); 657 } 658 ierr = VecZeroEntries(resPGlobals); 659 660 PetscRow=0; 661 k = 0; 662 int index; 663 for (i=0; i<nshg; i++ ){ 664 valtoinsert = res[i]; 665 if(workfc.numpe > 1) { 666 PetscRow = (fncorp[i]-1); 667 } 668 else { 669 PetscRow = i-1; 670 } 671 assert(fncorp[i]<=nshgt); 672 assert(fncorp[i]>0); 673 assert(PetscRow>=0); 674 assert(PetscRow<=nshgt); 675 ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES); 676 } 677 // printf("after VecSetValue loop %d\n",ierr); 678 ierr = VecAssemblyBegin(resPs); 679 ierr = VecAssemblyEnd(resPs); 680 681 if(firstpetsccalls == 1) { 682 ierr = KSPCreate(PETSC_COMM_WORLD, &ksps); 683 ierr = KSPSetOperators(ksps, lhsPs, lhsPs); 684 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN); 685 ierr = KSPGetPC(ksps, &pcs); 686 ierr = PCSetType(pcs, PCPBJACOBI); 687 PetscInt maxits; 688 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 689 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 690 ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 691 ierr = KSPSetFromOptions(ksps); 692 } 693 ierr = KSPSolve(ksps, resPs, DyPs); 694 if(firstpetsccalls == 1) { 695 ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s); 696 } 697 ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 698 ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 699 ierr = VecGetArray(DyPLocals, &parray); 700 PetscRow = 0; 701 for ( node = 0; node< nshg; node++) { 702 index = node; 703 Dy[index] = parray[PetscRow]; 704 PetscRow = PetscRow+1; 705 } 706 ierr = VecRestoreArray(DyPLocals, &parray); 707 708 firstpetsccalls = 0; 709 710 // .... output the statistics 711 // 712 // itrpar.iKss=0; // see rstat() 713 PetscInt its; 714 ierr = KSPGetIterationNumber(ksps, &its); 715 itrpar.iKss = (int) its; 716 itrpar.ntotGMs += (int) its; 717 rstatSclr (res, ilwork); 718 // 719 // .... end 720 // 721 } 722 #endif 723