#ifdef HAVE_PETSC
#include <stdio.h>
#include <stdlib.h>   /* malloc/free */
#include <stdint.h>   /* uint64_t */
#include <assert.h>

#include "petscsys.h"
#include "petscvec.h"
#include "petscmat.h"
#include "petscpc.h"
#include "petscksp.h"

#include "common_c.h"

#include <FCMangle.h>
#define SolGMRp         FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
#define SolGMRpSclr     FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
#define ElmGMRPETSc     FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
#define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
#define rstat           FortranCInterface_GLOBAL_(rstat, RSTAT)
#define rstatSclr       FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR)
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))
#define get_time          FortranCInterface_GLOBAL_(get_time,GET_TIME)
#define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)

/* 64-bit global (compact) node id */
typedef long long int gcorp_t;

void get_time(uint64_t* rv, uint64_t* cycle);
void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first,
                       uint64_t* c_last, char* lbl);

//#include "auxmpi.h"
// static PetscOffset poff;

/* PETSc state for the flow (nflow unknowns per node) solve; kept across calls */
static Mat lhsP;
static PC pc;
static KSP ksp;
static Vec DyP, resP, DyPLocal, resPGlobal;
static PetscErrorCode ierr;
static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
static IS LocalIndexSet, GlobalIndexSet;
static ISLocalToGlobalMapping VectorMapping;
static ISLocalToGlobalMapping GblVectorMapping;
static VecScatter scatter7;
static int firstpetsccall = 1;

/* PETSc state for the scalar (one unknown per node) solve; kept across calls */
static Mat lhsPs;
static PC pcs;
static KSP ksps;
static Vec DyPs, resPs, DyPLocals, resPGlobals;
static IS LocalIndexSets, GlobalIndexSets;
static ISLocalToGlobalMapping VectorMappings;
static ISLocalToGlobalMapping GblVectorMappings;
static VecScatter scatter7s;
static int firstpetsccalls = 1;
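/*
 * The FCMangle macros above rename each C entry point to the symbol the
 * Fortran compiler expects, so these routines can be called directly from
 * the Fortran side of PHASTA.  A minimal sketch of the pattern (illustrative
 * only, not compiled; "demoadd" is a hypothetical routine, and the expansion
 * shown assumes the common lowercase-with-trailing-underscore mangling):
 */
#if 0
#define DemoAdd FortranCInterface_GLOBAL_(demoadd, DEMOADD) /* e.g. demoadd_ */
void DemoAdd(double* x) /* Fortran side: call demoadd(x) */
{
  *x += 1.0;  /* every Fortran argument arrives by reference */
}
#endif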
void SolGMRp(double* y, double* ac, double* yold,
             double* x, int* iBC, double* BC,
             int* col, int* row, double* lhsk,
             double* res, double* BDiag,
             int* iper,
             int* ilwork, double* shp, double* shgl, double* shpb,
             double* shglb, double* Dy, double* rerr, long long int* fncorp)
{
  //
  // ----------------------------------------------------------------------
  //
  //  This is the preconditioned GMRES driver routine.
  //
  //  input:
  //   y      (nshg,ndof)            : Y-variables at n+alpha_v
  //   ac     (nshg,ndof)            : Primvar. accel. variable at n+alpha_m
  //   yold   (nshg,ndof)            : Y-variables at beginning of step
  //   acold  (nshg,ndof)            : Primvar. accel. variable at beginning of step
  //   x      (numnp,nsd)            : node coordinates
  //   iBC    (nshg)                 : BC codes
  //   BC     (nshg,ndofBC)          : BC constraint parameters
  //   shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
  //   shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
  //
  //  output:
  //   res    (nshg,nflow)           : preconditioned residual
  //   BDiag  (nshg,nflow,nflow)     : block-diagonal preconditioner
  //
  //  Zdenek Johan, Winter 1991. (Fortran 90)
  // ----------------------------------------------------------------------
  //

  // Get variables from common_c.h
  int nshg, nflow, nsd, iownnodes;
  nshg = conpar.nshg;
  nflow = conpar.nflow;
  nsd = NSD;
  iownnodes = conpar.iownnodes;

  gcorp_t nshgt;
  gcorp_t mbeg;
  gcorp_t mend;
  nshgt = (gcorp_t) newdim.nshgt; // Fix nshgt in common_c.h
  mbeg = (gcorp_t) newdim.minowned;
  mend = (gcorp_t) newdim.maxowned;

  int node, element, var, eqn;
  double valtoinsert;
  int nenl, iel, lelCat, lcsyst, iorder, nshl;
  int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
  double DyFlat[nshg*nflow];
  double DyFlatPhasta[nshg*nflow];
  double rmes[nshg*nflow];
  // DEBUG
  int i, j, k, l, m;
  //int indexsetary[nflow*nshg];
  //int gblindexsetary[nflow*nshg];
  PetscInt indexsetary[nflow*nshg];
  PetscInt gblindexsetary[nflow*nshg];
  PetscInt nodetoinsert;

  // FIXME: PetscScalar
  double real_rtol, real_abstol, real_dtol;
  // /DEBUG
  //double parray[1];  // Should be a PetscScalar
  double *parray;      // Should be a PetscScalar
  PetscInt petsc_bs, petsc_m, petsc_M, petsc_PA;
  PetscInt petsc_n;
  PetscOne = 1;
  uint64_t duration[4];

  // printf("%g %g %g \n", *rerr, *(rerr+1), *(rerr+2));

  if(firstpetsccall == 1) {
    /* disabled timing self-test, left over from the Fortran version:
       call get_time(duration(1),duration(2));
       call system('sleep 10');
       call get_time(duration(3),duration(4));
       call get_max_time_diff(duration(1), duration(3),
                              duration(2), duration(4),
                              "TenSecondSleep " // char(0))
       if(myrank .eq. 0) then
         !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed
       end if
    */
  }
  //
  // .... *******************>> Element Data Formation <<******************
  //
  // .... set the parameters for flux and surface tension calculations
  //
  genpar.idflx = 0;
  if(genpar.idiff >= 1) genpar.idflx = genpar.idflx + (nflow-1) * nsd;
  if(genpar.isurf == 1) genpar.idflx = genpar.idflx + nsd;
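  /*
   * The arrays documented above come from Fortran and are column-major:
   * res(nshg,nflow) element (i+1, j) lives at flat offset i + (j-1)*nshg,
   * which is exactly the "index" computed in the insertion loops below.
   * A minimal sketch of that layout (illustrative only, not compiled;
   * demo_column_major is hypothetical):
   */
#if 0
  static void demo_column_major(void)
  {
    int nshg = 3, nflow = 2;
    double res[6] = { 1.0, 2.0, 3.0,   /* column 1: res(1..3, 1) */
                      4.0, 5.0, 6.0 }; /* column 2: res(1..3, 2) */
    int i = 1, j = 2;                        /* Fortran element res(2,2) */
    assert(res[i + (j - 1) * nshg] == 5.0);  /* flat offset 1 + 1*3 = 4 */
    (void)nflow;
  }
#endif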
  PetscInt* idiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
  PetscInt* iodiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
  gcorp_t glbNZ;

  if(firstpetsccall == 1) {
    // Count, for each owned node, how many sparse entries fall in the
    // diagonal (owned) block and how many in the off-diagonal block.
    for(i=0;i<iownnodes;i++) {
      idiagnz[i]=0;
      iodiagnz[i]=0;
    }
    i=0;
    for(k=0;k<nshg;k++) {
      if((fncorp[k] < mbeg) || (fncorp[k] > mend)){
        // this node is not owned by this rank so we skip it
      } else {
        for(j=col[i]-1;j<col[i+1]-1;j++) {
          glbNZ=fncorp[row[j]];
          if((glbNZ < mbeg) || (glbNZ > mend)) {
            iodiagnz[i]++;
          } else {
            idiagnz[i]++;
          }
        }
        i++;
      }
    }
    gcorp_t mind=1000;
    gcorp_t mino=1000;
    gcorp_t maxd=0;
    gcorp_t maxo=0;
    for(i=0;i<iownnodes;i++) {
      mind=min(mind,idiagnz[i]);
      mino=min(mino,iodiagnz[i]);
      maxd=max(maxd,idiagnz[i]);
      maxo=max(maxo,iodiagnz[i]);
      // iodiagnz[i]=max(iodiagnz[i],10);
      // idiagnz[i]=max(idiagnz[i],10);
      // iodiagnz[i]=2*iodiagnz[i];  // estimate a bit higher for off-part interactions
      // idiagnz[i]=2*idiagnz[i];    // estimate a bit higher for off-part interactions
    }
    // The exact counts above were pretty good, but the padded constant below
    // is faster and not too much more memory.  Of course, once you do this
    // you could just use the constant fill parameters in MatCreateBAIJ, but
    // keep the per-row arrays alive for potential later optimization.
    for(i=0;i<iownnodes;i++) {
      iodiagnz[i]=1.3*maxd;
      idiagnz[i]=1.3*maxd;
    }

    if(workfc.numpe < 200){
      printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
      printf("myrank,mind,maxd,mino,maxo %d %lld %lld %lld %lld \n",
             workfc.myrank,mind,maxd,mino,maxo);
    }
    // Print debug info
    if(nshgt < 200){
      int irank;
      for(irank=0;irank<workfc.numpe;irank++) {
        if(irank == workfc.myrank){
          printf("mbeg,mend,myrank %lld %lld %d \n",mbeg,mend,workfc.myrank);
          for(i=0;i<iownnodes;i++) {
            printf("%d %lld %lld \n",workfc.myrank,
                   (long long)idiagnz[i],(long long)iodiagnz[i]);
          }
        }
        MPI_Barrier(MPI_COMM_WORLD);
      }
    }
  }
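  /*
   * These per-row counts feed the d_nnz/o_nnz preallocation arguments of
   * MatCreateBAIJ below; preallocating correctly is what makes the first
   * assembly fast.  A self-contained sketch of the same idea on a 1-D
   * Laplacian, using constant per-row bounds (illustrative only, not
   * compiled here):
   */
#if 0
#include <petscmat.h>
int main(int argc, char **argv)
{
  Mat            A;
  PetscInt       mloc = 4, rstart, rend, N, i;
  PetscErrorCode ierr;
  ierr = PetscInitialize(&argc, &argv, NULL, NULL); if (ierr) return ierr;
  /* at most 3 nonzeros per row in the owned (diagonal) block and at most 1
     in the off-diagonal block, so constant d_nz/o_nz bounds suffice here */
  ierr = MatCreateAIJ(PETSC_COMM_WORLD, mloc, mloc, PETSC_DETERMINE,
                      PETSC_DETERMINE, 3, NULL, 1, NULL, &A); CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A, &rstart, &rend); CHKERRQ(ierr);
  ierr = MatGetSize(A, &N, NULL); CHKERRQ(ierr);
  for (i = rstart; i < rend; i++) {
    ierr = MatSetValue(A, i, i, 2.0, INSERT_VALUES); CHKERRQ(ierr);
    if (i > 0)   { ierr = MatSetValue(A, i, i-1, -1.0, INSERT_VALUES); CHKERRQ(ierr); }
    if (i < N-1) { ierr = MatSetValue(A, i, i+1, -1.0, INSERT_VALUES); CHKERRQ(ierr); }
  }
  ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatDestroy(&A); CHKERRQ(ierr);
  return PetscFinalize();
}
#endif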
  if(firstpetsccall == 1) {

    petsc_bs = (PetscInt) nflow;
    petsc_m  = (PetscInt) nflow * (PetscInt) iownnodes;
    petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
    petsc_PA = (PetscInt) 40;
    // TODO: fix nshgt for 64-bit integers!

    //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes,
    //                     nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0,

    // this variant has no pre-allocation:
    //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
    //                     0, NULL, 0, NULL, &lhsP);

    // the next call does pre-allocate

    // MPI_Barrier(MPI_COMM_WORLD);
    // if(workfc.myrank ==0) printf("Before MatCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA);

    ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
                         0, idiagnz, 0, iodiagnz, &lhsP);
    //                   petsc_PA, NULL, petsc_PA, NULL, &lhsP);
    // MPI_Barrier(MPI_COMM_WORLD);
    // if(workfc.myrank ==0) printf("Before MatSetOption 1 \n");
    ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);

    // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
#ifdef JEDBROWN
    ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
#endif
    ierr = MatSetUp(lhsP);

    PetscInt myMatStart, myMatEnd;
    ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
  }
  // PETSc copies the preallocation arrays, so they can be freed here
  free(idiagnz);
  free(iodiagnz);
  // MPI_Barrier(MPI_COMM_WORLD);
  // if(workfc.myrank ==0) printf("Before MatZeroEntries \n");
  if(genpar.lhs == 1) ierr = MatZeroEntries(lhsP);

  get_time(duration, (duration+1));

  // MPI_Barrier(MPI_COMM_WORLD);
  // if(workfc.myrank ==0) printf("Before elmgmr \n");
  // for (i=0; i<nshg ; i++) {
  //   assert(fncorp[i]>0);
  // }

  ElmGMRPETSc(y, ac, x,
              shp, shgl, iBC,
              BC, shpb,
              shglb, res,
              rmes, iper,
              ilwork, rerr, &lhsP);
  get_time((duration+2), (duration+3));
  get_max_time_diff((duration), (duration+2),
                    (duration+1), (duration+3),
                    "ElmGMRPETSc \0");
  // printf("after ElmGMRPETSc\n");
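  /*
   * ElmGMRPETSc (implemented elsewhere) forms the element-level residual and
   * left-hand-side contributions and assembles them into lhsP.  Its internals
   * are not shown in this file; insertion into a BAIJ matrix with block size
   * nflow presumably follows the MatSetValuesBlocked pattern sketched below
   * (illustrative only, not compiled; demo_block_insert is hypothetical):
   */
#if 0
static PetscErrorCode demo_block_insert(Mat lhs, PetscInt growBlk, PetscInt gcolBlk,
                                        const PetscScalar blk[] /* bs*bs, row-major */)
{
  /* one block-row/block-column index pair addresses a full bs x bs block */
  return MatSetValuesBlocked(lhs, 1, &growBlk, 1, &gcolBlk, blk, ADD_VALUES);
}
#endif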
  if(firstpetsccall == 1) {
    // Set up the IndexSet.  For now, we mimic the vector insertion procedure.
    // Since we always reference by global indices this doesn't matter
    // (except for cache performance).
    // TODO: Better arrangement?
    nodetoinsert = 0;
    k=0;
    if(workfc.numpe > 1) {
      for (i=0; i<nshg; i++) {
        nodetoinsert = fncorp[i]-1;
        // if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert);
        // assert(fncorp[i]>=0);
        for (j=1; j<=nflow; j++) {
          indexsetary[k] = nodetoinsert*nflow+(j-1);
          assert(indexsetary[k]>=0);
          // assert(fncorp[i]>=0);
          k = k+1;
        }
      }
    }
    else {
      for (i=0; i<nshg; i++) {
        nodetoinsert = i;
        for (j=1; j<=nflow; j++) {
          indexsetary[k] = nodetoinsert*nflow+(j-1);
          k = k+1;
        }
      }
    }

    for (i=0; i<nshg*nflow; i++) {
      gblindexsetary[i] = i; //TODO: better option for performance?
    }
    // Create Vector Index Maps
    petsc_n = (PetscInt) nshg * (PetscInt) nflow;
    //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary,
    //                       PETSC_COPY_VALUES, &LocalIndexSet);
    ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
                           PETSC_COPY_VALUES, &LocalIndexSet);
    // printf("after 1st ISCreateGeneral %d\n", ierr);
    //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary,
    //                       PETSC_COPY_VALUES, &GlobalIndexSet);
    ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary,
                           PETSC_COPY_VALUES, &GlobalIndexSet);
    // printf("after 2nd ISCreateGeneral %d\n", ierr);
    ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping);
    // printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr);
    ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping);
    // printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr);
  }
  if(genpar.lhs == 1) {
    get_time((duration), (duration+1));
    ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
    ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
    get_time((duration+2),(duration+3));
    get_max_time_diff((duration), (duration+2),
                      (duration+1), (duration+3),
                      "MatAssembly \0");
    get_time(duration, (duration+1));
  }
  if(firstpetsccall == 1) {
    ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
    // TODO: Should this be LocalRow or LocalCol, or does LocalRow always == LocalCol?
    //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP);
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
    //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP);
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
  }
  ierr = VecZeroEntries(resP);
  ierr = VecZeroEntries(DyP);
  if(firstpetsccall == 1) {
    //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal);
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal);
    //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal);
    ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
  }
  ierr = VecZeroEntries(resPGlobal);
  // call VecZeroEntries(DyPLocal, ierr)

  // Insert the (column-major) PHASTA residual into the PETSc vector,
  // interleaving the nflow equations per node.
  PetscRow=0;
  k = 0;
  int index;
  for (i=0; i<nshg; i++) {
    for (j=1; j<=nflow; j++) {
      index = i + (j-1)*nshg;
      valtoinsert = res[index];
      if(workfc.numpe > 1) {
        PetscRow = (fncorp[i]-1)*nflow+(j-1);
      }
      else {
        PetscRow = i*nflow+(j-1);
      }
      assert(fncorp[i]<=nshgt);
      assert(fncorp[i]>0);
      assert(PetscRow>=0);
      assert(PetscRow<=nshgt*nflow);
      ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES);
    }
  }
  // printf("after VecSetValue loop %d\n",ierr);
  ierr = VecAssemblyBegin(resP);
  ierr = VecAssemblyEnd(resP);
  get_time((duration+2), (duration+3));
  get_max_time_diff((duration), (duration+2),
                    (duration+1), (duration+3),
                    "VectorWorkPre \0");

  get_time((duration),(duration+1));
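  /*
   * The first-call block below creates the Krylov solver: GMRES is PETSc's
   * default KSP, preconditioned here with point-block Jacobi (PCPBJACOBI),
   * i.e. Jacobi applied to the nflow x nflow nodal blocks.  Because
   * KSPSetFromOptions is called last, command-line flags can override any of
   * this at run time; for example (illustrative, standard PETSc options):
   *
   *   mpiexec -n 8 ./phasta -ksp_type gmres -ksp_gmres_restart 200 \
   *                         -ksp_rtol 1e-6 -ksp_monitor -pc_type bjacobi
   */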
  if(firstpetsccall == 1) {
    ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
    ierr = KSPSetOperators(ksp, lhsP, lhsP);
    //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN);
    ierr = KSPGetPC(ksp, &pc);
    ierr = PCSetType(pc, PCPBJACOBI);
    PetscInt maxits;
    maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace;
    //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
    ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
    ierr = KSPSetFromOptions(ksp);
  }
  ierr = KSPSolve(ksp, resP, DyP);
  get_time((duration+2),(duration+3));
  get_max_time_diff((duration), (duration+2),
                    (duration+1), (duration+3),
                    "KSPSolve \0");
  //if(myrank .eq. 0) then
  //  !write(*,"(A, E10.3)") "KSPSolve ", elapsed
  //end if
  get_time((duration),(duration+1));
  if(firstpetsccall == 1) {
    ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7);
  }
  ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
  ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
  ierr = VecGetArray(DyPLocal, &parray);
  PetscRow = 0;
  for (node = 0; node < nshg; node++) {
    for (var = 1; var <= nflow; var++) {
      index = node + (var-1)*nshg;
      Dy[index] = parray[PetscRow];
      PetscRow = PetscRow+1;
    }
  }
  ierr = VecRestoreArray(DyPLocal, &parray);

  firstpetsccall = 0;
  // later timer ('Solver  ');

  // .... output the statistics
  //
  itrpar.iKs = 0; // see rstat()
  PetscInt its;
  //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs);
  ierr = KSPGetIterationNumber(ksp, &its);
  itrpar.iKs = (int) its;
  get_time((duration+2),(duration+3));
  get_max_time_diff((duration), (duration+2),
                    (duration+1), (duration+3),
                    "solWork \0");
  itrpar.ntotGM += (int) its;
  rstat(res, ilwork);
  //
  // .... stop the timer
  //
  // 3002 continue ! no solve just res.
  // later call timer ('Back    ')
  //
  // .... end
  //
}
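/*
 * Both drivers use the same pattern to pull the distributed solution back
 * into a rank-local array: a VecScatter built once from two index sets maps
 * the owned global entries (LocalIndexSet) onto a sequential vector laid out
 * in PHASTA's node ordering (GlobalIndexSet).  A condensed sketch of that
 * pattern (illustrative only, not compiled; the names are hypothetical):
 */
#if 0
static void demo_gather_solution(Vec xGlobal, IS from, IS to, Vec xLocal,
                                 double *out, PetscInt n)
{
  VecScatter   ctx;
  PetscScalar *a;
  PetscInt     i;
  VecScatterCreate(xGlobal, from, xLocal, to, &ctx);  /* build once, reuse */
  VecScatterBegin(ctx, xGlobal, xLocal, INSERT_VALUES, SCATTER_FORWARD);
  VecScatterEnd(ctx, xGlobal, xLocal, INSERT_VALUES, SCATTER_FORWARD);
  VecGetArray(xLocal, &a);
  for (i = 0; i < n; i++) out[i] = a[i];  /* copy into the PHASTA array */
  VecRestoreArray(xLocal, &a);
  VecScatterDestroy(&ctx);
}
#endif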
void SolGMRpSclr(double* y, double* ac,
                 double* x, double* elDw, int* iBC, double* BC,
                 int* col, int* row,
                 int* iper,
                 int* ilwork, double* shp, double* shgl, double* shpb,
                 double* shglb, double* res, double* Dy, long long int* fncorp)
{
  //
  // ----------------------------------------------------------------------
  //
  //  This is the preconditioned GMRES driver routine for scalar solves.
  //
  //  input:
  //   y      (nshg,ndof)            : Y-variables at n+alpha_v
  //   ac     (nshg,ndof)            : Primvar. accel. variable at n+alpha_m
  //   yold   (nshg,ndof)            : Y-variables at beginning of step
  //   acold  (nshg,ndof)            : Primvar. accel. variable at beginning of step
  //   x      (numnp,nsd)            : node coordinates
  //   iBC    (nshg)                 : BC codes
  //   BC     (nshg,ndofBC)          : BC constraint parameters
  //   shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
  //   shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
  //
  // ----------------------------------------------------------------------
  //

  // Get variables from common_c.h
  int nshg, nflow, nsd, iownnodes;
  nshg = conpar.nshg;
  nflow = conpar.nflow; // needed for the idflx setup below
  nsd = NSD;
  iownnodes = conpar.iownnodes;

  gcorp_t nshgt;
  gcorp_t mbeg;
  gcorp_t mend;
  nshgt = (gcorp_t) newdim.nshgt; // Fix nshgt in common_c.h
  mbeg = (gcorp_t) newdim.minowned;
  mend = (gcorp_t) newdim.maxowned;

  int node, element, var, eqn;
  double valtoinsert;
  int nenl, iel, lelCat, lcsyst, iorder, nshl;
  int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
  double DyFlats[nshg];
  double DyFlatPhastas[nshg];
  double rmes[nshg];
  // DEBUG
  int i, j, k, l, m;
  PetscInt indexsetarys[nshg];
  PetscInt gblindexsetarys[nshg];
  PetscInt nodetoinsert;

  double real_rtol, real_abstol, real_dtol;
  double *parray; // Should be a PetscScalar
  PetscInt petsc_bs, petsc_m, petsc_M, petsc_PA;
  PetscInt petsc_n;
  PetscOne = 1;
  uint64_t duration[4];

  //
  // .... *******************>> Element Data Formation <<******************
  //
  // .... set the parameters for flux and surface tension calculations
  //
  genpar.idflx = 0;
  if(genpar.idiff >= 1) genpar.idflx = genpar.idflx + (nflow-1) * nsd;
  if(genpar.isurf == 1) genpar.idflx = genpar.idflx + nsd;

  PetscInt* idiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
  PetscInt* iodiagnz = (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
  gcorp_t glbNZ;

  if(firstpetsccalls == 1) {
    // Count diagonal- and off-diagonal-block nonzeros per owned node,
    // exactly as in SolGMRp above.
    for(i=0;i<iownnodes;i++) {
      idiagnz[i]=0;
      iodiagnz[i]=0;
    }
    i=0;
    for(k=0;k<nshg;k++) {
      if((fncorp[k] < mbeg) || (fncorp[k] > mend)){
        // this node is not owned by this rank so we skip it
      } else {
        for(j=col[i]-1;j<col[i+1]-1;j++) {
          glbNZ=fncorp[row[j]];
          if((glbNZ < mbeg) || (glbNZ > mend)) {
            iodiagnz[i]++;
          } else {
            idiagnz[i]++;
          }
        }
        i++;
      }
    }
    gcorp_t mind=1000;
    gcorp_t mino=1000;
    gcorp_t maxd=0;
    gcorp_t maxo=0;
    for(i=0;i<iownnodes;i++) {
      mind=min(mind,idiagnz[i]);
      mino=min(mino,iodiagnz[i]);
      maxd=max(maxd,idiagnz[i]);
      maxo=max(maxo,iodiagnz[i]);
    }

    for(i=0;i<iownnodes;i++) {
      iodiagnz[i]=1.3*maxd;
      idiagnz[i]=1.3*maxd;
    }
  }
  if(firstpetsccalls == 1) {

    petsc_m = (PetscInt) iownnodes;
    petsc_M = (PetscInt) nshgt;
    petsc_PA = (PetscInt) 40;

    // scalar problem: one unknown per node, so a plain AIJ matrix (no blocks)
    ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
                        0, idiagnz, 0, iodiagnz, &lhsPs);

    ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);

    // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
#ifdef JEDBROWN
    ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
#endif
    ierr = MatSetUp(lhsPs);

    PetscInt myMatStart, myMatEnd;
    ierr = MatGetOwnershipRange(lhsPs, &myMatStart, &myMatEnd);
  }
  // PETSc copies the preallocation arrays, so they can be freed here
  free(idiagnz);
  free(iodiagnz);
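  /*
   * Unlike the flow solve, which stores nflow x nflow nodal blocks in a BAIJ
   * matrix, the scalar solve has one unknown per node, so AIJ with scalar
   * entries is the natural format.  The difference in insertion granularity
   * (illustrative only, not compiled; blockRow, blockCol, blk, rowGlobal,
   * colGlobal, and value are hypothetical):
   */
#if 0
  /* BAIJ (flow):  one call deposits a full nflow x nflow block */
  MatSetValuesBlocked(lhsP, 1, &blockRow, 1, &blockCol, blk, ADD_VALUES);
  /* AIJ (scalar): one call deposits a single matrix entry      */
  MatSetValue(lhsPs, rowGlobal, colGlobal, value, ADD_VALUES);
#endif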
  // MPI_Barrier(MPI_COMM_WORLD);
  // if(workfc.myrank ==0) printf("Before MatZeroEntries \n");
  if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);

  get_time(duration, (duration+1));

  // MPI_Barrier(MPI_COMM_WORLD);
  // if(workfc.myrank ==0) printf("Before elmgmr \n");
  // for (i=0; i<nshg ; i++) {
  //   assert(fncorp[i]>0);
  // }

  ElmGMRPETScSclr(y, ac, x, elDw,
                  shp, shgl, iBC,
                  BC, shpb,
                  shglb, res,
                  rmes, iper,
                  ilwork, &lhsPs);
  if(firstpetsccalls == 1) {
    // Set up the IndexSet.  For now, we mimic the vector insertion procedure.
    // Since we always reference by global indices this doesn't matter
    // (except for cache performance).
    // TODO: Better arrangement?
    nodetoinsert = 0;
    k=0;
    if(workfc.numpe > 1) {
      for (i=0; i<nshg; i++) {
        nodetoinsert = fncorp[i]-1;
        indexsetarys[k] = nodetoinsert;
        k = k+1;
      }
    }
    else {
      for (i=0; i<nshg; i++) {
        nodetoinsert = i;
        indexsetarys[k] = nodetoinsert;
        k = k+1;
      }
    }

    for (i=0; i<nshg; i++) {
      gblindexsetarys[i] = i; //TODO: better option for performance?
    }
    // Create Vector Index Maps
    petsc_n = (PetscInt) nshg;
    ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
                           PETSC_COPY_VALUES, &LocalIndexSets);
    ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys,
                           PETSC_COPY_VALUES, &GlobalIndexSets);
    ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings);
    ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings);
  }
  if(genpar.lhs == 1) {
    ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
    ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
  }
  if(firstpetsccalls == 1) {
    ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
  }
  ierr = VecZeroEntries(resPs);
  ierr = VecZeroEntries(DyPs);
  if(firstpetsccalls == 1) {
    ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals);
    ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
  }
  ierr = VecZeroEntries(resPGlobals);

  // Insert the PHASTA residual into the PETSc vector (one entry per node)
  PetscRow=0;
  k = 0;
  int index;
  for (i=0; i<nshg; i++) {
    valtoinsert = res[i];
    if(workfc.numpe > 1) {
      PetscRow = (fncorp[i]-1);
    }
    else {
      PetscRow = i; // 0-based row in the serial case
    }
    assert(fncorp[i]<=nshgt);
    assert(fncorp[i]>0);
    assert(PetscRow>=0);
    assert(PetscRow<=nshgt);
    ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES);
  }
  // printf("after VecSetValue loop %d\n",ierr);
  ierr = VecAssemblyBegin(resPs);
  ierr = VecAssemblyEnd(resPs);

  if(firstpetsccalls == 1) {
    ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
    ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
    //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN);
    ierr = KSPGetPC(ksps, &pcs);
    ierr = PCSetType(pcs, PCPBJACOBI);
    PetscInt maxits;
    maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace;
    //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace);
    ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
    ierr = KSPSetFromOptions(ksps);
  }
  ierr = KSPSolve(ksps, resPs, DyPs);
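  /*
   * Neither driver currently checks whether the Krylov solve converged; a
   * minimal sketch of how that could be done with standard PETSc calls
   * (illustrative only, not compiled):
   */
#if 0
  {
    KSPConvergedReason reason;
    ierr = KSPGetConvergedReason(ksps, &reason);
    if (reason < 0 && workfc.myrank == 0)
      printf("scalar KSPSolve diverged: reason %d\n", (int)reason);
  }
#endif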
  if(firstpetsccalls == 1) {
    ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s);
  }
  ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
  ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
  ierr = VecGetArray(DyPLocals, &parray);
  PetscRow = 0;
  for (node = 0; node < nshg; node++) {
    index = node;
    Dy[index] = parray[PetscRow];
    PetscRow = PetscRow+1;
  }
  ierr = VecRestoreArray(DyPLocals, &parray);

  firstpetsccalls = 0;

  // .... output the statistics
  //
  // itrpar.iKss=0; // see rstat()
  PetscInt its;
  ierr = KSPGetIterationNumber(ksps, &its);
  itrpar.iKss = (int) its;
  itrpar.ntotGMs += (int) its;
  rstatSclr(res, ilwork);
  //
  // .... end
  //
}
#endif