1 #include <stdio.h> 2 3 #include "petscsys.h" 4 #include "petscvec.h" 5 #include "petscmat.h" 6 #include "petscpc.h" 7 #include "petscksp.h" 8 9 #include "assert.h" 10 #include "common_c.h" 11 12 #include <FCMangle.h> 13 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP) 14 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR) 15 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC) 16 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR) 17 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT) 18 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR) 19 #define min(a,b) (((a) < (b)) ? (a) : (b)) 20 #define max(a,b) (((a) > (b)) ? (a) : (b)) 21 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME) 22 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF) 23 24 typedef long long int gcorp_t; 25 26 void get_time(uint64_t* rv, uint64_t* cycle); 27 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl); 28 29 //#include "auxmpi.h" 30 // static PetscOffset poff; 31 32 static Mat lhsP; 33 static PC pc; 34 static KSP ksp; 35 static Vec DyP, resP, DyPLocal, resPGlobal; 36 static PetscErrorCode ierr; 37 static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol; 38 static IS LocalIndexSet, GlobalIndexSet; 39 static ISLocalToGlobalMapping VectorMapping; 40 static ISLocalToGlobalMapping GblVectorMapping; 41 static VecScatter scatter7; 42 static int firstpetsccall = 1; 43 44 static Mat lhsPs; 45 static PC pcs; 46 static KSP ksps; 47 static Vec DyPs, resPs, DyPLocals, resPGlobals; 48 static IS LocalIndexSets, GlobalIndexSets; 49 static ISLocalToGlobalMapping VectorMappings; 50 static ISLocalToGlobalMapping GblVectorMappings; 51 static VecScatter scatter7s; 52 static int firstpetsccalls = 1; 53 54 void SolGMRp(double* y, double* ac, double* yold, 55 double* x, int* iBC, double* BC, 56 int* col, int* row, double* lhsk, 57 double* res, double* BDiag, 58 int* iper, 59 int* ilwork, double* shp, double* shgl, double* shpb, 60 double* shglb, double* Dy, double* rerr, long long int* fncorp) 61 { 62 // 63 // ---------------------------------------------------------------------- 64 // 65 // This is the preconditioned GMRES driver routine. 66 // 67 // input: 68 // y (nshg,ndof) : Y-variables at n+alpha_v 69 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 70 // yold (nshg,ndof) : Y-variables at beginning of step 71 // acold (nshg,ndof) : Primvar. accel. variable at begng step 72 // x (numnp,nsd) : node coordinates 73 // iBC (nshg) : BC codes 74 // BC (nshg,ndofBC) : BC constraint parameters 75 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 76 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 77 // 78 // output: 79 // res (nshg,nflow) : preconditioned residual 80 // BDiag (nshg,nflow,nflow) : block-diagonal preconditioner 81 // 82 // 83 // Zdenek Johan, Winter 1991. (Fortran 90) 84 // ---------------------------------------------------------------------- 85 // 86 87 88 // Get variables from common_c.h 89 int nshg, nflow, nsd, iownnodes; 90 nshg = conpar.nshg; 91 nflow = conpar.nflow; 92 nsd = NSD; 93 iownnodes = conpar.iownnodes; 94 95 gcorp_t nshgt; 96 gcorp_t mbeg; 97 gcorp_t mend; 98 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 99 mbeg = (gcorp_t) newdim.minowned; 100 mend = (gcorp_t) newdim.maxowned; 101 102 103 int node, element, var, eqn; 104 double valtoinsert; 105 int nenl, iel, lelCat, lcsyst, iorder, nshl; 106 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 107 double DyFlat[nshg*nflow]; 108 double DyFlatPhasta[nshg*nflow]; 109 double rmes[nshg*nflow]; 110 // DEBUG 111 int i,j,k,l,m; 112 //int indexsetary[nflow*nshg]; 113 //int gblindexsetary[nflow*nshg]; 114 PetscInt indexsetary[nflow*nshg]; 115 PetscInt gblindexsetary[nflow*nshg]; 116 PetscInt nodetoinsert; 117 118 // FIXME: PetscScalar 119 double real_rtol, real_abstol, real_dtol; 120 // /DEBUG 121 //double parray[1]; // Should be a PetscScalar 122 double *parray; // Should be a PetscScalar 123 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 124 PetscInt petsc_n; 125 PetscOne = 1; 126 uint64_t duration[4]; 127 128 // printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2)); 129 130 if(firstpetsccall == 1) { 131 /* call get_time(duration(1),duration(2)); 132 call system('sleep 10'); 133 call get_time(duration(3),duration(4)); 134 call get_max_time_diff(duration(1), duration(3), 135 duration(2), duration(4), 136 "TenSecondSleep " // char(0)) 137 if(myrank .eq. 0) { 138 !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed 139 } 140 */ 141 } 142 // 143 // 144 // .... *******************>> Element Data Formation <<****************** 145 // 146 // 147 // .... set the parameters for flux and surface tension calculations 148 // 149 // 150 genpar.idflx = 0 ; 151 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 152 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 153 154 155 156 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 157 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 158 gcorp_t glbNZ; 159 160 if(firstpetsccall == 1) { 161 for(i=0;i<iownnodes;i++) { 162 idiagnz[i]=0; 163 iodiagnz[i]=0; 164 } 165 i=0; 166 for(k=0;k<nshg;k++) { 167 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 168 // this node is not owned by this rank so we skip 169 } else { 170 for(j=col[i]-1;j<col[i+1]-1;j++) { 171 glbNZ=fncorp[row[j]]; 172 if((glbNZ < mbeg) || (glbNZ > mend)) { 173 iodiagnz[i]++; 174 } else { 175 idiagnz[i]++; 176 } 177 } 178 i++; 179 } 180 } 181 gcorp_t mind=1000; 182 gcorp_t mino=1000; 183 gcorp_t maxd=0; 184 gcorp_t maxo=0; 185 for(i=0;i<iownnodes;i++) { 186 mind=min(mind,idiagnz[i]); 187 mino=min(mino,iodiagnz[i]); 188 maxd=max(maxd,idiagnz[i]); 189 maxo=max(maxo,iodiagnz[i]); 190 // iodiagnz[i]=max(iodiagnz[i],10); 191 // idiagnz[i]=max(idiagnz[i],10); 192 // iodiagnz[i]=2*iodiagnz[i]; // estimate a bit higher for off-part interactions 193 // idiagnz[i]=2*idiagnz[i]; // estimate a bit higher for off-part interactions 194 } 195 // the above was pretty good but below is faster and not too much more memory...of course once you do this 196 // could just use the constant fill parameters in create but keep it alive for potential later optimization 197 198 for(i=0;i<iownnodes;i++) { 199 iodiagnz[i]=1.3*maxd; 200 idiagnz[i]=1.3*maxd; 201 } 202 203 204 205 if(workfc.numpe < 200){ 206 printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg); 207 printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo); 208 } 209 // Print debug info 210 if(nshgt < 200){ 211 int irank; 212 for(irank=0;irank<workfc.numpe;irank++) { 213 if(irank == workfc.myrank){ 214 printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank); 215 for(i=0;i<iownnodes;i++) { 216 printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]); 217 } 218 } 219 MPI_Barrier(MPI_COMM_WORLD); 220 } 221 } 222 } 223 if(firstpetsccall == 1) { 224 225 petsc_bs = (PetscInt) nflow; 226 petsc_m = (PetscInt) nflow* (PetscInt) iownnodes; 227 petsc_M = (PetscInt) nshgt * (PetscInt) nflow; 228 petsc_PA = (PetscInt) 40; 229 // TODO: fixe nshgt for 64 bit integer!!!!! 230 231 //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes, 232 // nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0, 233 234 // this has no pre-allocate ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 235 // this has no pre-allocate 0, NULL, 0, NULL, &lhsP); 236 237 // next 2 do pre-allocate 238 239 // MPI_Barrier(MPI_COMM_WORLD); 240 // if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA); 241 242 ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 243 0, idiagnz, 0, iodiagnz, &lhsP); 244 245 // petsc_PA, NULL, petsc_PA, NULL, &lhsP); 246 // MPI_Barrier(MPI_COMM_WORLD); 247 // if(workfc.myrank ==0) printf("Before matSetOoption 1 \n"); 248 ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 249 250 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 251 ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 252 ierr = MatSetUp(lhsP); 253 254 PetscInt myMatStart, myMatEnd; 255 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 256 } 257 // MPI_Barrier(MPI_COMM_WORLD); 258 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 259 if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP); 260 261 get_time(duration, (duration+1)); 262 263 // MPI_Barrier(MPI_COMM_WORLD); 264 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 265 // for (i=0; i<nshg ; i++) { 266 // assert(fncorp[i]>0); 267 // } 268 269 ElmGMRPETSc(y, ac, x, 270 shp, shgl, iBC, 271 BC, shpb, 272 shglb, res, 273 rmes, iper, 274 ilwork, rerr , &lhsP); 275 get_time((duration+2), (duration+3)); 276 get_max_time_diff((duration), (duration+2), 277 (duration+1), (duration+3), 278 "ElmGMRPETSc \0"); // char(0)) 279 // printf("after ElmGMRPETSc\n"); 280 if(firstpetsccall == 1) { 281 // Setup IndexSet. For now, we mimic vector insertion procedure 282 // Since we always reference by global indexes this doesn't matter 283 // except for cache performance) 284 // TODO: Better arrangment? 285 nodetoinsert = 0; 286 k=0; 287 if(workfc.numpe > 1) { 288 for (i=0; i<nshg ; i++) { 289 nodetoinsert = fncorp[i]-1; 290 // if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert); 291 // assert(fncorp[i]>=0); 292 for (j=1; j<=nflow; j++) { 293 indexsetary[k] = nodetoinsert*nflow+(j-1); 294 assert(indexsetary[k]>=0); 295 // assert(fncorp[i]>=0); 296 k = k+1; 297 } 298 } 299 } 300 else { 301 for (i=0; i<nshg ; i++) { 302 nodetoinsert = i; 303 for (j=1; j<=nflow; j++) { 304 indexsetary[k] = nodetoinsert*nflow+(j-1); 305 k = k+1; 306 } 307 } 308 } 309 310 for (i=0; i<nshg*nflow; i++) { 311 gblindexsetary[i] = i ; //TODO: better option for performance? 312 } 313 // Create Vector Index Maps 314 petsc_n = (PetscInt) nshg * (PetscInt) nflow; 315 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary, 316 // PETSC_COPY_VALUES, &LocalIndexSet); 317 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary, 318 PETSC_COPY_VALUES, &LocalIndexSet); 319 // printf("after 1st ISCreateGeneral %d\n", ierr); 320 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary, 321 // PETSC_COPY_VALUES, &GlobalIndexSet); 322 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary, 323 PETSC_COPY_VALUES, &GlobalIndexSet); 324 // printf("after 2nd ISCreateGeneral %d\n", ierr); 325 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping); 326 // printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr); 327 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping); 328 // printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr); 329 } 330 if(genpar.lhs == 1) { 331 get_time((duration), (duration+1)); 332 ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY); 333 ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY); 334 get_time((duration+2),(duration+3)); 335 get_max_time_diff((duration), (duration+2), 336 (duration+1), (duration+3), 337 "MatAssembly \0"); // char(0)) 338 get_time(duration, (duration+1)); 339 } 340 if(firstpetsccall == 1) { 341 ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol); 342 // TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol 343 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP); 344 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP); 345 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP); 346 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP); 347 } 348 ierr = VecZeroEntries(resP); 349 ierr = VecZeroEntries(DyP); 350 if(firstpetsccall == 1) { 351 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal); 352 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal); 353 //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal); 354 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal); 355 } 356 ierr = VecZeroEntries(resPGlobal); 357 // call VecZeroEntries(DyPLocal, ierr) 358 359 PetscRow=0; 360 k = 0; 361 int index; 362 for (i=0; i<nshg; i++ ){ 363 for (j = 1; j<=nflow; j++){ 364 index = i + (j-1)*nshg; 365 valtoinsert = res[index]; 366 if(workfc.numpe > 1) { 367 PetscRow = (fncorp[i]-1)*nflow+(j-1); 368 } 369 else { 370 PetscRow = i*nflow+(j-1); 371 } 372 assert(fncorp[i]<=nshgt); 373 assert(fncorp[i]>0); 374 assert(PetscRow>=0); 375 assert(PetscRow<=nshgt*nflow); 376 ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES); 377 } 378 } 379 // printf("after VecSetValue loop %d\n",ierr); 380 ierr = VecAssemblyBegin(resP); 381 ierr = VecAssemblyEnd(resP); 382 get_time((duration+2), (duration+3)); 383 get_max_time_diff((duration), (duration+2), 384 (duration+1), (duration+3), 385 "VectorWorkPre \0"); // char(0)) 386 387 get_time((duration),(duration+1)); 388 389 if(firstpetsccall == 1) { 390 ierr = KSPCreate(PETSC_COMM_WORLD, &ksp); 391 ierr = KSPSetOperators(ksp, lhsP, lhsP); 392 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN); 393 ierr = KSPGetPC(ksp, &pc); 394 ierr = PCSetType(pc, PCPBJACOBI); 395 PetscInt maxits; 396 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 397 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 398 ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 399 ierr = KSPSetFromOptions(ksp); 400 } 401 ierr = KSPSolve(ksp, resP, DyP); 402 get_time((duration+2),(duration+3)); 403 get_max_time_diff((duration), (duration+2), 404 (duration+1), (duration+3), 405 "KSPSolve \0"); // char(0)) 406 //if(myrank .eq. 0) then 407 //!write(*,"(A, E10.3)") "KSPSolve ", elapsed 408 //end if 409 get_time((duration),(duration+1)); 410 if(firstpetsccall == 1) { 411 ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7); 412 } 413 ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 414 ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 415 ierr = VecGetArray(DyPLocal, &parray); 416 PetscRow = 0; 417 for ( node = 0; node< nshg; node++) { 418 for (var = 1; var<= nflow; var++) { 419 index = node + (var-1)*nshg; 420 Dy[index] = parray[PetscRow]; 421 PetscRow = PetscRow+1; 422 } 423 } 424 ierr = VecRestoreArray(DyPLocal, &parray); 425 426 firstpetsccall = 0; 427 // later timer ('Solver '); 428 429 // .... output the statistics 430 // 431 itrpar.iKs=0; // see rstat() 432 PetscInt its; 433 //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs); 434 ierr = KSPGetIterationNumber(ksp, &its); 435 itrpar.iKs = (int) its; 436 get_time((duration+2),(duration+3)); 437 get_max_time_diff((duration), (duration+2), 438 (duration+1), (duration+3), 439 "solWork \0"); // char(0)) 440 itrpar.ntotGM += (int) its; 441 rstat (res, ilwork); 442 // 443 // .... stop the timer 444 // 445 // 3002 continue ! no solve just res. 446 // later call timer ('Back ') 447 // 448 // .... end 449 // 450 } 451 452 void SolGMRpSclr(double* y, double* ac, 453 double* x, double* elDw, int* iBC, double* BC, 454 int* col, int* row, 455 int* iper, 456 int* ilwork, double* shp, double* shgl, double* shpb, 457 double* shglb, double* res, double* Dy, long long int* fncorp) 458 { 459 // 460 // ---------------------------------------------------------------------- 461 // 462 // This is the preconditioned GMRES driver routine. 463 // 464 // input: 465 // y (nshg,ndof) : Y-variables at n+alpha_v 466 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 467 // yold (nshg,ndof) : Y-variables at beginning of step 468 // acold (nshg,ndof) : Primvar. accel. variable at begng step 469 // x (numnp,nsd) : node coordinates 470 // iBC (nshg) : BC codes 471 // BC (nshg,ndofBC) : BC constraint parameters 472 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 473 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 474 // 475 // ---------------------------------------------------------------------- 476 // 477 478 479 // Get variables from common_c.h 480 int nshg, nflow, nsd, iownnodes; 481 nshg = conpar.nshg; 482 nsd = NSD; 483 iownnodes = conpar.iownnodes; 484 485 gcorp_t nshgt; 486 gcorp_t mbeg; 487 gcorp_t mend; 488 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 489 mbeg = (gcorp_t) newdim.minowned; 490 mend = (gcorp_t) newdim.maxowned; 491 492 493 int node, element, var, eqn; 494 double valtoinsert; 495 int nenl, iel, lelCat, lcsyst, iorder, nshl; 496 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 497 double DyFlats[nshg]; 498 double DyFlatPhastas[nshg]; 499 double rmes[nshg]; 500 // DEBUG 501 int i,j,k,l,m; 502 PetscInt indexsetarys[nshg]; 503 PetscInt gblindexsetarys[nshg]; 504 PetscInt nodetoinsert; 505 506 double real_rtol, real_abstol, real_dtol; 507 double *parray; 508 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 509 PetscInt petsc_n; 510 PetscOne = 1; 511 uint64_t duration[4]; 512 513 514 // 515 // 516 // .... *******************>> Element Data Formation <<****************** 517 // 518 // 519 // .... set the parameters for flux and surface tension calculations 520 // 521 // 522 genpar.idflx = 0 ; 523 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 524 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 525 526 527 528 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 529 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 530 gcorp_t glbNZ; 531 532 if(firstpetsccalls == 1) { 533 for(i=0;i<iownnodes;i++) { 534 idiagnz[i]=0; 535 iodiagnz[i]=0; 536 } 537 i=0; 538 for(k=0;k<nshg;k++) { 539 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 540 // this node is not owned by this rank so we skip 541 } else { 542 for(j=col[i]-1;j<col[i+1]-1;j++) { 543 glbNZ=fncorp[row[j]]; 544 if((glbNZ < mbeg) || (glbNZ > mend)) { 545 iodiagnz[i]++; 546 } else { 547 idiagnz[i]++; 548 } 549 } 550 i++; 551 } 552 } 553 gcorp_t mind=1000; 554 gcorp_t mino=1000; 555 gcorp_t maxd=0; 556 gcorp_t maxo=0; 557 for(i=0;i<iownnodes;i++) { 558 mind=min(mind,idiagnz[i]); 559 mino=min(mino,iodiagnz[i]); 560 maxd=max(maxd,idiagnz[i]); 561 maxo=max(maxo,iodiagnz[i]); 562 } 563 564 for(i=0;i<iownnodes;i++) { 565 iodiagnz[i]=1.3*maxd; 566 idiagnz[i]=1.3*maxd; 567 } 568 569 570 571 } 572 if(firstpetsccalls == 1) { 573 574 petsc_m = (PetscInt) iownnodes; 575 petsc_M = (PetscInt) nshgt; 576 petsc_PA = (PetscInt) 40; 577 578 ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M, 579 0, idiagnz, 0, iodiagnz, &lhsPs); 580 581 ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 582 583 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 584 ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 585 ierr = MatSetUp(lhsPs); 586 587 PetscInt myMatStart, myMatEnd; 588 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 589 } 590 // MPI_Barrier(MPI_COMM_WORLD); 591 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 592 if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs); 593 594 get_time(duration, (duration+1)); 595 596 // MPI_Barrier(MPI_COMM_WORLD); 597 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 598 // for (i=0; i<nshg ; i++) { 599 // assert(fncorp[i]>0); 600 // } 601 602 ElmGMRPETScSclr(y, ac, x, elDw, 603 shp, shgl, iBC, 604 BC, shpb, 605 shglb, res, 606 rmes, iper, 607 ilwork, &lhsPs); 608 if(firstpetsccalls == 1) { 609 // Setup IndexSet. For now, we mimic vector insertion procedure 610 // Since we always reference by global indexes this doesn't matter 611 // except for cache performance) 612 // TODO: Better arrangment? 613 nodetoinsert = 0; 614 k=0; 615 if(workfc.numpe > 1) { 616 for (i=0; i<nshg ; i++) { 617 nodetoinsert = fncorp[i]-1; 618 indexsetarys[k] = nodetoinsert; 619 k = k+1; 620 } 621 } 622 else { 623 for (i=0; i<nshg ; i++) { 624 nodetoinsert = i; 625 indexsetarys[k] = nodetoinsert; 626 k = k+1; 627 } 628 } 629 630 for (i=0; i<nshg; i++) { 631 gblindexsetarys[i] = i ; //TODO: better option for performance? 632 } 633 // Create Vector Index Maps 634 petsc_n = (PetscInt) nshg; 635 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys, 636 PETSC_COPY_VALUES, &LocalIndexSets); 637 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys, 638 PETSC_COPY_VALUES, &GlobalIndexSets); 639 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings); 640 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings); 641 } 642 if(genpar.lhs ==1) { 643 ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY); 644 ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY); 645 } 646 if(firstpetsccalls == 1) { 647 ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol); 648 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs); 649 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs); 650 } 651 ierr = VecZeroEntries(resPs); 652 ierr = VecZeroEntries(DyPs); 653 if(firstpetsccalls == 1) { 654 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals); 655 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals); 656 } 657 ierr = VecZeroEntries(resPGlobals); 658 659 PetscRow=0; 660 k = 0; 661 int index; 662 for (i=0; i<nshg; i++ ){ 663 valtoinsert = res[i]; 664 if(workfc.numpe > 1) { 665 PetscRow = (fncorp[i]-1); 666 } 667 else { 668 PetscRow = i-1; 669 } 670 assert(fncorp[i]<=nshgt); 671 assert(fncorp[i]>0); 672 assert(PetscRow>=0); 673 assert(PetscRow<=nshgt); 674 ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES); 675 } 676 // printf("after VecSetValue loop %d\n",ierr); 677 ierr = VecAssemblyBegin(resPs); 678 ierr = VecAssemblyEnd(resPs); 679 680 if(firstpetsccalls == 1) { 681 ierr = KSPCreate(PETSC_COMM_WORLD, &ksps); 682 ierr = KSPSetOperators(ksps, lhsPs, lhsPs); 683 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN); 684 ierr = KSPGetPC(ksps, &pcs); 685 ierr = PCSetType(pcs, PCPBJACOBI); 686 PetscInt maxits; 687 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 688 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 689 ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 690 ierr = KSPSetFromOptions(ksps); 691 } 692 ierr = KSPSolve(ksps, resPs, DyPs); 693 if(firstpetsccalls == 1) { 694 ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s); 695 } 696 ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 697 ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 698 ierr = VecGetArray(DyPLocals, &parray); 699 PetscRow = 0; 700 for ( node = 0; node< nshg; node++) { 701 index = node; 702 Dy[index] = parray[PetscRow]; 703 PetscRow = PetscRow+1; 704 } 705 ierr = VecRestoreArray(DyPLocals, &parray); 706 707 firstpetsccalls = 0; 708 709 // .... output the statistics 710 // 711 // itrpar.iKss=0; // see rstat() 712 PetscInt its; 713 ierr = KSPGetIterationNumber(ksps, &its); 714 itrpar.iKss = (int) its; 715 itrpar.ntotGMs += (int) its; 716 rstatSclr (res, ilwork); 717 // 718 // .... end 719 // 720 } 721 722