1 #ifdef HAVE_PETSC 2 #include <stdio.h> 3 4 #include "petscsys.h" 5 #include "petscvec.h" 6 #include "petscmat.h" 7 #include "petscpc.h" 8 #include "petscksp.h" 9 10 #include "assert.h" 11 #include "common_c.h" 12 13 #include <FCMangle.h> 14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP) 15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR) 16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC) 17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR) 18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT) 19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR) 20 #define min(a,b) (((a) < (b)) ? (a) : (b)) 21 #define max(a,b) (((a) > (b)) ? (a) : (b)) 22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME) 23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF) 24 25 typedef long long int gcorp_t; 26 27 void get_time(uint64_t* rv, uint64_t* cycle); 28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl); 29 30 //#include "auxmpi.h" 31 // static PetscOffset poff; 32 33 static Mat lhsP; 34 static PC pc; 35 static KSP ksp; 36 static Vec DyP, resP, DyPLocal, resPGlobal; 37 static PetscErrorCode ierr; 38 static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol; 39 static IS LocalIndexSet, GlobalIndexSet; 40 static ISLocalToGlobalMapping VectorMapping; 41 static ISLocalToGlobalMapping GblVectorMapping; 42 static VecScatter scatter7; 43 static int firstpetsccall = 1; 44 45 static Mat lhsPs; 46 static PC pcs; 47 static KSP ksps; 48 static Vec DyPs, resPs, DyPLocals, resPGlobals; 49 static IS LocalIndexSets, GlobalIndexSets; 50 static ISLocalToGlobalMapping VectorMappings; 51 static ISLocalToGlobalMapping GblVectorMappings; 52 static VecScatter scatter7s; 53 static int firstpetsccalls = 1; 54 55 void SolGMRp(double* y, double* ac, double* yold, 56 double* x, int* iBC, double* BC, 57 int* col, int* row, double* lhsk, 58 double* res, double* BDiag, 59 int* iper, 60 int* ilwork, double* shp, double* shgl, double* shpb, 61 double* shglb, double* Dy, double* rerr, long long int* fncorp) 62 { 63 // 64 // ---------------------------------------------------------------------- 65 // 66 // This is the preconditioned GMRES driver routine. 67 // 68 // input: 69 // y (nshg,ndof) : Y-variables at n+alpha_v 70 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 71 // yold (nshg,ndof) : Y-variables at beginning of step 72 // acold (nshg,ndof) : Primvar. accel. variable at begng step 73 // x (numnp,nsd) : node coordinates 74 // iBC (nshg) : BC codes 75 // BC (nshg,ndofBC) : BC constraint parameters 76 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 77 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 78 // 79 // output: 80 // res (nshg,nflow) : preconditioned residual 81 // BDiag (nshg,nflow,nflow) : block-diagonal preconditioner 82 // 83 // 84 // Zdenek Johan, Winter 1991. (Fortran 90) 85 // ---------------------------------------------------------------------- 86 // 87 88 89 // Get variables from common_c.h 90 int nshg, nflow, nsd, iownnodes; 91 nshg = conpar.nshg; 92 nflow = conpar.nflow; 93 nsd = NSD; 94 iownnodes = conpar.iownnodes; 95 96 gcorp_t nshgt; 97 gcorp_t mbeg; 98 gcorp_t mend; 99 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 100 mbeg = (gcorp_t) newdim.minowned; 101 mend = (gcorp_t) newdim.maxowned; 102 103 104 int node, element, var, eqn; 105 double valtoinsert; 106 int nenl, iel, lelCat, lcsyst, iorder, nshl; 107 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 108 double DyFlat[nshg*nflow]; 109 double DyFlatPhasta[nshg*nflow]; 110 double rmes[nshg*nflow]; 111 // DEBUG 112 int i,j,k,l,m; 113 //int indexsetary[nflow*nshg]; 114 //int gblindexsetary[nflow*nshg]; 115 PetscInt indexsetary[nflow*nshg]; 116 PetscInt gblindexsetary[nflow*nshg]; 117 PetscInt nodetoinsert; 118 119 // FIXME: PetscScalar 120 double real_rtol, real_abstol, real_dtol; 121 // /DEBUG 122 //double parray[1]; // Should be a PetscScalar 123 double *parray; // Should be a PetscScalar 124 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 125 PetscInt petsc_n; 126 PetscOne = 1; 127 uint64_t duration[4]; 128 129 // printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2)); 130 131 if(firstpetsccall == 1) { 132 /* call get_time(duration(1),duration(2)); 133 call system('sleep 10'); 134 call get_time(duration(3),duration(4)); 135 call get_max_time_diff(duration(1), duration(3), 136 duration(2), duration(4), 137 "TenSecondSleep " // char(0)) 138 if(myrank .eq. 0) { 139 !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed 140 } 141 */ 142 } 143 // 144 // 145 // .... *******************>> Element Data Formation <<****************** 146 // 147 // 148 // .... set the parameters for flux and surface tension calculations 149 // 150 // 151 genpar.idflx = 0 ; 152 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 153 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 154 155 156 157 gcorp_t glbNZ; 158 159 if(firstpetsccall == 1) { 160 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 161 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 162 for(i=0;i<iownnodes;i++) { 163 idiagnz[i]=0; 164 iodiagnz[i]=0; 165 } 166 i=0; 167 for(k=0;k<nshg;k++) { 168 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 169 // this node is not owned by this rank so we skip 170 } else { 171 for(j=col[i]-1;j<col[i+1]-1;j++) { 172 // assert(row[j]<=nshg); 173 // assert(fncorp[row[j]-1]<=nshgt); 174 glbNZ=fncorp[row[j]-1]; 175 if((glbNZ < mbeg) || (glbNZ > mend)) { 176 iodiagnz[i]++; 177 } else { 178 idiagnz[i]++; 179 } 180 } 181 i++; 182 } 183 } 184 gcorp_t mind=1000; 185 gcorp_t mino=1000; 186 gcorp_t maxd=0; 187 gcorp_t maxo=0; 188 for(i=0;i<iownnodes;i++) { 189 mind=min(mind,idiagnz[i]); 190 mino=min(mino,iodiagnz[i]); 191 maxd=max(maxd,idiagnz[i]); 192 maxo=max(maxo,iodiagnz[i]); 193 // iodiagnz[i]=max(iodiagnz[i],10); 194 // idiagnz[i]=max(idiagnz[i],10); 195 // iodiagnz[i]=2*iodiagnz[i]; // estimate a bit higher for off-part interactions 196 // idiagnz[i]=2*idiagnz[i]; // estimate a bit higher for off-part interactions 197 } 198 // the above was pretty good but below is faster and not too much more memory...of course once you do this 199 // could just use the constant fill parameters in create but keep it alive for potential later optimization 200 201 for(i=0;i<iownnodes;i++) { 202 iodiagnz[i]=1.3*maxd; 203 idiagnz[i]=1.3*maxd; 204 } 205 206 207 208 if(workfc.numpe < 200){ 209 printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg); 210 printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo); 211 } 212 // Print debug info 213 if(nshgt < 200){ 214 int irank; 215 for(irank=0;irank<workfc.numpe;irank++) { 216 if(irank == workfc.myrank){ 217 printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank); 218 for(i=0;i<iownnodes;i++) { 219 printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]); 220 } 221 } 222 MPI_Barrier(MPI_COMM_WORLD); 223 } 224 } 225 // } 226 // if(firstpetsccall == 1) { 227 228 petsc_bs = (PetscInt) nflow; 229 petsc_m = (PetscInt) nflow* (PetscInt) iownnodes; 230 petsc_M = (PetscInt) nshgt * (PetscInt) nflow; 231 petsc_PA = (PetscInt) 40; 232 // TODO: fixe nshgt for 64 bit integer!!!!! 233 234 //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes, 235 // nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0, 236 237 // this has no pre-allocate ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 238 // this has no pre-allocate 0, NULL, 0, NULL, &lhsP); 239 240 // next 2 do pre-allocate 241 242 // MPI_Barrier(MPI_COMM_WORLD); 243 // if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA); 244 245 ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 246 0, idiagnz, 0, iodiagnz, &lhsP); 247 248 // petsc_PA, NULL, petsc_PA, NULL, &lhsP); 249 // MPI_Barrier(MPI_COMM_WORLD); 250 // if(workfc.myrank ==0) printf("Before matSetOoption 1 \n"); 251 ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 252 253 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 254 #ifdef JEDBROWN 255 ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 256 #endif 257 ierr = MatSetUp(lhsP); 258 259 PetscInt myMatStart, myMatEnd; 260 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 261 } 262 // MPI_Barrier(MPI_COMM_WORLD); 263 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 264 if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP); 265 266 get_time(duration, (duration+1)); 267 268 // MPI_Barrier(MPI_COMM_WORLD); 269 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 270 // for (i=0; i<nshg ; i++) { 271 // assert(fncorp[i]>0); 272 // } 273 274 ElmGMRPETSc(y, ac, x, 275 shp, shgl, iBC, 276 BC, shpb, 277 shglb, res, 278 rmes, iper, 279 ilwork, rerr , &lhsP); 280 get_time((duration+2), (duration+3)); 281 get_max_time_diff((duration), (duration+2), 282 (duration+1), (duration+3), 283 "ElmGMRPETSc \0"); // char(0)) 284 // printf("after ElmGMRPETSc\n"); 285 if(firstpetsccall == 1) { 286 // Setup IndexSet. For now, we mimic vector insertion procedure 287 // Since we always reference by global indexes this doesn't matter 288 // except for cache performance) 289 // TODO: Better arrangment? 290 nodetoinsert = 0; 291 k=0; 292 if(workfc.numpe > 1) { 293 for (i=0; i<nshg ; i++) { 294 nodetoinsert = fncorp[i]-1; 295 // if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert); 296 // assert(fncorp[i]>=0); 297 for (j=1; j<=nflow; j++) { 298 indexsetary[k] = nodetoinsert*nflow+(j-1); 299 assert(indexsetary[k]>=0); 300 // assert(fncorp[i]>=0); 301 k = k+1; 302 } 303 } 304 } 305 else { 306 for (i=0; i<nshg ; i++) { 307 nodetoinsert = i; 308 for (j=1; j<=nflow; j++) { 309 indexsetary[k] = nodetoinsert*nflow+(j-1); 310 k = k+1; 311 } 312 } 313 } 314 315 for (i=0; i<nshg*nflow; i++) { 316 gblindexsetary[i] = i ; //TODO: better option for performance? 317 } 318 // Create Vector Index Maps 319 petsc_n = (PetscInt) nshg * (PetscInt) nflow; 320 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary, 321 // PETSC_COPY_VALUES, &LocalIndexSet); 322 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary, 323 PETSC_COPY_VALUES, &LocalIndexSet); 324 // printf("after 1st ISCreateGeneral %d\n", ierr); 325 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary, 326 // PETSC_COPY_VALUES, &GlobalIndexSet); 327 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary, 328 PETSC_COPY_VALUES, &GlobalIndexSet); 329 // printf("after 2nd ISCreateGeneral %d\n", ierr); 330 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping); 331 // printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr); 332 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping); 333 // printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr); 334 } 335 if(genpar.lhs == 1) { 336 get_time((duration), (duration+1)); 337 ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY); 338 ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY); 339 get_time((duration+2),(duration+3)); 340 get_max_time_diff((duration), (duration+2), 341 (duration+1), (duration+3), 342 "MatAssembly \0"); // char(0)) 343 get_time(duration, (duration+1)); 344 } 345 if(firstpetsccall == 1) { 346 ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol); 347 // TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol 348 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP); 349 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP); 350 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP); 351 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP); 352 } 353 ierr = VecZeroEntries(resP); 354 ierr = VecZeroEntries(DyP); 355 if(firstpetsccall == 1) { 356 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal); 357 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal); 358 //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal); 359 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal); 360 } 361 ierr = VecZeroEntries(resPGlobal); 362 // call VecZeroEntries(DyPLocal, ierr) 363 364 PetscRow=0; 365 k = 0; 366 int index; 367 for (i=0; i<nshg; i++ ){ 368 for (j = 1; j<=nflow; j++){ 369 index = i + (j-1)*nshg; 370 valtoinsert = res[index]; 371 if(workfc.numpe > 1) { 372 PetscRow = (fncorp[i]-1)*nflow+(j-1); 373 } 374 else { 375 PetscRow = i*nflow+(j-1); 376 } 377 assert(fncorp[i]<=nshgt); 378 assert(fncorp[i]>0); 379 assert(PetscRow>=0); 380 assert(PetscRow<=nshgt*nflow); 381 ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES); 382 } 383 } 384 // printf("after VecSetValue loop %d\n",ierr); 385 ierr = VecAssemblyBegin(resP); 386 ierr = VecAssemblyEnd(resP); 387 get_time((duration+2), (duration+3)); 388 get_max_time_diff((duration), (duration+2), 389 (duration+1), (duration+3), 390 "VectorWorkPre \0"); // char(0)) 391 392 get_time((duration),(duration+1)); 393 394 if(firstpetsccall == 1) { 395 ierr = KSPCreate(PETSC_COMM_WORLD, &ksp); 396 ierr = KSPSetOperators(ksp, lhsP, lhsP); 397 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN); 398 ierr = KSPGetPC(ksp, &pc); 399 ierr = PCSetType(pc, PCPBJACOBI); 400 PetscInt maxits; 401 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 402 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 403 ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 404 ierr = KSPSetFromOptions(ksp); 405 } 406 ierr = KSPSolve(ksp, resP, DyP); 407 get_time((duration+2),(duration+3)); 408 get_max_time_diff((duration), (duration+2), 409 (duration+1), (duration+3), 410 "KSPSolve \0"); // char(0)) 411 //if(myrank .eq. 0) then 412 //!write(*,"(A, E10.3)") "KSPSolve ", elapsed 413 //end if 414 get_time((duration),(duration+1)); 415 if(firstpetsccall == 1) { 416 ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7); 417 } 418 ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 419 ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 420 ierr = VecGetArray(DyPLocal, &parray); 421 PetscRow = 0; 422 for ( node = 0; node< nshg; node++) { 423 for (var = 1; var<= nflow; var++) { 424 index = node + (var-1)*nshg; 425 Dy[index] = parray[PetscRow]; 426 PetscRow = PetscRow+1; 427 } 428 } 429 ierr = VecRestoreArray(DyPLocal, &parray); 430 431 firstpetsccall = 0; 432 // later timer ('Solver '); 433 434 // .... output the statistics 435 // 436 itrpar.iKs=0; // see rstat() 437 PetscInt its; 438 //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs); 439 ierr = KSPGetIterationNumber(ksp, &its); 440 itrpar.iKs = (int) its; 441 get_time((duration+2),(duration+3)); 442 get_max_time_diff((duration), (duration+2), 443 (duration+1), (duration+3), 444 "solWork \0"); // char(0)) 445 itrpar.ntotGM += (int) its; 446 rstat (res, ilwork); 447 // 448 // .... stop the timer 449 // 450 // 3002 continue ! no solve just res. 451 // later call timer ('Back ') 452 // 453 // .... end 454 // 455 } 456 457 void SolGMRpSclr(double* y, double* ac, 458 double* x, double* elDw, int* iBC, double* BC, 459 int* col, int* row, 460 int* iper, 461 int* ilwork, double* shp, double* shgl, double* shpb, 462 double* shglb, double* res, double* Dy, long long int* fncorp) 463 { 464 // 465 // ---------------------------------------------------------------------- 466 // 467 // This is the preconditioned GMRES driver routine. 468 // 469 // input: 470 // y (nshg,ndof) : Y-variables at n+alpha_v 471 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 472 // yold (nshg,ndof) : Y-variables at beginning of step 473 // acold (nshg,ndof) : Primvar. accel. variable at begng step 474 // x (numnp,nsd) : node coordinates 475 // iBC (nshg) : BC codes 476 // BC (nshg,ndofBC) : BC constraint parameters 477 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 478 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 479 // 480 // ---------------------------------------------------------------------- 481 // 482 483 484 // Get variables from common_c.h 485 int nshg, nflow, nsd, iownnodes; 486 nshg = conpar.nshg; 487 nsd = NSD; 488 iownnodes = conpar.iownnodes; 489 490 gcorp_t nshgt; 491 gcorp_t mbeg; 492 gcorp_t mend; 493 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 494 mbeg = (gcorp_t) newdim.minowned; 495 mend = (gcorp_t) newdim.maxowned; 496 497 498 int node, element, var, eqn; 499 double valtoinsert; 500 int nenl, iel, lelCat, lcsyst, iorder, nshl; 501 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 502 double DyFlats[nshg]; 503 double DyFlatPhastas[nshg]; 504 double rmes[nshg]; 505 // DEBUG 506 int i,j,k,l,m; 507 PetscInt indexsetarys[nshg]; 508 PetscInt gblindexsetarys[nshg]; 509 PetscInt nodetoinsert; 510 511 double real_rtol, real_abstol, real_dtol; 512 double *parray; 513 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 514 PetscInt petsc_n; 515 PetscOne = 1; 516 uint64_t duration[4]; 517 518 519 // 520 // 521 // .... *******************>> Element Data Formation <<****************** 522 // 523 // 524 // .... set the parameters for flux and surface tension calculations 525 // 526 // 527 genpar.idflx = 0 ; 528 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 529 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 530 531 532 533 gcorp_t glbNZ; 534 535 if(firstpetsccalls == 1) { 536 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 537 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 538 for(i=0;i<iownnodes;i++) { 539 idiagnz[i]=0; 540 iodiagnz[i]=0; 541 } 542 i=0; 543 for(k=0;k<nshg;k++) { 544 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 545 // this node is not owned by this rank so we skip 546 } else { 547 for(j=col[i]-1;j<col[i+1]-1;j++) { 548 glbNZ=fncorp[row[j]-1]; 549 if((glbNZ < mbeg) || (glbNZ > mend)) { 550 iodiagnz[i]++; 551 } else { 552 idiagnz[i]++; 553 } 554 } 555 i++; 556 } 557 } 558 gcorp_t mind=1000; 559 gcorp_t mino=1000; 560 gcorp_t maxd=0; 561 gcorp_t maxo=0; 562 for(i=0;i<iownnodes;i++) { 563 mind=min(mind,idiagnz[i]); 564 mino=min(mino,iodiagnz[i]); 565 maxd=max(maxd,idiagnz[i]); 566 maxo=max(maxo,iodiagnz[i]); 567 } 568 569 for(i=0;i<iownnodes;i++) { 570 iodiagnz[i]=1.3*maxd; 571 idiagnz[i]=1.3*maxd; 572 } 573 574 575 576 // } 577 // if(firstpetsccalls == 1) { 578 579 petsc_m = (PetscInt) iownnodes; 580 petsc_M = (PetscInt) nshgt; 581 petsc_PA = (PetscInt) 40; 582 583 ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M, 584 0, idiagnz, 0, iodiagnz, &lhsPs); 585 586 ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 587 588 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 589 #ifdef JEDBROWN 590 ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 591 #endif 592 ierr = MatSetUp(lhsPs); 593 594 PetscInt myMatStart, myMatEnd; 595 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 596 } 597 // MPI_Barrier(MPI_COMM_WORLD); 598 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 599 if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs); 600 601 get_time(duration, (duration+1)); 602 603 // MPI_Barrier(MPI_COMM_WORLD); 604 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 605 // for (i=0; i<nshg ; i++) { 606 // assert(fncorp[i]>0); 607 // } 608 609 ElmGMRPETScSclr(y, ac, x, elDw, 610 shp, shgl, iBC, 611 BC, shpb, 612 shglb, res, 613 rmes, iper, 614 ilwork, &lhsPs); 615 if(firstpetsccalls == 1) { 616 // Setup IndexSet. For now, we mimic vector insertion procedure 617 // Since we always reference by global indexes this doesn't matter 618 // except for cache performance) 619 // TODO: Better arrangment? 620 nodetoinsert = 0; 621 k=0; 622 if(workfc.numpe > 1) { 623 for (i=0; i<nshg ; i++) { 624 nodetoinsert = fncorp[i]-1; 625 indexsetarys[k] = nodetoinsert; 626 k = k+1; 627 } 628 } 629 else { 630 for (i=0; i<nshg ; i++) { 631 nodetoinsert = i; 632 indexsetarys[k] = nodetoinsert; 633 k = k+1; 634 } 635 } 636 637 for (i=0; i<nshg; i++) { 638 gblindexsetarys[i] = i ; //TODO: better option for performance? 639 } 640 // Create Vector Index Maps 641 petsc_n = (PetscInt) nshg; 642 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys, 643 PETSC_COPY_VALUES, &LocalIndexSets); 644 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys, 645 PETSC_COPY_VALUES, &GlobalIndexSets); 646 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings); 647 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings); 648 } 649 if(genpar.lhs ==1) { 650 ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY); 651 ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY); 652 } 653 if(firstpetsccalls == 1) { 654 ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol); 655 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs); 656 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs); 657 } 658 ierr = VecZeroEntries(resPs); 659 ierr = VecZeroEntries(DyPs); 660 if(firstpetsccalls == 1) { 661 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals); 662 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals); 663 } 664 ierr = VecZeroEntries(resPGlobals); 665 666 PetscRow=0; 667 k = 0; 668 int index; 669 for (i=0; i<nshg; i++ ){ 670 valtoinsert = res[i]; 671 if(workfc.numpe > 1) { 672 PetscRow = (fncorp[i]-1); 673 } 674 else { 675 PetscRow = i-1; 676 } 677 assert(fncorp[i]<=nshgt); 678 assert(fncorp[i]>0); 679 assert(PetscRow>=0); 680 assert(PetscRow<=nshgt); 681 ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES); 682 } 683 // printf("after VecSetValue loop %d\n",ierr); 684 ierr = VecAssemblyBegin(resPs); 685 ierr = VecAssemblyEnd(resPs); 686 687 if(firstpetsccalls == 1) { 688 ierr = KSPCreate(PETSC_COMM_WORLD, &ksps); 689 ierr = KSPSetOperators(ksps, lhsPs, lhsPs); 690 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN); 691 ierr = KSPGetPC(ksps, &pcs); 692 ierr = PCSetType(pcs, PCPBJACOBI); 693 PetscInt maxits; 694 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 695 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 696 ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 697 ierr = KSPSetFromOptions(ksps); 698 } 699 ierr = KSPSolve(ksps, resPs, DyPs); 700 if(firstpetsccalls == 1) { 701 ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s); 702 } 703 ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 704 ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 705 ierr = VecGetArray(DyPLocals, &parray); 706 PetscRow = 0; 707 for ( node = 0; node< nshg; node++) { 708 index = node; 709 Dy[index] = parray[PetscRow]; 710 PetscRow = PetscRow+1; 711 } 712 ierr = VecRestoreArray(DyPLocals, &parray); 713 714 firstpetsccalls = 0; 715 716 // .... output the statistics 717 // 718 // itrpar.iKss=0; // see rstat() 719 PetscInt its; 720 ierr = KSPGetIterationNumber(ksps, &its); 721 itrpar.iKss = (int) its; 722 itrpar.ntotGMs += (int) its; 723 rstatSclr (res, ilwork); 724 // 725 // .... end 726 // 727 } 728 #endif 729