1 #ifdef HAVE_PETSC 2 #include <stdio.h> 3 4 #include "petscsys.h" 5 #include "petscvec.h" 6 #include "petscmat.h" 7 #include "petscpc.h" 8 #include "petscksp.h" 9 10 #include "assert.h" 11 #include "common_c.h" 12 13 #include <FCMangle.h> 14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP) 15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR) 16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC) 17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR) 18 #define rstat FortranCInterface_GLOBAL_(rstat, RSTAT) 19 #define rstatSclr FortranCInterface_GLOBAL_(rstatsclr, RSTATSCLR) 20 #define min(a,b) (((a) < (b)) ? (a) : (b)) 21 #define max(a,b) (((a) > (b)) ? (a) : (b)) 22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME) 23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF) 24 25 typedef long long int gcorp_t; 26 27 void get_time(uint64_t* rv, uint64_t* cycle); 28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl); 29 30 //#include "auxmpi.h" 31 // static PetscOffset poff; 32 33 static Mat lhsP; 34 static PC pc; 35 static KSP ksp; 36 static Vec DyP, resP, DyPLocal, resPGlobal; 37 static PetscErrorCode ierr; 38 static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol; 39 static IS LocalIndexSet, GlobalIndexSet; 40 static ISLocalToGlobalMapping VectorMapping; 41 static ISLocalToGlobalMapping GblVectorMapping; 42 static VecScatter scatter7; 43 static int firstpetsccall = 1; 44 45 static Mat lhsPs; 46 static PC pcs; 47 static KSP ksps; 48 static Vec DyPs, resPs, DyPLocals, resPGlobals; 49 static IS LocalIndexSets, GlobalIndexSets; 50 static ISLocalToGlobalMapping VectorMappings; 51 static ISLocalToGlobalMapping GblVectorMappings; 52 static VecScatter scatter7s; 53 static int firstpetsccalls = 1; 54 55 void SolGMRp(double* y, double* ac, double* yold, 56 double* x, int* iBC, double* BC, 57 int* col, int* row, double* lhsk, 58 double* res, double* BDiag, 59 int* iper, 60 int* ilwork, double* shp, double* shgl, double* shpb, 61 double* shglb, double* Dy, double* rerr, long long int* fncorp) 62 { 63 // 64 // ---------------------------------------------------------------------- 65 // 66 // This is the preconditioned GMRES driver routine. 67 // 68 // input: 69 // y (nshg,ndof) : Y-variables at n+alpha_v 70 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 71 // yold (nshg,ndof) : Y-variables at beginning of step 72 // acold (nshg,ndof) : Primvar. accel. variable at begng step 73 // x (numnp,nsd) : node coordinates 74 // iBC (nshg) : BC codes 75 // BC (nshg,ndofBC) : BC constraint parameters 76 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 77 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 78 // 79 // output: 80 // res (nshg,nflow) : preconditioned residual 81 // BDiag (nshg,nflow,nflow) : block-diagonal preconditioner 82 // 83 // 84 // Zdenek Johan, Winter 1991. (Fortran 90) 85 // ---------------------------------------------------------------------- 86 // 87 88 89 // Get variables from common_c.h 90 int nshg, nflow, nsd, iownnodes; 91 nshg = conpar.nshg; 92 nflow = conpar.nflow; 93 nsd = NSD; 94 iownnodes = conpar.iownnodes; 95 96 gcorp_t nshgt; 97 gcorp_t mbeg; 98 gcorp_t mend; 99 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 100 mbeg = (gcorp_t) newdim.minowned; 101 mend = (gcorp_t) newdim.maxowned; 102 103 104 int node, element, var, eqn; 105 double valtoinsert; 106 int nenl, iel, lelCat, lcsyst, iorder, nshl; 107 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 108 double DyFlat[nshg*nflow]; 109 double DyFlatPhasta[nshg*nflow]; 110 double rmes[nshg*nflow]; 111 // DEBUG 112 int i,j,k,l,m; 113 114 // FIXME: PetscScalar 115 double real_rtol, real_abstol, real_dtol; 116 // /DEBUG 117 //double parray[1]; // Should be a PetscScalar 118 double *parray; // Should be a PetscScalar 119 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 120 PetscInt petsc_n; 121 PetscOne = 1; 122 uint64_t duration[4]; 123 124 // printf("%g %g %g \n", *rerr, *(rerr+1) , *(rerr+2)); 125 126 if(firstpetsccall == 1) { 127 /* call get_time(duration(1),duration(2)); 128 call system('sleep 10'); 129 call get_time(duration(3),duration(4)); 130 call get_max_time_diff(duration(1), duration(3), 131 duration(2), duration(4), 132 "TenSecondSleep " // char(0)) 133 if(myrank .eq. 0) { 134 !write(*,"(A, E10.3)") "TenSecondSleep ", elapsed 135 } 136 */ 137 } 138 // 139 // 140 // .... *******************>> Element Data Formation <<****************** 141 // 142 // 143 // .... set the parameters for flux and surface tension calculations 144 // 145 // 146 genpar.idflx = 0 ; 147 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 148 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 149 150 151 152 gcorp_t glbNZ; 153 154 if(firstpetsccall == 1) { 155 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 156 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 157 for(i=0;i<iownnodes;i++) { 158 idiagnz[i]=0; 159 iodiagnz[i]=0; 160 } 161 i=0; 162 for(k=0;k<nshg;k++) { 163 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 164 // this node is not owned by this rank so we skip 165 } else { 166 for(j=col[i]-1;j<col[i+1]-1;j++) { 167 // assert(row[j]<=nshg); 168 // assert(fncorp[row[j]-1]<=nshgt); 169 glbNZ=fncorp[row[j]-1]; 170 if((glbNZ < mbeg) || (glbNZ > mend)) { 171 iodiagnz[i]++; 172 } else { 173 idiagnz[i]++; 174 } 175 } 176 i++; 177 } 178 } 179 gcorp_t mind=1000; 180 gcorp_t mino=1000; 181 gcorp_t maxd=0; 182 gcorp_t maxo=0; 183 for(i=0;i<iownnodes;i++) { 184 mind=min(mind,idiagnz[i]); 185 mino=min(mino,iodiagnz[i]); 186 maxd=max(maxd,idiagnz[i]); 187 maxo=max(maxo,iodiagnz[i]); 188 // iodiagnz[i]=max(iodiagnz[i],10); 189 // idiagnz[i]=max(idiagnz[i],10); 190 // iodiagnz[i]=2*iodiagnz[i]; // estimate a bit higher for off-part interactions 191 // idiagnz[i]=2*idiagnz[i]; // estimate a bit higher for off-part interactions 192 } 193 // the above was pretty good but below is faster and not too much more memory...of course once you do this 194 // could just use the constant fill parameters in create but keep it alive for potential later optimization 195 196 for(i=0;i<iownnodes;i++) { 197 iodiagnz[i]=1.3*maxd; 198 idiagnz[i]=1.3*maxd; 199 } 200 201 202 203 if(workfc.numpe < 200){ 204 printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg); 205 printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo); 206 } 207 // Print debug info 208 if(nshgt < 200){ 209 int irank; 210 for(irank=0;irank<workfc.numpe;irank++) { 211 if(irank == workfc.myrank){ 212 printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank); 213 for(i=0;i<iownnodes;i++) { 214 printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]); 215 } 216 } 217 MPI_Barrier(MPI_COMM_WORLD); 218 } 219 } 220 // } 221 // if(firstpetsccall == 1) { 222 223 petsc_bs = (PetscInt) nflow; 224 petsc_m = (PetscInt) nflow* (PetscInt) iownnodes; 225 petsc_M = (PetscInt) nshgt * (PetscInt) nflow; 226 petsc_PA = (PetscInt) 40; 227 // TODO: fixe nshgt for 64 bit integer!!!!! 228 229 //ierr = MatCreateBAIJ(PETSC_COMM_WORLD, nflow, nflow*iownnodes, 230 // nflow*iownnodes, nshgt*nflow, nshgt*nflow, 0, 231 232 // this has no pre-allocate ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 233 // this has no pre-allocate 0, NULL, 0, NULL, &lhsP); 234 235 // next 2 do pre-allocate 236 237 // MPI_Barrier(MPI_COMM_WORLD); 238 // if(workfc.myrank ==0) printf("Before matCreateBAIJ petsc_bs,petsc_m,petsc_M,petsc_PA %ld %ld %ld %ld \n",petsc_bs,petsc_m,petsc_M,petsc_PA); 239 240 ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M, 241 0, idiagnz, 0, iodiagnz, &lhsP); 242 243 // petsc_PA, NULL, petsc_PA, NULL, &lhsP); 244 // MPI_Barrier(MPI_COMM_WORLD); 245 // if(workfc.myrank ==0) printf("Before matSetOoption 1 \n"); 246 ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 247 248 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 249 #ifdef JEDBROWN 250 ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 251 #endif 252 ierr = MatSetUp(lhsP); 253 254 PetscInt myMatStart, myMatEnd; 255 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 256 } 257 // MPI_Barrier(MPI_COMM_WORLD); 258 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 259 if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP); 260 261 get_time(duration, (duration+1)); 262 263 // MPI_Barrier(MPI_COMM_WORLD); 264 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 265 // for (i=0; i<nshg ; i++) { 266 // assert(fncorp[i]>0); 267 // } 268 269 ElmGMRPETSc(y, ac, x, 270 shp, shgl, iBC, 271 BC, shpb, 272 shglb, res, 273 rmes, iper, 274 ilwork, rerr , &lhsP); 275 get_time((duration+2), (duration+3)); 276 get_max_time_diff((duration), (duration+2), 277 (duration+1), (duration+3), 278 "ElmGMRPETSc \0"); // char(0)) 279 // printf("after ElmGMRPETSc\n"); 280 if(firstpetsccall == 1) { 281 // Setup IndexSet. For now, we mimic vector insertion procedure 282 // Since we always reference by global indexes this doesn't matter 283 // except for cache performance) 284 // TODO: Better arrangment? 285 //PetscInt indexsetary[nflow*nshg]; 286 PetscInt* indexsetary = malloc(sizeof(PetscInt)*nflow*nshg); 287 PetscInt* gblindexsetary = malloc(sizeof(PetscInt)*nflow*nshg); 288 PetscInt nodetoinsert; 289 nodetoinsert = 0; 290 k=0; 291 if(workfc.numpe > 1) { 292 for (i=0; i<nshg ; i++) { 293 nodetoinsert = fncorp[i]-1; 294 // if(nodetoinsert<0) printf("nodetoinsert <0! myrank: %d, value: %ld\n", workfc.myrank, nodetoinsert); 295 // assert(fncorp[i]>=0); 296 for (j=1; j<=nflow; j++) { 297 indexsetary[k] = nodetoinsert*nflow+(j-1); 298 assert(indexsetary[k]>=0); 299 // assert(fncorp[i]>=0); 300 k = k+1; 301 } 302 } 303 } 304 else { 305 for (i=0; i<nshg ; i++) { 306 nodetoinsert = i; 307 for (j=1; j<=nflow; j++) { 308 indexsetary[k] = nodetoinsert*nflow+(j-1); 309 k = k+1; 310 } 311 } 312 } 313 314 for (i=0; i<nshg*nflow; i++) { 315 gblindexsetary[i] = i ; //TODO: better option for performance? 316 } 317 // Create Vector Index Maps 318 petsc_n = (PetscInt) nshg * (PetscInt) nflow; 319 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, indexsetary, 320 // PETSC_COPY_VALUES, &LocalIndexSet); 321 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary, 322 PETSC_COPY_VALUES, &LocalIndexSet); 323 // printf("after 1st ISCreateGeneral %d\n", ierr); 324 //ierr = ISCreateGeneral(PETSC_COMM_SELF, nshg*nflow, gblindexsetary, 325 // PETSC_COPY_VALUES, &GlobalIndexSet); 326 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetary, 327 PETSC_COPY_VALUES, &GlobalIndexSet); 328 // printf("after 2nd ISCreateGeneral %d\n", ierr); 329 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSet,&VectorMapping); 330 // printf("after 1st ISLocalToGlobalMappingCreateIS %d\n",ierr); 331 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSet,&GblVectorMapping); 332 // printf("after 2nd ISLocalToGlobalMappingCreateIS %d\n",ierr); 333 free(indexsetary); 334 free(gblindexsetary); 335 } 336 if(genpar.lhs == 1) { 337 get_time((duration), (duration+1)); 338 ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY); 339 ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY); 340 get_time((duration+2),(duration+3)); 341 get_max_time_diff((duration), (duration+2), 342 (duration+1), (duration+3), 343 "MatAssembly \0"); // char(0)) 344 get_time(duration, (duration+1)); 345 } 346 if(firstpetsccall == 1) { 347 ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol); 348 // TODO Should this be LocalRow? LocalCol? or does LocalRow always == LocalCol 349 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resP); 350 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP); 351 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &DyP); 352 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP); 353 } 354 ierr = VecZeroEntries(resP); 355 ierr = VecZeroEntries(DyP); 356 if(firstpetsccall == 1) { 357 //ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, nshgt*nflow, &resPGlobal); 358 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobal); 359 //ierr = VecCreateSeq(PETSC_COMM_SELF, nshg*nflow, &DyPLocal); 360 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal); 361 } 362 ierr = VecZeroEntries(resPGlobal); 363 // call VecZeroEntries(DyPLocal, ierr) 364 365 PetscRow=0; 366 k = 0; 367 int index; 368 for (i=0; i<nshg; i++ ){ 369 for (j = 1; j<=nflow; j++){ 370 index = i + (j-1)*nshg; 371 valtoinsert = res[index]; 372 if(workfc.numpe > 1) { 373 PetscRow = (fncorp[i]-1)*nflow+(j-1); 374 } 375 else { 376 PetscRow = i*nflow+(j-1); 377 } 378 assert(fncorp[i]<=nshgt); 379 assert(fncorp[i]>0); 380 assert(PetscRow>=0); 381 assert(PetscRow<=nshgt*nflow); 382 ierr = VecSetValue(resP, PetscRow, valtoinsert, INSERT_VALUES); 383 } 384 } 385 // printf("after VecSetValue loop %d\n",ierr); 386 ierr = VecAssemblyBegin(resP); 387 ierr = VecAssemblyEnd(resP); 388 get_time((duration+2), (duration+3)); 389 get_max_time_diff((duration), (duration+2), 390 (duration+1), (duration+3), 391 "VectorWorkPre \0"); // char(0)) 392 393 get_time((duration),(duration+1)); 394 395 if(firstpetsccall == 1) { 396 ierr = KSPCreate(PETSC_COMM_WORLD, &ksp); 397 ierr = KSPSetOperators(ksp, lhsP, lhsP); 398 //3.4 ierr = KSPSetOperators(ksp, lhsP, lhsP, SAME_NONZERO_PATTERN); 399 ierr = KSPGetPC(ksp, &pc); 400 ierr = PCSetType(pc, PCPBJACOBI); 401 PetscInt maxits; 402 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 403 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 404 ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 405 ierr = KSPSetFromOptions(ksp); 406 } 407 ierr = KSPSolve(ksp, resP, DyP); 408 get_time((duration+2),(duration+3)); 409 get_max_time_diff((duration), (duration+2), 410 (duration+1), (duration+3), 411 "KSPSolve \0"); // char(0)) 412 //if(myrank .eq. 0) then 413 //!write(*,"(A, E10.3)") "KSPSolve ", elapsed 414 //end if 415 get_time((duration),(duration+1)); 416 if(firstpetsccall == 1) { 417 ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, GlobalIndexSet, &scatter7); 418 } 419 ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 420 ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD); 421 ierr = VecGetArray(DyPLocal, &parray); 422 PetscRow = 0; 423 for ( node = 0; node< nshg; node++) { 424 for (var = 1; var<= nflow; var++) { 425 index = node + (var-1)*nshg; 426 Dy[index] = parray[PetscRow]; 427 PetscRow = PetscRow+1; 428 } 429 } 430 ierr = VecRestoreArray(DyPLocal, &parray); 431 432 firstpetsccall = 0; 433 // later timer ('Solver '); 434 435 436 // .... output the statistics 437 // 438 itrpar.iKs=0; // see rstat() 439 PetscInt its; 440 //ierr = KSPGetIterationNumber(ksp, &itrpar.iKs); 441 ierr = KSPGetIterationNumber(ksp, &its); 442 itrpar.iKs = (int) its; 443 get_time((duration+2),(duration+3)); 444 get_max_time_diff((duration), (duration+2), 445 (duration+1), (duration+3), 446 "solWork \0"); // char(0)) 447 itrpar.ntotGM += (int) its; 448 rstat (res, ilwork); 449 // 450 // .... stop the timer 451 // 452 // 3002 continue ! no solve just res. 453 // later call timer ('Back ') 454 // 455 // .... end 456 // 457 } 458 459 void SolGMRpSclr(double* y, double* ac, 460 double* x, double* elDw, int* iBC, double* BC, 461 int* col, int* row, 462 int* iper, 463 int* ilwork, double* shp, double* shgl, double* shpb, 464 double* shglb, double* res, double* Dy, long long int* fncorp) 465 { 466 // 467 // ---------------------------------------------------------------------- 468 // 469 // This is the preconditioned GMRES driver routine. 470 // 471 // input: 472 // y (nshg,ndof) : Y-variables at n+alpha_v 473 // ac (nshg,ndof) : Primvar. accel. variable n+alpha_m 474 // yold (nshg,ndof) : Y-variables at beginning of step 475 // acold (nshg,ndof) : Primvar. accel. variable at begng step 476 // x (numnp,nsd) : node coordinates 477 // iBC (nshg) : BC codes 478 // BC (nshg,ndofBC) : BC constraint parameters 479 // shp(b) (nen,maxsh,melCat) : element shape functions (boundary) 480 // shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions 481 // 482 // ---------------------------------------------------------------------- 483 // 484 485 486 // Get variables from common_c.h 487 int nshg, nflow, nsd, iownnodes; 488 nshg = conpar.nshg; 489 nsd = NSD; 490 iownnodes = conpar.iownnodes; 491 492 gcorp_t nshgt; 493 gcorp_t mbeg; 494 gcorp_t mend; 495 nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h 496 mbeg = (gcorp_t) newdim.minowned; 497 mend = (gcorp_t) newdim.maxowned; 498 499 500 int node, element, var, eqn; 501 double valtoinsert; 502 int nenl, iel, lelCat, lcsyst, iorder, nshl; 503 int mattyp, ndofl, nsymdl, npro, ngauss, nppro; 504 double DyFlats[nshg]; 505 double DyFlatPhastas[nshg]; 506 double rmes[nshg]; 507 // DEBUG 508 int i,j,k,l,m; 509 510 double real_rtol, real_abstol, real_dtol; 511 double *parray; 512 PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA; 513 PetscInt petsc_n; 514 PetscOne = 1; 515 uint64_t duration[4]; 516 517 518 // 519 // 520 // .... *******************>> Element Data Formation <<****************** 521 // 522 // 523 // .... set the parameters for flux and surface tension calculations 524 // 525 // 526 genpar.idflx = 0 ; 527 if(genpar.idiff >= 1) genpar.idflx= genpar.idflx + (nflow-1) * nsd; 528 if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd; 529 530 531 532 gcorp_t glbNZ; 533 534 if(firstpetsccalls == 1) { 535 PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 536 PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes); 537 for(i=0;i<iownnodes;i++) { 538 idiagnz[i]=0; 539 iodiagnz[i]=0; 540 } 541 i=0; 542 for(k=0;k<nshg;k++) { 543 if((fncorp[k] < mbeg) || (fncorp[k] >mend)){ 544 // this node is not owned by this rank so we skip 545 } else { 546 for(j=col[i]-1;j<col[i+1]-1;j++) { 547 glbNZ=fncorp[row[j]-1]; 548 if((glbNZ < mbeg) || (glbNZ > mend)) { 549 iodiagnz[i]++; 550 } else { 551 idiagnz[i]++; 552 } 553 } 554 i++; 555 } 556 } 557 gcorp_t mind=1000; 558 gcorp_t mino=1000; 559 gcorp_t maxd=0; 560 gcorp_t maxo=0; 561 for(i=0;i<iownnodes;i++) { 562 mind=min(mind,idiagnz[i]); 563 mino=min(mino,iodiagnz[i]); 564 maxd=max(maxd,idiagnz[i]); 565 maxo=max(maxo,iodiagnz[i]); 566 } 567 568 for(i=0;i<iownnodes;i++) { 569 iodiagnz[i]=1.3*maxd; 570 idiagnz[i]=1.3*maxd; 571 } 572 573 574 575 // } 576 // if(firstpetsccalls == 1) { 577 578 petsc_m = (PetscInt) iownnodes; 579 petsc_M = (PetscInt) nshgt; 580 petsc_PA = (PetscInt) 40; 581 582 ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M, 583 0, idiagnz, 0, iodiagnz, &lhsPs); 584 585 ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE); 586 587 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call 588 #ifdef JEDBROWN 589 ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE); 590 #endif 591 ierr = MatSetUp(lhsPs); 592 593 PetscInt myMatStart, myMatEnd; 594 ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd); 595 } 596 // MPI_Barrier(MPI_COMM_WORLD); 597 // if(workfc.myrank ==0) printf("Before MatZeroEntries \n"); 598 if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs); 599 600 get_time(duration, (duration+1)); 601 602 // MPI_Barrier(MPI_COMM_WORLD); 603 // if(workfc.myrank ==0) printf("Before elmgmr \n"); 604 // for (i=0; i<nshg ; i++) { 605 // assert(fncorp[i]>0); 606 // } 607 608 ElmGMRPETScSclr(y, ac, x, elDw, 609 shp, shgl, iBC, 610 BC, shpb, 611 shglb, res, 612 rmes, iper, 613 ilwork, &lhsPs); 614 if(firstpetsccalls == 1) { 615 // Setup IndexSet. For now, we mimic vector insertion procedure 616 // Since we always reference by global indexes this doesn't matter 617 // except for cache performance) 618 // TODO: Better arrangment? 619 620 PetscInt* indexsetarys = malloc(sizeof(PetscInt)*nshg); 621 PetscInt* gblindexsetarys = malloc(sizeof(PetscInt)*nshg); 622 PetscInt nodetoinsert; 623 624 nodetoinsert = 0; 625 k=0; 626 if(workfc.numpe > 1) { 627 for (i=0; i<nshg ; i++) { 628 nodetoinsert = fncorp[i]-1; 629 indexsetarys[k] = nodetoinsert; 630 k = k+1; 631 } 632 } 633 else { 634 for (i=0; i<nshg ; i++) { 635 nodetoinsert = i; 636 indexsetarys[k] = nodetoinsert; 637 k = k+1; 638 } 639 } 640 641 for (i=0; i<nshg; i++) { 642 gblindexsetarys[i] = i ; //TODO: better option for performance? 643 } 644 // Create Vector Index Maps 645 petsc_n = (PetscInt) nshg; 646 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys, 647 PETSC_COPY_VALUES, &LocalIndexSets); 648 ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, gblindexsetarys, 649 PETSC_COPY_VALUES, &GlobalIndexSets); 650 ierr = ISLocalToGlobalMappingCreateIS(LocalIndexSets,&VectorMappings); 651 ierr = ISLocalToGlobalMappingCreateIS(GlobalIndexSets,&GblVectorMappings); 652 free(indexsetarys); 653 free(gblindexsetarys); 654 } 655 if(genpar.lhs ==1) { 656 ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY); 657 ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY); 658 } 659 if(firstpetsccalls == 1) { 660 ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol); 661 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs); 662 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs); 663 } 664 ierr = VecZeroEntries(resPs); 665 ierr = VecZeroEntries(DyPs); 666 if(firstpetsccalls == 1) { 667 ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPGlobals); 668 ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals); 669 } 670 ierr = VecZeroEntries(resPGlobals); 671 672 PetscRow=0; 673 k = 0; 674 int index; 675 for (i=0; i<nshg; i++ ){ 676 valtoinsert = res[i]; 677 if(workfc.numpe > 1) { 678 PetscRow = (fncorp[i]-1); 679 } 680 else { 681 PetscRow = i-1; 682 } 683 assert(fncorp[i]<=nshgt); 684 assert(fncorp[i]>0); 685 assert(PetscRow>=0); 686 assert(PetscRow<=nshgt); 687 ierr = VecSetValue(resPs, PetscRow, valtoinsert, INSERT_VALUES); 688 } 689 // printf("after VecSetValue loop %d\n",ierr); 690 ierr = VecAssemblyBegin(resPs); 691 ierr = VecAssemblyEnd(resPs); 692 693 if(firstpetsccalls == 1) { 694 ierr = KSPCreate(PETSC_COMM_WORLD, &ksps); 695 ierr = KSPSetOperators(ksps, lhsPs, lhsPs); 696 //3.4 ierr = KSPSetOperators(ksps, lhsPs, lhsPs, SAME_NONZERO_PATTERN); 697 ierr = KSPGetPC(ksps, &pcs); 698 ierr = PCSetType(pcs, PCPBJACOBI); 699 PetscInt maxits; 700 maxits = (PetscInt) solpar.nGMRES * (PetscInt) solpar.Kspace; 701 //ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, solpar.nGMRES*solpar.Kspace); 702 ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits); 703 ierr = KSPSetFromOptions(ksps); 704 } 705 ierr = KSPSolve(ksps, resPs, DyPs); 706 if(firstpetsccalls == 1) { 707 ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, GlobalIndexSets, &scatter7s); 708 } 709 ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 710 ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD); 711 ierr = VecGetArray(DyPLocals, &parray); 712 PetscRow = 0; 713 for ( node = 0; node< nshg; node++) { 714 index = node; 715 Dy[index] = parray[PetscRow]; 716 PetscRow = PetscRow+1; 717 } 718 ierr = VecRestoreArray(DyPLocals, &parray); 719 720 firstpetsccalls = 0; 721 722 // .... output the statistics 723 // 724 // itrpar.iKss=0; // see rstat() 725 PetscInt its; 726 ierr = KSPGetIterationNumber(ksps, &its); 727 itrpar.iKss = (int) its; 728 itrpar.ntotGMs += (int) its; 729 rstatSclr (res, ilwork); 730 // 731 // .... end 732 // 733 } 734 #endif 735