1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/baij/mpi/mpibaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 10 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 11 extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 12 extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 14 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 15 { 16 PetscInt i,N=C->cmap->N, bs=C->rmap->bs; 17 IS *is_new; 18 19 PetscFunctionBegin; 20 PetscCall(PetscMalloc1(imax,&is_new)); 21 /* Convert the indices into block format */ 22 PetscCall(ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new)); 23 PetscCheckFalse(ov < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 for (i=0; i<ov; ++i) { 25 PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new)); 26 } 27 for (i=0; i<imax; i++) PetscCall(ISDestroy(&is[i])); 28 PetscCall(ISExpandIndicesGeneral(N,N,bs,imax,is_new,is)); 29 for (i=0; i<imax; i++) PetscCall(ISDestroy(&is_new[i])); 30 PetscCall(PetscFree(is_new)); 31 PetscFunctionReturn(0); 32 } 33 34 /* 35 Sample message format: 36 If a processor A wants processor B to process some elements corresponding 37 to index sets is[1], is[5] 38 mesg [0] = 2 (no of index sets in the mesg) 39 ----------- 40 mesg [1] = 1 => is[1] 41 mesg [2] = sizeof(is[1]); 42 ----------- 43 mesg [5] = 5 => is[5] 44 mesg [6] = sizeof(is[5]); 45 ----------- 46 mesg [7] 47 mesg [n] data(is[1]) 48 ----------- 49 mesg[n+1] 50 mesg[m] data(is[5]) 51 ----------- 52 53 Notes: 54 nrqs - no of requests sent (or to be sent out) 55 nrqr - no of requests received (which have to be or which have been processed) 56 */ 57 PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[]) 58 { 59 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 60 const PetscInt **idx,*idx_i; 61 PetscInt *n,*w3,*w4,**data,len; 62 PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr; 63 PetscInt Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 64 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 65 PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2,proc=-1; 66 PetscBT *table; 67 MPI_Comm comm,*iscomms; 68 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 69 char *t_p; 70 71 PetscFunctionBegin; 72 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 73 size = c->size; 74 rank = c->rank; 75 Mbs = c->Mbs; 76 77 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1)); 78 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 79 80 PetscCall(PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n)); 81 82 for (i=0; i<imax; i++) { 83 PetscCall(ISGetIndices(is[i],&idx[i])); 84 PetscCall(ISGetLocalSize(is[i],&n[i])); 85 } 86 87 /* evaluate communication - mesg to who,length of mesg, and buffer space 88 required. Based on this, buffers are allocated, and data copied into them*/ 89 PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); 90 for (i=0; i<imax; i++) { 91 PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/ 92 idx_i = idx[i]; 93 len = n[i]; 94 for (j=0; j<len; j++) { 95 row = idx_i[j]; 96 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 97 PetscCall(PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc)); 98 w4[proc]++; 99 } 100 for (j=0; j<size; j++) { 101 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 102 } 103 } 104 105 nrqs = 0; /* no of outgoing messages */ 106 msz = 0; /* total mesg length (for all proc */ 107 w1[rank] = 0; /* no mesg sent to itself */ 108 w3[rank] = 0; 109 for (i=0; i<size; i++) { 110 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 111 } 112 /* pa - is list of processors to communicate with */ 113 PetscCall(PetscMalloc1(nrqs,&pa)); 114 for (i=0,j=0; i<size; i++) { 115 if (w1[i]) {pa[j] = i; j++;} 116 } 117 118 /* Each message would have a header = 1 + 2*(no of IS) + data */ 119 for (i=0; i<nrqs; i++) { 120 j = pa[i]; 121 w1[j] += w2[j] + 2*w3[j]; 122 msz += w1[j]; 123 } 124 125 /* Determine the number of messages to expect, their lengths, from from-ids */ 126 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 127 PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 128 129 /* Now post the Irecvs corresponding to these messages */ 130 PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1)); 131 132 /* Allocate Memory for outgoing messages */ 133 PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr)); 134 PetscCall(PetscArrayzero(outdat,size)); 135 PetscCall(PetscArrayzero(ptr,size)); 136 { 137 PetscInt *iptr = tmp,ict = 0; 138 for (i=0; i<nrqs; i++) { 139 j = pa[i]; 140 iptr += ict; 141 outdat[j] = iptr; 142 ict = w1[j]; 143 } 144 } 145 146 /* Form the outgoing messages */ 147 /*plug in the headers*/ 148 for (i=0; i<nrqs; i++) { 149 j = pa[i]; 150 outdat[j][0] = 0; 151 PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j])); 152 ptr[j] = outdat[j] + 2*w3[j] + 1; 153 } 154 155 /* Memory for doing local proc's work*/ 156 { 157 PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p)); 158 159 for (i=0; i<imax; i++) { 160 table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; 161 data[i] = d_p + (Mbs)*i; 162 } 163 } 164 165 /* Parse the IS and update local tables and the outgoing buf with the data*/ 166 { 167 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 168 PetscBT table_i; 169 170 for (i=0; i<imax; i++) { 171 PetscCall(PetscArrayzero(ctr,size)); 172 n_i = n[i]; 173 table_i = table[i]; 174 idx_i = idx[i]; 175 data_i = data[i]; 176 isz_i = isz[i]; 177 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 178 row = idx_i[j]; 179 PetscCall(PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc)); 180 if (proc != rank) { /* copy to the outgoing buffer */ 181 ctr[proc]++; 182 *ptr[proc] = row; 183 ptr[proc]++; 184 } else { /* Update the local table */ 185 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 186 } 187 } 188 /* Update the headers for the current IS */ 189 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 190 if ((ctr_j = ctr[j])) { 191 outdat_j = outdat[j]; 192 k = ++outdat_j[0]; 193 outdat_j[2*k] = ctr_j; 194 outdat_j[2*k-1] = i; 195 } 196 } 197 isz[i] = isz_i; 198 } 199 } 200 201 /* Now post the sends */ 202 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 203 for (i=0; i<nrqs; ++i) { 204 j = pa[i]; 205 PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i)); 206 } 207 208 /* No longer need the original indices*/ 209 for (i=0; i<imax; ++i) { 210 PetscCall(ISRestoreIndices(is[i],idx+i)); 211 } 212 PetscCall(PetscFree2(*(PetscInt***)&idx,n)); 213 214 PetscCall(PetscMalloc1(imax,&iscomms)); 215 for (i=0; i<imax; ++i) { 216 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL)); 217 PetscCall(ISDestroy(&is[i])); 218 } 219 220 /* Do Local work*/ 221 PetscCall(MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data)); 222 223 /* Receive messages*/ 224 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE)); 225 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 226 227 /* Phase 1 sends are complete - deallocate buffers */ 228 PetscCall(PetscFree4(outdat,ptr,tmp,ctr)); 229 PetscCall(PetscFree4(w1,w2,w3,w4)); 230 231 PetscCall(PetscMalloc1(nrqr,&xdata)); 232 PetscCall(PetscMalloc1(nrqr,&isz1)); 233 PetscCall(MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1)); 234 if (rbuf) { 235 PetscCall(PetscFree(rbuf[0])); 236 PetscCall(PetscFree(rbuf)); 237 } 238 239 /* Send the data back*/ 240 /* Do a global reduction to know the buffer space req for incoming messages*/ 241 { 242 PetscMPIInt *rw1; 243 244 PetscCall(PetscCalloc1(size,&rw1)); 245 246 for (i=0; i<nrqr; ++i) { 247 proc = onodes1[i]; 248 rw1[proc] = isz1[i]; 249 } 250 251 /* Determine the number of messages to expect, their lengths, from from-ids */ 252 PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2)); 253 PetscCall(PetscFree(rw1)); 254 } 255 /* Now post the Irecvs corresponding to these messages */ 256 PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2)); 257 258 /* Now post the sends */ 259 PetscCall(PetscMalloc1(nrqr,&s_waits2)); 260 for (i=0; i<nrqr; ++i) { 261 j = onodes1[i]; 262 PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i)); 263 } 264 265 PetscCall(PetscFree(onodes1)); 266 PetscCall(PetscFree(olengths1)); 267 268 /* receive work done on other processors*/ 269 { 270 PetscMPIInt idex; 271 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 272 PetscBT table_i; 273 274 for (i=0; i<nrqs; ++i) { 275 PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE)); 276 /* Process the message*/ 277 rbuf2_i = rbuf2[idex]; 278 ct1 = 2*rbuf2_i[0]+1; 279 jmax = rbuf2[idex][0]; 280 for (j=1; j<=jmax; j++) { 281 max = rbuf2_i[2*j]; 282 is_no = rbuf2_i[2*j-1]; 283 isz_i = isz[is_no]; 284 data_i = data[is_no]; 285 table_i = table[is_no]; 286 for (k=0; k<max; k++,ct1++) { 287 row = rbuf2_i[ct1]; 288 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 289 } 290 isz[is_no] = isz_i; 291 } 292 } 293 PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 294 } 295 296 for (i=0; i<imax; ++i) { 297 PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i)); 298 PetscCall(PetscCommDestroy(&iscomms[i])); 299 } 300 301 PetscCall(PetscFree(iscomms)); 302 PetscCall(PetscFree(onodes2)); 303 PetscCall(PetscFree(olengths2)); 304 305 PetscCall(PetscFree(pa)); 306 if (rbuf2) { 307 PetscCall(PetscFree(rbuf2[0])); 308 PetscCall(PetscFree(rbuf2)); 309 } 310 PetscCall(PetscFree(s_waits1)); 311 PetscCall(PetscFree(r_waits1)); 312 PetscCall(PetscFree(s_waits2)); 313 PetscCall(PetscFree(r_waits2)); 314 PetscCall(PetscFree5(table,data,isz,d_p,t_p)); 315 if (xdata) { 316 PetscCall(PetscFree(xdata[0])); 317 PetscCall(PetscFree(xdata)); 318 } 319 PetscCall(PetscFree(isz1)); 320 PetscFunctionReturn(0); 321 } 322 323 /* 324 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 325 the work on the local processor. 326 327 Inputs: 328 C - MAT_MPIBAIJ; 329 imax - total no of index sets processed at a time; 330 table - an array of char - size = Mbs bits. 331 332 Output: 333 isz - array containing the count of the solution elements corresponding 334 to each index set; 335 data - pointer to the solutions 336 */ 337 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 338 { 339 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 340 Mat A = c->A,B = c->B; 341 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 342 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 343 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 344 PetscBT table_i; 345 346 PetscFunctionBegin; 347 rstart = c->rstartbs; 348 cstart = c->cstartbs; 349 ai = a->i; 350 aj = a->j; 351 bi = b->i; 352 bj = b->j; 353 garray = c->garray; 354 355 for (i=0; i<imax; i++) { 356 data_i = data[i]; 357 table_i = table[i]; 358 isz_i = isz[i]; 359 for (j=0,max=isz[i]; j<max; j++) { 360 row = data_i[j] - rstart; 361 start = ai[row]; 362 end = ai[row+1]; 363 for (k=start; k<end; k++) { /* Amat */ 364 val = aj[k] + cstart; 365 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 366 } 367 start = bi[row]; 368 end = bi[row+1]; 369 for (k=start; k<end; k++) { /* Bmat */ 370 val = garray[bj[k]]; 371 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 372 } 373 } 374 isz[i] = isz_i; 375 } 376 PetscFunctionReturn(0); 377 } 378 /* 379 MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages, 380 and return the output 381 382 Input: 383 C - the matrix 384 nrqr - no of messages being processed. 385 rbuf - an array of pointers to the received requests 386 387 Output: 388 xdata - array of messages to be sent back 389 isz1 - size of each message 390 391 For better efficiency perhaps we should malloc separately each xdata[i], 392 then if a remalloc is required we need only copy the data for that one row 393 rather than all previous rows as it is now where a single large chunk of 394 memory is used. 395 396 */ 397 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 398 { 399 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 400 Mat A = c->A,B = c->B; 401 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 402 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 403 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 404 PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr; 405 PetscInt *rbuf_i,kmax,rbuf_0; 406 PetscBT xtable; 407 408 PetscFunctionBegin; 409 Mbs = c->Mbs; 410 rstart = c->rstartbs; 411 cstart = c->cstartbs; 412 ai = a->i; 413 aj = a->j; 414 bi = b->i; 415 bj = b->j; 416 garray = c->garray; 417 418 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 419 rbuf_i = rbuf[i]; 420 rbuf_0 = rbuf_i[0]; 421 ct += rbuf_0; 422 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 423 } 424 425 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 426 else max1 = 1; 427 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 428 if (nrqr) { 429 PetscCall(PetscMalloc1(mem_estimate,&xdata[0])); 430 ++no_malloc; 431 } 432 PetscCall(PetscBTCreate(Mbs,&xtable)); 433 PetscCall(PetscArrayzero(isz1,nrqr)); 434 435 ct3 = 0; 436 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 437 rbuf_i = rbuf[i]; 438 rbuf_0 = rbuf_i[0]; 439 ct1 = 2*rbuf_0+1; 440 ct2 = ct1; 441 ct3 += ct1; 442 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 443 PetscCall(PetscBTMemzero(Mbs,xtable)); 444 oct2 = ct2; 445 kmax = rbuf_i[2*j]; 446 for (k=0; k<kmax; k++,ct1++) { 447 row = rbuf_i[ct1]; 448 if (!PetscBTLookupSet(xtable,row)) { 449 if (!(ct3 < mem_estimate)) { 450 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 451 PetscCall(PetscMalloc1(new_estimate,&tmp)); 452 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 453 PetscCall(PetscFree(xdata[0])); 454 xdata[0] = tmp; 455 mem_estimate = new_estimate; ++no_malloc; 456 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 457 } 458 xdata[i][ct2++] = row; 459 ct3++; 460 } 461 } 462 for (k=oct2,max2=ct2; k<max2; k++) { 463 row = xdata[i][k] - rstart; 464 start = ai[row]; 465 end = ai[row+1]; 466 for (l=start; l<end; l++) { 467 val = aj[l] + cstart; 468 if (!PetscBTLookupSet(xtable,val)) { 469 if (!(ct3 < mem_estimate)) { 470 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 471 PetscCall(PetscMalloc1(new_estimate,&tmp)); 472 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 473 PetscCall(PetscFree(xdata[0])); 474 xdata[0] = tmp; 475 mem_estimate = new_estimate; ++no_malloc; 476 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 477 } 478 xdata[i][ct2++] = val; 479 ct3++; 480 } 481 } 482 start = bi[row]; 483 end = bi[row+1]; 484 for (l=start; l<end; l++) { 485 val = garray[bj[l]]; 486 if (!PetscBTLookupSet(xtable,val)) { 487 if (!(ct3 < mem_estimate)) { 488 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 489 PetscCall(PetscMalloc1(new_estimate,&tmp)); 490 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 491 PetscCall(PetscFree(xdata[0])); 492 xdata[0] = tmp; 493 mem_estimate = new_estimate; ++no_malloc; 494 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 495 } 496 xdata[i][ct2++] = val; 497 ct3++; 498 } 499 } 500 } 501 /* Update the header*/ 502 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 503 xdata[i][2*j-1] = rbuf_i[2*j-1]; 504 } 505 xdata[i][0] = rbuf_0; 506 if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; 507 isz1[i] = ct2; /* size of each message */ 508 } 509 PetscCall(PetscBTDestroy(&xtable)); 510 PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT ", no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc)); 511 PetscFunctionReturn(0); 512 } 513 514 PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 515 { 516 IS *isrow_block,*iscol_block; 517 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 518 PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs; 519 Mat_SeqBAIJ *subc; 520 Mat_SubSppt *smat; 521 522 PetscFunctionBegin; 523 /* The compression and expansion should be avoided. Doesn't point 524 out errors, might change the indices, hence buggey */ 525 PetscCall(PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block)); 526 PetscCall(ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block)); 527 PetscCall(ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block)); 528 529 /* Determine the number of stages through which submatrices are done */ 530 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 531 else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt)); 532 if (!nmax) nmax = 1; 533 534 if (scall == MAT_INITIAL_MATRIX) { 535 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 536 537 /* Make sure every processor loops through the nstages */ 538 PetscCall(MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C))); 539 540 /* Allocate memory to hold all the submatrices and dummy submatrices */ 541 PetscCall(PetscCalloc1(ismax+nstages,submat)); 542 } else { /* MAT_REUSE_MATRIX */ 543 if (ismax) { 544 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 545 smat = subc->submatis1; 546 } else { /* (*submat)[0] is a dummy matrix */ 547 smat = (Mat_SubSppt*)(*submat)[0]->data; 548 } 549 PetscCheck(smat,PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat"); 550 nstages = smat->nstages; 551 } 552 553 for (i=0,pos=0; i<nstages; i++) { 554 if (pos+nmax <= ismax) max_no = nmax; 555 else if (pos >= ismax) max_no = 0; 556 else max_no = ismax-pos; 557 558 PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos)); 559 if (!max_no) { 560 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 561 smat = (Mat_SubSppt*)(*submat)[pos]->data; 562 smat->nstages = nstages; 563 } 564 pos++; /* advance to next dummy matrix if any */ 565 } else pos += max_no; 566 } 567 568 if (scall == MAT_INITIAL_MATRIX && ismax) { 569 /* save nstages for reuse */ 570 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 571 smat = subc->submatis1; 572 smat->nstages = nstages; 573 } 574 575 for (i=0; i<ismax; i++) { 576 PetscCall(ISDestroy(&isrow_block[i])); 577 PetscCall(ISDestroy(&iscol_block[i])); 578 } 579 PetscCall(PetscFree2(isrow_block,iscol_block)); 580 PetscFunctionReturn(0); 581 } 582 583 #if defined(PETSC_USE_CTABLE) 584 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank) 585 { 586 PetscInt nGlobalNd = proc_gnode[size]; 587 PetscMPIInt fproc; 588 589 PetscFunctionBegin; 590 PetscCall(PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc)); 591 if (fproc > size) fproc = size; 592 while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) { 593 if (row < proc_gnode[fproc]) fproc--; 594 else fproc++; 595 } 596 *rank = fproc; 597 PetscFunctionReturn(0); 598 } 599 #endif 600 601 /* -------------------------------------------------------------------------*/ 602 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */ 603 PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 604 { 605 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 606 Mat A = c->A; 607 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc; 608 const PetscInt **icol,**irow; 609 PetscInt *nrow,*ncol,start; 610 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 611 PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 612 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 613 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 614 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 615 #if defined(PETSC_USE_CTABLE) 616 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 617 #else 618 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 619 #endif 620 const PetscInt *irow_i,*icol_i; 621 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 622 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 623 MPI_Request *r_waits4,*s_waits3,*s_waits4; 624 MPI_Comm comm; 625 PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i; 626 PetscMPIInt *onodes1,*olengths1,end; 627 PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i; 628 Mat_SubSppt *smat_i; 629 PetscBool *issorted,colflag,iscsorted=PETSC_TRUE; 630 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 631 PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs; 632 PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */ 633 PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB; 634 PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a; 635 PetscInt cstart = c->cstartbs,*bmap = c->garray; 636 PetscBool *allrows,*allcolumns; 637 638 PetscFunctionBegin; 639 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 640 size = c->size; 641 rank = c->rank; 642 643 PetscCall(PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows)); 644 PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted)); 645 646 for (i=0; i<ismax; i++) { 647 PetscCall(ISSorted(iscol[i],&issorted[i])); 648 if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */ 649 PetscCall(ISSorted(isrow[i],&issorted[i])); 650 651 /* Check for special case: allcolumns */ 652 PetscCall(ISIdentity(iscol[i],&colflag)); 653 PetscCall(ISGetLocalSize(iscol[i],&ncol[i])); 654 655 if (colflag && ncol[i] == c->Nbs) { 656 allcolumns[i] = PETSC_TRUE; 657 icol[i] = NULL; 658 } else { 659 allcolumns[i] = PETSC_FALSE; 660 PetscCall(ISGetIndices(iscol[i],&icol[i])); 661 } 662 663 /* Check for special case: allrows */ 664 PetscCall(ISIdentity(isrow[i],&colflag)); 665 PetscCall(ISGetLocalSize(isrow[i],&nrow[i])); 666 if (colflag && nrow[i] == c->Mbs) { 667 allrows[i] = PETSC_TRUE; 668 irow[i] = NULL; 669 } else { 670 allrows[i] = PETSC_FALSE; 671 PetscCall(ISGetIndices(isrow[i],&irow[i])); 672 } 673 } 674 675 if (scall == MAT_REUSE_MATRIX) { 676 /* Assumes new rows are same length as the old rows */ 677 for (i=0; i<ismax; i++) { 678 subc = (Mat_SeqBAIJ*)(submats[i]->data); 679 PetscCheckFalse(subc->mbs != nrow[i] || subc->nbs != ncol[i],PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 680 681 /* Initial matrix as if empty */ 682 PetscCall(PetscArrayzero(subc->ilen,subc->mbs)); 683 684 /* Initial matrix as if empty */ 685 submats[i]->factortype = C->factortype; 686 687 smat_i = subc->submatis1; 688 689 nrqs = smat_i->nrqs; 690 nrqr = smat_i->nrqr; 691 rbuf1 = smat_i->rbuf1; 692 rbuf2 = smat_i->rbuf2; 693 rbuf3 = smat_i->rbuf3; 694 req_source2 = smat_i->req_source2; 695 696 sbuf1 = smat_i->sbuf1; 697 sbuf2 = smat_i->sbuf2; 698 ptr = smat_i->ptr; 699 tmp = smat_i->tmp; 700 ctr = smat_i->ctr; 701 702 pa = smat_i->pa; 703 req_size = smat_i->req_size; 704 req_source1 = smat_i->req_source1; 705 706 allcolumns[i] = smat_i->allcolumns; 707 allrows[i] = smat_i->allrows; 708 row2proc[i] = smat_i->row2proc; 709 rmap[i] = smat_i->rmap; 710 cmap[i] = smat_i->cmap; 711 } 712 713 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 714 PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 715 smat_i = (Mat_SubSppt*)submats[0]->data; 716 717 nrqs = smat_i->nrqs; 718 nrqr = smat_i->nrqr; 719 rbuf1 = smat_i->rbuf1; 720 rbuf2 = smat_i->rbuf2; 721 rbuf3 = smat_i->rbuf3; 722 req_source2 = smat_i->req_source2; 723 724 sbuf1 = smat_i->sbuf1; 725 sbuf2 = smat_i->sbuf2; 726 ptr = smat_i->ptr; 727 tmp = smat_i->tmp; 728 ctr = smat_i->ctr; 729 730 pa = smat_i->pa; 731 req_size = smat_i->req_size; 732 req_source1 = smat_i->req_source1; 733 734 allcolumns[0] = PETSC_FALSE; 735 } 736 } else { /* scall == MAT_INITIAL_MATRIX */ 737 /* Get some new tags to keep the communication clean */ 738 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 739 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3)); 740 741 /* evaluate communication - mesg to who, length of mesg, and buffer space 742 required. Based on this, buffers are allocated, and data copied into them*/ 743 PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); /* mesg size, initialize work vectors */ 744 745 for (i=0; i<ismax; i++) { 746 jmax = nrow[i]; 747 irow_i = irow[i]; 748 749 PetscCall(PetscMalloc1(jmax,&row2proc_i)); 750 row2proc[i] = row2proc_i; 751 752 if (issorted[i]) proc = 0; 753 for (j=0; j<jmax; j++) { 754 if (!issorted[i]) proc = 0; 755 if (allrows[i]) row = j; 756 else row = irow_i[j]; 757 758 while (row >= c->rangebs[proc+1]) proc++; 759 w4[proc]++; 760 row2proc_i[j] = proc; /* map row index to proc */ 761 } 762 for (j=0; j<size; j++) { 763 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 764 } 765 } 766 767 nrqs = 0; /* no of outgoing messages */ 768 msz = 0; /* total mesg length (for all procs) */ 769 w1[rank] = 0; /* no mesg sent to self */ 770 w3[rank] = 0; 771 for (i=0; i<size; i++) { 772 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 773 } 774 PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 775 for (i=0,j=0; i<size; i++) { 776 if (w1[i]) { pa[j] = i; j++; } 777 } 778 779 /* Each message would have a header = 1 + 2*(no of IS) + data */ 780 for (i=0; i<nrqs; i++) { 781 j = pa[i]; 782 w1[j] += w2[j] + 2* w3[j]; 783 msz += w1[j]; 784 } 785 PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 786 787 /* Determine the number of messages to expect, their lengths, from from-ids */ 788 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 789 PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 790 791 /* Now post the Irecvs corresponding to these messages */ 792 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0)); 793 PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 794 795 /* Allocate Memory for outgoing messages */ 796 PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 797 PetscCall(PetscArrayzero(sbuf1,size)); 798 PetscCall(PetscArrayzero(ptr,size)); 799 800 { 801 PetscInt *iptr = tmp; 802 k = 0; 803 for (i=0; i<nrqs; i++) { 804 j = pa[i]; 805 iptr += k; 806 sbuf1[j] = iptr; 807 k = w1[j]; 808 } 809 } 810 811 /* Form the outgoing messages. Initialize the header space */ 812 for (i=0; i<nrqs; i++) { 813 j = pa[i]; 814 sbuf1[j][0] = 0; 815 PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j])); 816 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 817 } 818 819 /* Parse the isrow and copy data into outbuf */ 820 for (i=0; i<ismax; i++) { 821 row2proc_i = row2proc[i]; 822 PetscCall(PetscArrayzero(ctr,size)); 823 irow_i = irow[i]; 824 jmax = nrow[i]; 825 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 826 proc = row2proc_i[j]; 827 if (allrows[i]) row = j; 828 else row = irow_i[j]; 829 830 if (proc != rank) { /* copy to the outgoing buf*/ 831 ctr[proc]++; 832 *ptr[proc] = row; 833 ptr[proc]++; 834 } 835 } 836 /* Update the headers for the current IS */ 837 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 838 if ((ctr_j = ctr[j])) { 839 sbuf1_j = sbuf1[j]; 840 k = ++sbuf1_j[0]; 841 sbuf1_j[2*k] = ctr_j; 842 sbuf1_j[2*k-1] = i; 843 } 844 } 845 } 846 847 /* Now post the sends */ 848 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 849 for (i=0; i<nrqs; ++i) { 850 j = pa[i]; 851 PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i)); 852 } 853 854 /* Post Receives to capture the buffer size */ 855 PetscCall(PetscMalloc1(nrqs,&r_waits2)); 856 PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 857 if (nrqs) rbuf2[0] = tmp + msz; 858 for (i=1; i<nrqs; ++i) { 859 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 860 } 861 for (i=0; i<nrqs; ++i) { 862 j = pa[i]; 863 PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i)); 864 } 865 866 /* Send to other procs the buf size they should allocate */ 867 /* Receive messages*/ 868 PetscCall(PetscMalloc1(nrqr,&s_waits2)); 869 PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 870 871 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE)); 872 for (i=0; i<nrqr; ++i) { 873 req_size[i] = 0; 874 rbuf1_i = rbuf1[i]; 875 start = 2*rbuf1_i[0] + 1; 876 end = olengths1[i]; 877 PetscCall(PetscMalloc1(end,&sbuf2[i])); 878 sbuf2_i = sbuf2[i]; 879 for (j=start; j<end; j++) { 880 row = rbuf1_i[j] - rstart; 881 ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row]; 882 sbuf2_i[j] = ncols; 883 req_size[i] += ncols; 884 } 885 req_source1[i] = onodes1[i]; 886 /* form the header */ 887 sbuf2_i[0] = req_size[i]; 888 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 889 890 PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 891 } 892 893 PetscCall(PetscFree(onodes1)); 894 PetscCall(PetscFree(olengths1)); 895 896 PetscCall(PetscFree(r_waits1)); 897 PetscCall(PetscFree4(w1,w2,w3,w4)); 898 899 /* Receive messages*/ 900 PetscCall(PetscMalloc1(nrqs,&r_waits3)); 901 902 PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE)); 903 for (i=0; i<nrqs; ++i) { 904 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 905 req_source2[i] = pa[i]; 906 PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 907 } 908 PetscCall(PetscFree(r_waits2)); 909 910 /* Wait on sends1 and sends2 */ 911 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 912 PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 913 PetscCall(PetscFree(s_waits1)); 914 PetscCall(PetscFree(s_waits2)); 915 916 /* Now allocate sending buffers for a->j, and send them off */ 917 PetscCall(PetscMalloc1(nrqr,&sbuf_aj)); 918 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 919 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); 920 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 921 922 PetscCall(PetscMalloc1(nrqr,&s_waits3)); 923 { 924 925 for (i=0; i<nrqr; i++) { 926 rbuf1_i = rbuf1[i]; 927 sbuf_aj_i = sbuf_aj[i]; 928 ct1 = 2*rbuf1_i[0] + 1; 929 ct2 = 0; 930 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 931 kmax = rbuf1[i][2*j]; 932 for (k=0; k<kmax; k++,ct1++) { 933 row = rbuf1_i[ct1] - rstart; 934 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 935 ncols = nzA + nzB; 936 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 937 938 /* load the column indices for this row into cols */ 939 cols = sbuf_aj_i + ct2; 940 for (l=0; l<nzB; l++) { 941 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 942 else break; 943 } 944 imark = l; 945 for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];} 946 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 947 ct2 += ncols; 948 } 949 } 950 PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 951 } 952 } 953 954 /* create col map: global col of C -> local col of submatrices */ 955 #if defined(PETSC_USE_CTABLE) 956 for (i=0; i<ismax; i++) { 957 if (!allcolumns[i]) { 958 PetscCall(PetscTableCreate(ncol[i],c->Nbs,&cmap[i])); 959 960 jmax = ncol[i]; 961 icol_i = icol[i]; 962 cmap_i = cmap[i]; 963 for (j=0; j<jmax; j++) { 964 PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES)); 965 } 966 } else cmap[i] = NULL; 967 } 968 #else 969 for (i=0; i<ismax; i++) { 970 if (!allcolumns[i]) { 971 PetscCall(PetscCalloc1(c->Nbs,&cmap[i])); 972 jmax = ncol[i]; 973 icol_i = icol[i]; 974 cmap_i = cmap[i]; 975 for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1; 976 } else cmap[i] = NULL; 977 } 978 #endif 979 980 /* Create lens which is required for MatCreate... */ 981 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 982 PetscCall(PetscMalloc1(ismax,&lens)); 983 984 if (ismax) { 985 PetscCall(PetscCalloc1(j,&lens[0])); 986 } 987 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 988 989 /* Update lens from local data */ 990 for (i=0; i<ismax; i++) { 991 row2proc_i = row2proc[i]; 992 jmax = nrow[i]; 993 if (!allcolumns[i]) cmap_i = cmap[i]; 994 irow_i = irow[i]; 995 lens_i = lens[i]; 996 for (j=0; j<jmax; j++) { 997 if (allrows[i]) row = j; 998 else row = irow_i[j]; /* global blocked row of C */ 999 1000 proc = row2proc_i[j]; 1001 if (proc == rank) { 1002 /* Get indices from matA and then from matB */ 1003 #if defined(PETSC_USE_CTABLE) 1004 PetscInt tt; 1005 #endif 1006 row = row - rstart; 1007 nzA = a_i[row+1] - a_i[row]; 1008 nzB = b_i[row+1] - b_i[row]; 1009 cworkA = a_j + a_i[row]; 1010 cworkB = b_j + b_i[row]; 1011 1012 if (!allcolumns[i]) { 1013 #if defined(PETSC_USE_CTABLE) 1014 for (k=0; k<nzA; k++) { 1015 PetscCall(PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt)); 1016 if (tt) lens_i[j]++; 1017 } 1018 for (k=0; k<nzB; k++) { 1019 PetscCall(PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt)); 1020 if (tt) lens_i[j]++; 1021 } 1022 1023 #else 1024 for (k=0; k<nzA; k++) { 1025 if (cmap_i[cstart + cworkA[k]]) lens_i[j]++; 1026 } 1027 for (k=0; k<nzB; k++) { 1028 if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++; 1029 } 1030 #endif 1031 } else { /* allcolumns */ 1032 lens_i[j] = nzA + nzB; 1033 } 1034 } 1035 } 1036 } 1037 1038 /* Create row map: global row of C -> local row of submatrices */ 1039 for (i=0; i<ismax; i++) { 1040 if (!allrows[i]) { 1041 #if defined(PETSC_USE_CTABLE) 1042 PetscCall(PetscTableCreate(nrow[i],c->Mbs,&rmap[i])); 1043 irow_i = irow[i]; 1044 jmax = nrow[i]; 1045 for (j=0; j<jmax; j++) { 1046 if (allrows[i]) { 1047 PetscCall(PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES)); 1048 } else { 1049 PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES)); 1050 } 1051 } 1052 #else 1053 PetscCall(PetscCalloc1(c->Mbs,&rmap[i])); 1054 rmap_i = rmap[i]; 1055 irow_i = irow[i]; 1056 jmax = nrow[i]; 1057 for (j=0; j<jmax; j++) { 1058 if (allrows[i]) rmap_i[j] = j; 1059 else rmap_i[irow_i[j]] = j; 1060 } 1061 #endif 1062 } else rmap[i] = NULL; 1063 } 1064 1065 /* Update lens from offproc data */ 1066 { 1067 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1068 1069 PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE)); 1070 for (tmp2=0; tmp2<nrqs; tmp2++) { 1071 sbuf1_i = sbuf1[pa[tmp2]]; 1072 jmax = sbuf1_i[0]; 1073 ct1 = 2*jmax+1; 1074 ct2 = 0; 1075 rbuf2_i = rbuf2[tmp2]; 1076 rbuf3_i = rbuf3[tmp2]; 1077 for (j=1; j<=jmax; j++) { 1078 is_no = sbuf1_i[2*j-1]; 1079 max1 = sbuf1_i[2*j]; 1080 lens_i = lens[is_no]; 1081 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1082 rmap_i = rmap[is_no]; 1083 for (k=0; k<max1; k++,ct1++) { 1084 if (allrows[is_no]) { 1085 row = sbuf1_i[ct1]; 1086 } else { 1087 #if defined(PETSC_USE_CTABLE) 1088 PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row)); 1089 row--; 1090 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1091 #else 1092 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1093 #endif 1094 } 1095 max2 = rbuf2_i[ct1]; 1096 for (l=0; l<max2; l++,ct2++) { 1097 if (!allcolumns[is_no]) { 1098 #if defined(PETSC_USE_CTABLE) 1099 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 1100 #else 1101 tcol = cmap_i[rbuf3_i[ct2]]; 1102 #endif 1103 if (tcol) lens_i[row]++; 1104 } else { /* allcolumns */ 1105 lens_i[row]++; /* lens_i[row] += max2 ? */ 1106 } 1107 } 1108 } 1109 } 1110 } 1111 } 1112 PetscCall(PetscFree(r_waits3)); 1113 PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE)); 1114 PetscCall(PetscFree(s_waits3)); 1115 1116 /* Create the submatrices */ 1117 for (i=0; i<ismax; i++) { 1118 PetscInt bs_tmp; 1119 if (ijonly) bs_tmp = 1; 1120 else bs_tmp = bs; 1121 1122 PetscCall(MatCreate(PETSC_COMM_SELF,submats+i)); 1123 PetscCall(MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE)); 1124 1125 PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name)); 1126 PetscCall(MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i])); 1127 PetscCall(MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i])); /* this subroutine is used by SBAIJ routines */ 1128 1129 /* create struct Mat_SubSppt and attached it to submat */ 1130 PetscCall(PetscNew(&smat_i)); 1131 subc = (Mat_SeqBAIJ*)submats[i]->data; 1132 subc->submatis1 = smat_i; 1133 1134 smat_i->destroy = submats[i]->ops->destroy; 1135 submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ; 1136 submats[i]->factortype = C->factortype; 1137 1138 smat_i->id = i; 1139 smat_i->nrqs = nrqs; 1140 smat_i->nrqr = nrqr; 1141 smat_i->rbuf1 = rbuf1; 1142 smat_i->rbuf2 = rbuf2; 1143 smat_i->rbuf3 = rbuf3; 1144 smat_i->sbuf2 = sbuf2; 1145 smat_i->req_source2 = req_source2; 1146 1147 smat_i->sbuf1 = sbuf1; 1148 smat_i->ptr = ptr; 1149 smat_i->tmp = tmp; 1150 smat_i->ctr = ctr; 1151 1152 smat_i->pa = pa; 1153 smat_i->req_size = req_size; 1154 smat_i->req_source1 = req_source1; 1155 1156 smat_i->allcolumns = allcolumns[i]; 1157 smat_i->allrows = allrows[i]; 1158 smat_i->singleis = PETSC_FALSE; 1159 smat_i->row2proc = row2proc[i]; 1160 smat_i->rmap = rmap[i]; 1161 smat_i->cmap = cmap[i]; 1162 } 1163 1164 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 1165 PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0])); 1166 PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE)); 1167 PetscCall(MatSetType(submats[0],MATDUMMY)); 1168 1169 /* create struct Mat_SubSppt and attached it to submat */ 1170 PetscCall(PetscNewLog(submats[0],&smat_i)); 1171 submats[0]->data = (void*)smat_i; 1172 1173 smat_i->destroy = submats[0]->ops->destroy; 1174 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 1175 submats[0]->factortype = C->factortype; 1176 1177 smat_i->id = 0; 1178 smat_i->nrqs = nrqs; 1179 smat_i->nrqr = nrqr; 1180 smat_i->rbuf1 = rbuf1; 1181 smat_i->rbuf2 = rbuf2; 1182 smat_i->rbuf3 = rbuf3; 1183 smat_i->sbuf2 = sbuf2; 1184 smat_i->req_source2 = req_source2; 1185 1186 smat_i->sbuf1 = sbuf1; 1187 smat_i->ptr = ptr; 1188 smat_i->tmp = tmp; 1189 smat_i->ctr = ctr; 1190 1191 smat_i->pa = pa; 1192 smat_i->req_size = req_size; 1193 smat_i->req_source1 = req_source1; 1194 1195 smat_i->allcolumns = PETSC_FALSE; 1196 smat_i->singleis = PETSC_FALSE; 1197 smat_i->row2proc = NULL; 1198 smat_i->rmap = NULL; 1199 smat_i->cmap = NULL; 1200 } 1201 1202 if (ismax) PetscCall(PetscFree(lens[0])); 1203 PetscCall(PetscFree(lens)); 1204 if (sbuf_aj) { 1205 PetscCall(PetscFree(sbuf_aj[0])); 1206 PetscCall(PetscFree(sbuf_aj)); 1207 } 1208 1209 } /* endof scall == MAT_INITIAL_MATRIX */ 1210 1211 /* Post recv matrix values */ 1212 if (!ijonly) { 1213 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4)); 1214 PetscCall(PetscMalloc1(nrqs,&rbuf4)); 1215 PetscCall(PetscMalloc1(nrqs,&r_waits4)); 1216 for (i=0; i<nrqs; ++i) { 1217 PetscCall(PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i])); 1218 PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 1219 } 1220 1221 /* Allocate sending buffers for a->a, and send them off */ 1222 PetscCall(PetscMalloc1(nrqr,&sbuf_aa)); 1223 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1224 1225 if (nrqr) PetscCall(PetscMalloc1(j*bs2,&sbuf_aa[0])); 1226 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1227 1228 PetscCall(PetscMalloc1(nrqr,&s_waits4)); 1229 1230 for (i=0; i<nrqr; i++) { 1231 rbuf1_i = rbuf1[i]; 1232 sbuf_aa_i = sbuf_aa[i]; 1233 ct1 = 2*rbuf1_i[0]+1; 1234 ct2 = 0; 1235 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1236 kmax = rbuf1_i[2*j]; 1237 for (k=0; k<kmax; k++,ct1++) { 1238 row = rbuf1_i[ct1] - rstart; 1239 nzA = a_i[row+1] - a_i[row]; 1240 nzB = b_i[row+1] - b_i[row]; 1241 ncols = nzA + nzB; 1242 cworkB = b_j + b_i[row]; 1243 vworkA = a_a + a_i[row]*bs2; 1244 vworkB = b_a + b_i[row]*bs2; 1245 1246 /* load the column values for this row into vals*/ 1247 vals = sbuf_aa_i+ct2*bs2; 1248 for (l=0; l<nzB; l++) { 1249 if ((bmap[cworkB[l]]) < cstart) { 1250 PetscCall(PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2)); 1251 } else break; 1252 } 1253 imark = l; 1254 for (l=0; l<nzA; l++) { 1255 PetscCall(PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2)); 1256 } 1257 for (l=imark; l<nzB; l++) { 1258 PetscCall(PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2)); 1259 } 1260 1261 ct2 += ncols; 1262 } 1263 } 1264 PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 1265 } 1266 } 1267 1268 /* Assemble the matrices */ 1269 /* First assemble the local rows */ 1270 for (i=0; i<ismax; i++) { 1271 row2proc_i = row2proc[i]; 1272 subc = (Mat_SeqBAIJ*)submats[i]->data; 1273 imat_ilen = subc->ilen; 1274 imat_j = subc->j; 1275 imat_i = subc->i; 1276 imat_a = subc->a; 1277 1278 if (!allcolumns[i]) cmap_i = cmap[i]; 1279 rmap_i = rmap[i]; 1280 irow_i = irow[i]; 1281 jmax = nrow[i]; 1282 for (j=0; j<jmax; j++) { 1283 if (allrows[i]) row = j; 1284 else row = irow_i[j]; 1285 proc = row2proc_i[j]; 1286 1287 if (proc == rank) { 1288 1289 row = row - rstart; 1290 nzA = a_i[row+1] - a_i[row]; 1291 nzB = b_i[row+1] - b_i[row]; 1292 cworkA = a_j + a_i[row]; 1293 cworkB = b_j + b_i[row]; 1294 if (!ijonly) { 1295 vworkA = a_a + a_i[row]*bs2; 1296 vworkB = b_a + b_i[row]*bs2; 1297 } 1298 1299 if (allrows[i]) { 1300 row = row+rstart; 1301 } else { 1302 #if defined(PETSC_USE_CTABLE) 1303 PetscCall(PetscTableFind(rmap_i,row+rstart+1,&row)); 1304 row--; 1305 1306 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1307 #else 1308 row = rmap_i[row + rstart]; 1309 #endif 1310 } 1311 mat_i = imat_i[row]; 1312 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1313 mat_j = imat_j + mat_i; 1314 ilen = imat_ilen[row]; 1315 1316 /* load the column indices for this row into cols*/ 1317 if (!allcolumns[i]) { 1318 for (l=0; l<nzB; l++) { 1319 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1320 #if defined(PETSC_USE_CTABLE) 1321 PetscCall(PetscTableFind(cmap_i,ctmp+1,&tcol)); 1322 if (tcol) { 1323 #else 1324 if ((tcol = cmap_i[ctmp])) { 1325 #endif 1326 *mat_j++ = tcol - 1; 1327 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2)); 1328 mat_a += bs2; 1329 ilen++; 1330 } 1331 } else break; 1332 } 1333 imark = l; 1334 for (l=0; l<nzA; l++) { 1335 #if defined(PETSC_USE_CTABLE) 1336 PetscCall(PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol)); 1337 if (tcol) { 1338 #else 1339 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1340 #endif 1341 *mat_j++ = tcol - 1; 1342 if (!ijonly) { 1343 PetscCall(PetscArraycpy(mat_a,vworkA+l*bs2,bs2)); 1344 mat_a += bs2; 1345 } 1346 ilen++; 1347 } 1348 } 1349 for (l=imark; l<nzB; l++) { 1350 #if defined(PETSC_USE_CTABLE) 1351 PetscCall(PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol)); 1352 if (tcol) { 1353 #else 1354 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1355 #endif 1356 *mat_j++ = tcol - 1; 1357 if (!ijonly) { 1358 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2)); 1359 mat_a += bs2; 1360 } 1361 ilen++; 1362 } 1363 } 1364 } else { /* allcolumns */ 1365 for (l=0; l<nzB; l++) { 1366 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1367 *mat_j++ = ctmp; 1368 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2)); 1369 mat_a += bs2; 1370 ilen++; 1371 } else break; 1372 } 1373 imark = l; 1374 for (l=0; l<nzA; l++) { 1375 *mat_j++ = cstart+cworkA[l]; 1376 if (!ijonly) { 1377 PetscCall(PetscArraycpy(mat_a,vworkA+l*bs2,bs2)); 1378 mat_a += bs2; 1379 } 1380 ilen++; 1381 } 1382 for (l=imark; l<nzB; l++) { 1383 *mat_j++ = bmap[cworkB[l]]; 1384 if (!ijonly) { 1385 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2)); 1386 mat_a += bs2; 1387 } 1388 ilen++; 1389 } 1390 } 1391 imat_ilen[row] = ilen; 1392 } 1393 } 1394 } 1395 1396 /* Now assemble the off proc rows */ 1397 if (!ijonly) { 1398 PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE)); 1399 } 1400 for (tmp2=0; tmp2<nrqs; tmp2++) { 1401 sbuf1_i = sbuf1[pa[tmp2]]; 1402 jmax = sbuf1_i[0]; 1403 ct1 = 2*jmax + 1; 1404 ct2 = 0; 1405 rbuf2_i = rbuf2[tmp2]; 1406 rbuf3_i = rbuf3[tmp2]; 1407 if (!ijonly) rbuf4_i = rbuf4[tmp2]; 1408 for (j=1; j<=jmax; j++) { 1409 is_no = sbuf1_i[2*j-1]; 1410 rmap_i = rmap[is_no]; 1411 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1412 subc = (Mat_SeqBAIJ*)submats[is_no]->data; 1413 imat_ilen = subc->ilen; 1414 imat_j = subc->j; 1415 imat_i = subc->i; 1416 if (!ijonly) imat_a = subc->a; 1417 max1 = sbuf1_i[2*j]; 1418 for (k=0; k<max1; k++,ct1++) { /* for each recved block row */ 1419 row = sbuf1_i[ct1]; 1420 1421 if (allrows[is_no]) { 1422 row = sbuf1_i[ct1]; 1423 } else { 1424 #if defined(PETSC_USE_CTABLE) 1425 PetscCall(PetscTableFind(rmap_i,row+1,&row)); 1426 row--; 1427 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1428 #else 1429 row = rmap_i[row]; 1430 #endif 1431 } 1432 ilen = imat_ilen[row]; 1433 mat_i = imat_i[row]; 1434 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1435 mat_j = imat_j + mat_i; 1436 max2 = rbuf2_i[ct1]; 1437 if (!allcolumns[is_no]) { 1438 for (l=0; l<max2; l++,ct2++) { 1439 #if defined(PETSC_USE_CTABLE) 1440 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 1441 #else 1442 tcol = cmap_i[rbuf3_i[ct2]]; 1443 #endif 1444 if (tcol) { 1445 *mat_j++ = tcol - 1; 1446 if (!ijonly) { 1447 PetscCall(PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2)); 1448 mat_a += bs2; 1449 } 1450 ilen++; 1451 } 1452 } 1453 } else { /* allcolumns */ 1454 for (l=0; l<max2; l++,ct2++) { 1455 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1456 if (!ijonly) { 1457 PetscCall(PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2)); 1458 mat_a += bs2; 1459 } 1460 ilen++; 1461 } 1462 } 1463 imat_ilen[row] = ilen; 1464 } 1465 } 1466 } 1467 1468 if (!iscsorted) { /* sort column indices of the rows */ 1469 MatScalar *work; 1470 1471 PetscCall(PetscMalloc1(bs2,&work)); 1472 for (i=0; i<ismax; i++) { 1473 subc = (Mat_SeqBAIJ*)submats[i]->data; 1474 imat_ilen = subc->ilen; 1475 imat_j = subc->j; 1476 imat_i = subc->i; 1477 if (!ijonly) imat_a = subc->a; 1478 if (allcolumns[i]) continue; 1479 1480 jmax = nrow[i]; 1481 for (j=0; j<jmax; j++) { 1482 mat_i = imat_i[j]; 1483 mat_j = imat_j + mat_i; 1484 ilen = imat_ilen[j]; 1485 if (ijonly) { 1486 PetscCall(PetscSortInt(ilen,mat_j)); 1487 } else { 1488 mat_a = imat_a + mat_i*bs2; 1489 PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 1490 } 1491 } 1492 } 1493 PetscCall(PetscFree(work)); 1494 } 1495 1496 if (!ijonly) { 1497 PetscCall(PetscFree(r_waits4)); 1498 PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE)); 1499 PetscCall(PetscFree(s_waits4)); 1500 } 1501 1502 /* Restore the indices */ 1503 for (i=0; i<ismax; i++) { 1504 if (!allrows[i]) { 1505 PetscCall(ISRestoreIndices(isrow[i],irow+i)); 1506 } 1507 if (!allcolumns[i]) { 1508 PetscCall(ISRestoreIndices(iscol[i],icol+i)); 1509 } 1510 } 1511 1512 for (i=0; i<ismax; i++) { 1513 PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY)); 1514 PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY)); 1515 } 1516 1517 PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted)); 1518 PetscCall(PetscFree5(row2proc,cmap,rmap,allcolumns,allrows)); 1519 1520 if (!ijonly) { 1521 if (sbuf_aa) { 1522 PetscCall(PetscFree(sbuf_aa[0])); 1523 PetscCall(PetscFree(sbuf_aa)); 1524 } 1525 1526 for (i=0; i<nrqs; ++i) { 1527 PetscCall(PetscFree(rbuf4[i])); 1528 } 1529 PetscCall(PetscFree(rbuf4)); 1530 } 1531 c->ijonly = PETSC_FALSE; /* set back to the default */ 1532 PetscFunctionReturn(0); 1533 } 1534