1 /*$Id: sbaijov.c,v 1.7 2001/03/06 20:15:14 balay Exp balay $*/ 2 3 /* 4 Routines to compute overlapping regions of a parallel MPI matrix 5 and to find submatrices that were shared across processors. 6 */ 7 #include "src/mat/impls/baij/mpi/mpibaij.h" 8 #include "petscbt.h" 9 10 static int MatIncreaseOverlap_MPISBAIJ_Once(Mat,int,IS *); 11 static int MatIncreaseOverlap_MPISBAIJ_Local(Mat,int,char **,int*,int**); 12 static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat,int,int **,int**,int*); 13 extern int MatGetRow_MPISBAIJ(Mat,int,int*,int**,Scalar**); 14 extern int MatRestoreRow_MPISBAIJ(Mat,int,int*,int**,Scalar**); 15 16 #undef __FUNC__ 17 #define __FUNC__ "MatCompressIndicesGeneral_MPISBAIJ" 18 static int MatCompressIndicesGeneral_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out) 19 { 20 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data; 21 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 22 PetscBT table; 23 24 PetscFunctionBegin; 25 Nbs = baij->Nbs; 26 ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 27 ierr = PetscBTCreate(Nbs,table);CHKERRQ(ierr); 28 29 for (i=0; i<imax; i++) { 30 isz = 0; 31 ierr = PetscBTMemzero(Nbs,table);CHKERRQ(ierr); 32 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 33 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 34 for (j=0; j<n ; j++) { 35 ival = idx[j]/bs; /* convert the indices into block indices */ 36 if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 37 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 38 } 39 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 40 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr); 41 } 42 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 43 ierr = PetscFree(nidx);CHKERRQ(ierr); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNC__ 48 #define __FUNC__ "MatCompressIndicesSorted_MPISBAIJ" 49 static int MatCompressIndicesSorted_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out) 50 { 51 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data; 52 int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local; 53 PetscTruth flg; 54 55 PetscFunctionBegin; 56 for (i=0; i<imax; i++) { 57 ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr); 58 if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted"); 59 } 60 ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 61 /* Now check if the indices are in block order */ 62 for (i=0; i<imax; i++) { 63 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 64 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 65 if (n%bs !=0) SETERRQ(1,"Indices are not block ordered"); 66 67 n = n/bs; /* The reduced index size */ 68 idx_local = idx; 69 for (j=0; j<n ; j++) { 70 val = idx_local[0]; 71 if (val%bs != 0) SETERRQ(1,"Indices are not block ordered"); 72 for (k=0; k<bs; k++) { 73 if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered"); 74 } 75 nidx[j] = val/bs; 76 idx_local +=bs; 77 } 78 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 79 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr); 80 } 81 ierr = PetscFree(nidx);CHKERRQ(ierr); 82 PetscFunctionReturn(0); 83 } 84 85 #undef __FUNC__ 86 #define __FUNC__ "MatExpandIndices_MPISBAIJ" 87 static int MatExpandIndices_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out) 88 { 89 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data; 90 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 91 92 PetscFunctionBegin; 93 Nbs = baij->Nbs; 94 95 ierr = PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 96 97 for (i=0; i<imax; i++) { 98 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 99 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 100 for (j=0; j<n ; ++j){ 101 for (k=0; k<bs; k++) 102 nidx[j*bs+k] = idx[j]*bs+k; 103 } 104 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 105 ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);CHKERRQ(ierr); 106 } 107 ierr = PetscFree(nidx);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 112 #undef __FUNC__ 113 #define __FUNC__ "MatIncreaseOverlap_MPISBAIJ" 114 int MatIncreaseOverlap_MPISBAIJ(Mat C,int imax,IS *is,int ov) 115 { 116 int i,ierr; 117 IS *is_new; 118 119 PetscFunctionBegin; 120 ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr); 121 /* Convert the indices into block format */ 122 ierr = MatCompressIndicesGeneral_MPISBAIJ(C,imax,is,is_new);CHKERRQ(ierr); 123 if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");} 124 for (i=0; i<ov; ++i) { 125 ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,imax,is_new);CHKERRQ(ierr); 126 } 127 for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);} 128 ierr = MatExpandIndices_MPISBAIJ(C,imax,is_new,is);CHKERRQ(ierr); 129 for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);} 130 ierr = PetscFree(is_new);CHKERRQ(ierr); 131 PetscFunctionReturn(0); 132 } 133 134 /* 135 Sample message format: 136 If a processor A wants processor B to process some elements corresponding 137 to index sets 1s[1], is[5] 138 mesg [0] = 2 (no of index sets in the mesg) 139 ----------- 140 mesg [1] = 1 => is[1] 141 mesg [2] = sizeof(is[1]); 142 ----------- 143 mesg [5] = 5 => is[5] 144 mesg [6] = sizeof(is[5]); 145 ----------- 146 mesg [7] 147 mesg [n] datas[1] 148 ----------- 149 mesg[n+1] 150 mesg[m] data(is[5]) 151 ----------- 152 153 Notes: 154 nrqs - no of requests sent (or to be sent out) 155 nrqr - no of requests recieved (which have to be or which have been processed 156 */ 157 #undef __FUNC__ 158 #define __FUNC__ "MatIncreaseOverlap_MPISBAIJ_Once" 159 static int MatIncreaseOverlap_MPISBAIJ_Once(Mat C,int imax,IS *is) 160 { 161 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 162 int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i; 163 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 164 int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2; 165 int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2; 166 PetscBT *table; 167 MPI_Comm comm; 168 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 169 MPI_Status *s_status,*recv_status; 170 171 PetscFunctionBegin; 172 comm = C->comm; 173 size = c->size; 174 rank = c->rank; 175 Mbs = c->Mbs; 176 177 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 178 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 179 180 len = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int); 181 ierr = PetscMalloc(len,&idx);CHKERRQ(ierr); 182 n = (int*)(idx + imax); 183 rtable = n + imax; 184 185 for (i=0; i<imax; i++) { 186 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 187 ierr = ISGetSize(is[i],&n[i]);CHKERRQ(ierr); 188 } 189 190 /* Create hash table for the mapping :row -> proc*/ 191 for (i=0,j=0; i<size; i++) { 192 len = c->rowners[i+1]; 193 for (; j<len; j++) { 194 rtable[j] = i; 195 } 196 } 197 198 /* evaluate communication - mesg to who,length of mesg, and buffer space 199 required. Based on this, buffers are allocated, and data copied into them*/ 200 ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/* mesg size */ 201 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 202 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 203 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 204 ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 205 for (i=0; i<imax; i++) { 206 ierr = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 207 idx_i = idx[i]; 208 len = n[i]; 209 for (j=0; j<len; j++) { 210 row = idx_i[j]; 211 if (row < 0) { 212 SETERRQ(1,"Index set cannot have negative entries"); 213 } 214 proc = rtable[row]; 215 w4[proc]++; 216 } 217 for (j=0; j<size; j++){ 218 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 219 } 220 } 221 222 nrqs = 0; /* no of outgoing messages */ 223 msz = 0; /* total mesg length (for all proc */ 224 w1[rank] = 0; /* no mesg sent to intself */ 225 w3[rank] = 0; 226 for (i=0; i<size; i++) { 227 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 228 } 229 /* pa - is list of processors to communicate with */ 230 ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); 231 for (i=0,j=0; i<size; i++) { 232 if (w1[i]) {pa[j] = i; j++;} 233 } 234 235 /* Each message would have a header = 1 + 2*(no of IS) + data */ 236 for (i=0; i<nrqs; i++) { 237 j = pa[i]; 238 w1[j] += w2[j] + 2*w3[j]; 239 msz += w1[j]; 240 } 241 242 243 /* Determine the number of messages to expect, their lengths, from from-ids */ 244 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 245 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 246 247 /* Now post the Irecvs corresponding to these messages */ 248 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 249 250 /* Allocate Memory for outgoing messages */ 251 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 252 ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr); 253 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 254 ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr); 255 tmp = (int*)(outdat + 2*size); 256 ctr = tmp + msz; 257 258 { 259 int *iptr = tmp,ict = 0; 260 for (i=0; i<nrqs; i++) { 261 j = pa[i]; 262 iptr += ict; 263 outdat[j] = iptr; 264 ict = w1[j]; 265 } 266 } 267 268 /* Form the outgoing messages */ 269 /*plug in the headers*/ 270 for (i=0; i<nrqs; i++) { 271 j = pa[i]; 272 outdat[j][0] = 0; 273 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr); 274 ptr[j] = outdat[j] + 2*w3[j] + 1; 275 } 276 277 /* Memory for doing local proc's work*/ 278 { 279 int *d_p; 280 char *t_p; 281 282 len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) + 283 (Mbs)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char) + 1; 284 ierr = PetscMalloc(len,&table);CHKERRQ(ierr); 285 ierr = PetscMemzero(table,len);CHKERRQ(ierr); 286 data = (int **)(table + imax); 287 isz = (int *)(data + imax); 288 d_p = (int *)(isz + imax); 289 t_p = (char *)(d_p + Mbs*imax); 290 for (i=0; i<imax; i++) { 291 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 292 data[i] = d_p + (Mbs)*i; 293 } 294 } 295 296 /* Parse the IS and update local tables and the outgoing buf with the data*/ 297 { 298 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 299 PetscBT table_i; 300 301 for (i=0; i<imax; i++) { 302 ierr = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr); 303 n_i = n[i]; 304 table_i = table[i]; 305 idx_i = idx[i]; 306 data_i = data[i]; 307 isz_i = isz[i]; 308 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 309 row = idx_i[j]; 310 proc = rtable[row]; 311 if (proc != rank) { /* copy to the outgoing buffer */ 312 ctr[proc]++; 313 *ptr[proc] = row; 314 ptr[proc]++; 315 } 316 else { /* Update the local table */ 317 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 318 } 319 } 320 /* Update the headers for the current IS */ 321 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 322 if ((ctr_j = ctr[j])) { 323 outdat_j = outdat[j]; 324 k = ++outdat_j[0]; 325 outdat_j[2*k] = ctr_j; 326 outdat_j[2*k-1] = i; 327 } 328 } 329 isz[i] = isz_i; 330 } 331 } 332 333 /* Now post the sends */ 334 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 335 for (i=0; i<nrqs; ++i) { 336 j = pa[i]; 337 ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 338 } 339 340 /* No longer need the original indices*/ 341 for (i=0; i<imax; ++i) { 342 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 343 } 344 ierr = PetscFree(idx);CHKERRQ(ierr); 345 346 for (i=0; i<imax; ++i) { 347 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 348 } 349 350 /* Do Local work*/ 351 ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 352 353 /* Receive messages*/ 354 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 355 ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr); 356 357 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 358 ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr); 359 360 /* Phase 1 sends are complete - deallocate buffers */ 361 ierr = PetscFree(outdat);CHKERRQ(ierr); 362 ierr = PetscFree(w1);CHKERRQ(ierr); 363 364 ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr); 365 ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr); 366 ierr = MatIncreaseOverlap_MPISBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 367 ierr = PetscFree(rbuf);CHKERRQ(ierr); 368 369 /* Send the data back*/ 370 /* Do a global reduction to know the buffer space req for incoming messages*/ 371 { 372 int *rw1; 373 374 ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr); 375 ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr); 376 377 for (i=0; i<nrqr; ++i) { 378 proc = recv_status[i].MPI_SOURCE; 379 if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch"); 380 rw1[proc] = isz1[i]; 381 } 382 383 ierr = PetscFree(onodes1);CHKERRQ(ierr); 384 ierr = PetscFree(olengths1);CHKERRQ(ierr); 385 386 /* Determine the number of messages to expect, their lengths, from from-ids */ 387 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 388 PetscFree(rw1); 389 } 390 /* Now post the Irecvs corresponding to these messages */ 391 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 392 393 /* Now post the sends */ 394 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 395 for (i=0; i<nrqr; ++i) { 396 j = recv_status[i].MPI_SOURCE; 397 ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 398 } 399 400 /* receive work done on other processors*/ 401 { 402 int index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 403 PetscBT table_i; 404 MPI_Status *status2; 405 406 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 407 408 for (i=0; i<nrqs; ++i) { 409 ierr = MPI_Waitany(nrqs,r_waits2,&index,status2+i);CHKERRQ(ierr); 410 /* Process the message*/ 411 rbuf2_i = rbuf2[index]; 412 ct1 = 2*rbuf2_i[0]+1; 413 jmax = rbuf2[index][0]; 414 for (j=1; j<=jmax; j++) { 415 max = rbuf2_i[2*j]; 416 is_no = rbuf2_i[2*j-1]; 417 isz_i = isz[is_no]; 418 data_i = data[is_no]; 419 table_i = table[is_no]; 420 for (k=0; k<max; k++,ct1++) { 421 row = rbuf2_i[ct1]; 422 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 423 } 424 isz[is_no] = isz_i; 425 } 426 } 427 ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr); 428 ierr = PetscFree(status2);CHKERRQ(ierr); 429 } 430 431 for (i=0; i<imax; ++i) { 432 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr); 433 } 434 435 436 ierr = PetscFree(onodes2);CHKERRQ(ierr); 437 ierr = PetscFree(olengths2);CHKERRQ(ierr); 438 439 ierr = PetscFree(pa);CHKERRQ(ierr); 440 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 441 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 442 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 443 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 444 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 445 ierr = PetscFree(table);CHKERRQ(ierr); 446 ierr = PetscFree(s_status);CHKERRQ(ierr); 447 ierr = PetscFree(recv_status);CHKERRQ(ierr); 448 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 449 ierr = PetscFree(xdata);CHKERRQ(ierr); 450 ierr = PetscFree(isz1);CHKERRQ(ierr); 451 PetscFunctionReturn(0); 452 } 453 454 #undef __FUNC__ 455 #define __FUNC__ "MatIncreaseOverlap_MPISBAIJ_Local" 456 /* 457 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 458 the work on the local processor. 459 460 Inputs: 461 C - MAT_MPIBAIJ; 462 imax - total no of index sets processed at a time; 463 table - an array of char - size = Mbs bits. 464 465 Output: 466 isz - array containing the count of the solution elements correspondign 467 to each index set; 468 data - pointer to the solutions 469 */ 470 static int MatIncreaseOverlap_MPISBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data) 471 { 472 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 473 Mat A = c->A,B = c->B; 474 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 475 int start,end,val,max,rstart,cstart,*ai,*aj; 476 int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 477 PetscBT table_i; 478 479 PetscFunctionBegin; 480 rstart = c->rstart; 481 cstart = c->cstart; 482 ai = a->i; 483 aj = a->j; 484 bi = b->i; 485 bj = b->j; 486 garray = c->garray; 487 488 489 for (i=0; i<imax; i++) { 490 data_i = data[i]; 491 table_i = table[i]; 492 isz_i = isz[i]; 493 for (j=0,max=isz[i]; j<max; j++) { 494 row = data_i[j] - rstart; 495 start = ai[row]; 496 end = ai[row+1]; 497 for (k=start; k<end; k++) { /* Amat */ 498 val = aj[k] + cstart; 499 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (k=start; k<end; k++) { /* Bmat */ 504 val = garray[bj[k]]; 505 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 506 } 507 } 508 isz[i] = isz_i; 509 } 510 PetscFunctionReturn(0); 511 } 512 #undef __FUNC__ 513 #define __FUNC__ "MatIncreaseOverlap_MPISBAIJ_Receive" 514 /* 515 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 516 and return the output 517 518 Input: 519 C - the matrix 520 nrqr - no of messages being processed. 521 rbuf - an array of pointers to the recieved requests 522 523 Output: 524 xdata - array of messages to be sent back 525 isz1 - size of each message 526 527 For better efficiency perhaps we should malloc seperately each xdata[i], 528 then if a remalloc is required we need only copy the data for that one row 529 rather then all previous rows as it is now where a single large chunck of 530 memory is used. 531 532 */ 533 static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1) 534 { 535 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 536 Mat A = c->A,B = c->B; 537 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 538 int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 539 int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 540 int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr; 541 int *rbuf_i,kmax,rbuf_0,ierr; 542 PetscBT xtable; 543 544 PetscFunctionBegin; 545 rank = c->rank; 546 Mbs = c->Mbs; 547 rstart = c->rstart; 548 cstart = c->cstart; 549 ai = a->i; 550 aj = a->j; 551 bi = b->i; 552 bj = b->j; 553 garray = c->garray; 554 555 556 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 557 rbuf_i = rbuf[i]; 558 rbuf_0 = rbuf_i[0]; 559 ct += rbuf_0; 560 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 561 } 562 563 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 564 else max1 = 1; 565 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 566 ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr); 567 ++no_malloc; 568 ierr = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr); 569 ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr); 570 571 ct3 = 0; 572 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 573 rbuf_i = rbuf[i]; 574 rbuf_0 = rbuf_i[0]; 575 ct1 = 2*rbuf_0+1; 576 ct2 = ct1; 577 ct3 += ct1; 578 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 579 ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr); 580 oct2 = ct2; 581 kmax = rbuf_i[2*j]; 582 for (k=0; k<kmax; k++,ct1++) { 583 row = rbuf_i[ct1]; 584 if (!PetscBTLookupSet(xtable,row)) { 585 if (!(ct3 < mem_estimate)) { 586 new_estimate = (int)(1.5*mem_estimate)+1; 587 ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 588 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 589 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 590 xdata[0] = tmp; 591 mem_estimate = new_estimate; ++no_malloc; 592 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 593 } 594 xdata[i][ct2++] = row; 595 ct3++; 596 } 597 } 598 for (k=oct2,max2=ct2; k<max2; k++) { 599 row = xdata[i][k] - rstart; 600 start = ai[row]; 601 end = ai[row+1]; 602 for (l=start; l<end; l++) { 603 val = aj[l] + cstart; 604 if (!PetscBTLookupSet(xtable,val)) { 605 if (!(ct3 < mem_estimate)) { 606 new_estimate = (int)(1.5*mem_estimate)+1; 607 ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 608 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 609 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 610 xdata[0] = tmp; 611 mem_estimate = new_estimate; ++no_malloc; 612 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 613 } 614 xdata[i][ct2++] = val; 615 ct3++; 616 } 617 } 618 start = bi[row]; 619 end = bi[row+1]; 620 for (l=start; l<end; l++) { 621 val = garray[bj[l]]; 622 if (!PetscBTLookupSet(xtable,val)) { 623 if (!(ct3 < mem_estimate)) { 624 new_estimate = (int)(1.5*mem_estimate)+1; 625 ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 626 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 627 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 628 xdata[0] = tmp; 629 mem_estimate = new_estimate; ++no_malloc; 630 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 631 } 632 xdata[i][ct2++] = val; 633 ct3++; 634 } 635 } 636 } 637 /* Update the header*/ 638 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 639 xdata[i][2*j-1] = rbuf_i[2*j-1]; 640 } 641 xdata[i][0] = rbuf_0; 642 xdata[i+1] = xdata[i] + ct2; 643 isz1[i] = ct2; /* size of each message */ 644 } 645 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 646 PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc); 647 PetscFunctionReturn(0); 648 } 649 650 static int MatGetSubMatrices_MPISBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *); 651 652 #undef __FUNC__ 653 #define __FUNC__ "MatGetSubMatrices_MPISBAIJ" 654 int MatGetSubMatrices_MPISBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat) 655 { 656 IS *isrow_new,*iscol_new; 657 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 658 int nmax,nstages_local,nstages,i,pos,max_no,ierr; 659 660 PetscFunctionBegin; 661 /* The compression and expansion should be avoided. Does'nt point 662 out errors might change the indices hence buggey */ 663 664 ierr = PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);CHKERRQ(ierr); 665 iscol_new = isrow_new + ismax; 666 ierr = MatCompressIndicesSorted_MPISBAIJ(C,ismax,isrow,isrow_new);CHKERRQ(ierr); 667 ierr = MatCompressIndicesSorted_MPISBAIJ(C,ismax,iscol,iscol_new);CHKERRQ(ierr); 668 669 /* Allocate memory to hold all the submatrices */ 670 if (scall != MAT_REUSE_MATRIX) { 671 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 672 } 673 /* Determine the number of stages through which submatrices are done */ 674 nmax = 20*1000000 / (c->Nbs * sizeof(int)); 675 if (!nmax) nmax = 1; 676 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 677 678 /* Make sure every porcessor loops through the nstages */ 679 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr); 680 681 for (i=0,pos=0; i<nstages; i++) { 682 if (pos+nmax <= ismax) max_no = nmax; 683 else if (pos == ismax) max_no = 0; 684 else max_no = ismax-pos; 685 ierr = MatGetSubMatrices_MPISBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr); 686 pos += max_no; 687 } 688 689 for (i=0; i<ismax; i++) { 690 ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr); 691 ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr); 692 } 693 ierr = PetscFree(isrow_new);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 /* -------------------------------------------------------------------------*/ 698 #undef __FUNC__ 699 #define __FUNC__ "MatGetSubMatrices_MPISBAIJ_local" 700 static int MatGetSubMatrices_MPISBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats) 701 { 702 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 703 Mat A = c->A; 704 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat; 705 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 706 int **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc; 707 int nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr; 708 int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap; 709 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 710 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 711 int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB; 712 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 713 int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3; 714 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 715 MPI_Request *r_waits4,*s_waits3,*s_waits4; 716 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 717 MPI_Status *r_status3,*r_status4,*s_status4; 718 MPI_Comm comm; 719 MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 720 MatScalar *a_a=a->a,*b_a=b->a; 721 PetscTruth flag; 722 723 PetscFunctionBegin; 724 comm = C->comm; 725 tag0 = C->tag; 726 size = c->size; 727 rank = c->rank; 728 Mbs = c->Mbs; 729 730 /* Get some new tags to keep the communication clean */ 731 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 732 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 733 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 734 735 /* Check if the col indices are sorted */ 736 for (i=0; i<ismax; i++) { 737 ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr); 738 if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 739 } 740 741 len = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (Mbs+1)*sizeof(int); 742 ierr = PetscMalloc(len,&irow);CHKERRQ(ierr); 743 icol = irow + ismax; 744 nrow = (int*)(icol + ismax); 745 ncol = nrow + ismax; 746 rtable = ncol + ismax; 747 748 for (i=0; i<ismax; i++) { 749 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 750 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 751 ierr = ISGetSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 752 ierr = ISGetSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 753 } 754 755 /* Create hash table for the mapping :row -> proc*/ 756 for (i=0,j=0; i<size; i++) { 757 jmax = c->rowners[i+1]; 758 for (; j<jmax; j++) { 759 rtable[j] = i; 760 } 761 } 762 763 /* evaluate communication - mesg to who,length of mesg,and buffer space 764 required. Based on this, buffers are allocated, and data copied into them*/ 765 ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */ 766 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 767 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 768 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 769 ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 770 for (i=0; i<ismax; i++) { 771 ierr = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 772 jmax = nrow[i]; 773 irow_i = irow[i]; 774 for (j=0; j<jmax; j++) { 775 row = irow_i[j]; 776 proc = rtable[row]; 777 w4[proc]++; 778 } 779 for (j=0; j<size; j++) { 780 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 781 } 782 } 783 784 nrqs = 0; /* no of outgoing messages */ 785 msz = 0; /* total mesg length for all proc */ 786 w1[rank] = 0; /* no mesg sent to intself */ 787 w3[rank] = 0; 788 for (i=0; i<size; i++) { 789 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 790 } 791 ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/ 792 for (i=0,j=0; i<size; i++) { 793 if (w1[i]) { pa[j] = i; j++; } 794 } 795 796 /* Each message would have a header = 1 + 2*(no of IS) + data */ 797 for (i=0; i<nrqs; i++) { 798 j = pa[i]; 799 w1[j] += w2[j] + 2* w3[j]; 800 msz += w1[j]; 801 } 802 /* Do a global reduction to determine how many messages to expect*/ 803 { 804 int *rw1; 805 ierr = PetscMalloc(2*size*sizeof(int),&rw1);CHKERRQ(ierr); 806 ierr = MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr); 807 bsz = rw1[rank]; 808 nrqr = rw1[size+rank]; 809 ierr = PetscFree(rw1);CHKERRQ(ierr); 810 } 811 812 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 813 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 814 ierr = PetscMalloc(len,&rbuf1);CHKERRQ(ierr); 815 rbuf1[0] = (int*)(rbuf1 + nrqr); 816 for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 817 818 /* Post the receives */ 819 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&r_waits1);CHKERRQ(ierr); 820 for (i=0; i<nrqr; ++i) { 821 ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr); 822 } 823 824 /* Allocate Memory for outgoing messages */ 825 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 826 ierr = PetscMalloc(len,&sbuf1);CHKERRQ(ierr); 827 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 828 ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr); 829 /* allocate memory for outgoing data + buf to receive the first reply */ 830 tmp = (int*)(ptr + size); 831 ctr = tmp + 2*msz; 832 833 { 834 int *iptr = tmp,ict = 0; 835 for (i=0; i<nrqs; i++) { 836 j = pa[i]; 837 iptr += ict; 838 sbuf1[j] = iptr; 839 ict = w1[j]; 840 } 841 } 842 843 /* Form the outgoing messages */ 844 /* Initialise the header space */ 845 for (i=0; i<nrqs; i++) { 846 j = pa[i]; 847 sbuf1[j][0] = 0; 848 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr); 849 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 850 } 851 852 /* Parse the isrow and copy data into outbuf */ 853 for (i=0; i<ismax; i++) { 854 ierr = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr); 855 irow_i = irow[i]; 856 jmax = nrow[i]; 857 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 858 row = irow_i[j]; 859 proc = rtable[row]; 860 if (proc != rank) { /* copy to the outgoing buf*/ 861 ctr[proc]++; 862 *ptr[proc] = row; 863 ptr[proc]++; 864 } 865 } 866 /* Update the headers for the current IS */ 867 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 868 if ((ctr_j = ctr[j])) { 869 sbuf1_j = sbuf1[j]; 870 k = ++sbuf1_j[0]; 871 sbuf1_j[2*k] = ctr_j; 872 sbuf1_j[2*k-1] = i; 873 } 874 } 875 } 876 877 /* Now post the sends */ 878 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 879 for (i=0; i<nrqs; ++i) { 880 j = pa[i]; 881 ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 882 } 883 884 /* Post Recieves to capture the buffer size */ 885 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 886 ierr = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr); 887 rbuf2[0] = tmp + msz; 888 for (i=1; i<nrqs; ++i) { 889 j = pa[i]; 890 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 891 } 892 for (i=0; i<nrqs; ++i) { 893 j = pa[i]; 894 ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 895 } 896 897 /* Send to other procs the buf size they should allocate */ 898 899 900 /* Receive messages*/ 901 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 902 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 903 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 904 ierr = PetscMalloc(len,&sbuf2);CHKERRQ(ierr); 905 req_size = (int*)(sbuf2 + nrqr); 906 req_source = req_size + nrqr; 907 908 { 909 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data; 910 int *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i; 911 912 for (i=0; i<nrqr; ++i) { 913 ierr = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);CHKERRQ(ierr); 914 req_size[index] = 0; 915 rbuf1_i = rbuf1[index]; 916 start = 2*rbuf1_i[0] + 1; 917 ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr); 918 ierr = PetscMalloc(end*sizeof(int),sbuf2[index]);CHKERRQ(ierr); 919 sbuf2_i = sbuf2[index]; 920 for (j=start; j<end; j++) { 921 id = rbuf1_i[j] - rstart; 922 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 923 sbuf2_i[j] = ncols; 924 req_size[index] += ncols; 925 } 926 req_source[index] = r_status1[i].MPI_SOURCE; 927 /* form the header */ 928 sbuf2_i[0] = req_size[index]; 929 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 930 ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);CHKERRQ(ierr); 931 } 932 } 933 ierr = PetscFree(r_status1);CHKERRQ(ierr); 934 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 935 936 /* recv buffer sizes */ 937 /* Receive messages*/ 938 939 ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr); 940 ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr); 941 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 942 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 943 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 944 945 for (i=0; i<nrqs; ++i) { 946 ierr = MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);CHKERRQ(ierr); 947 ierr = PetscMalloc(rbuf2[index][0]*sizeof(int),&rbuf3[index]);CHKERRQ(ierr); 948 ierr = PetscMalloc(rbuf2[index][0]*bs2*sizeof(MatScalar),&rbuf4[index]);CHKERRQ(ierr); 949 ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT, 950 r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);CHKERRQ(ierr); 951 ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_MATSCALAR, 952 r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);CHKERRQ(ierr); 953 } 954 ierr = PetscFree(r_status2);CHKERRQ(ierr); 955 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 956 957 /* Wait on sends1 and sends2 */ 958 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 959 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 960 961 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 962 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 963 ierr = PetscFree(s_status1);CHKERRQ(ierr); 964 ierr = PetscFree(s_status2);CHKERRQ(ierr); 965 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 966 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 967 968 /* Now allocate buffers for a->j, and send them off */ 969 ierr = PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);CHKERRQ(ierr); 970 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 971 ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr); 972 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 973 974 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 975 { 976 for (i=0; i<nrqr; i++) { 977 rbuf1_i = rbuf1[i]; 978 sbuf_aj_i = sbuf_aj[i]; 979 ct1 = 2*rbuf1_i[0] + 1; 980 ct2 = 0; 981 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 982 kmax = rbuf1[i][2*j]; 983 for (k=0; k<kmax; k++,ct1++) { 984 row = rbuf1_i[ct1] - rstart; 985 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 986 ncols = nzA + nzB; 987 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 988 989 /* load the column indices for this row into cols*/ 990 cols = sbuf_aj_i + ct2; 991 for (l=0; l<nzB; l++) { 992 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 993 else break; 994 } 995 imark = l; 996 for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 997 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 998 ct2 += ncols; 999 } 1000 } 1001 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1002 } 1003 } 1004 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1005 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1006 1007 /* Allocate buffers for a->a, and send them off */ 1008 ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr); 1009 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1010 PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1011 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1012 1013 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1014 { 1015 for (i=0; i<nrqr; i++) { 1016 rbuf1_i = rbuf1[i]; 1017 sbuf_aa_i = sbuf_aa[i]; 1018 ct1 = 2*rbuf1_i[0]+1; 1019 ct2 = 0; 1020 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1021 kmax = rbuf1_i[2*j]; 1022 for (k=0; k<kmax; k++,ct1++) { 1023 row = rbuf1_i[ct1] - rstart; 1024 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1025 ncols = nzA + nzB; 1026 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1027 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 1028 1029 /* load the column values for this row into vals*/ 1030 vals = sbuf_aa_i+ct2*bs2; 1031 for (l=0; l<nzB; l++) { 1032 if ((bmap[cworkB[l]]) < cstart) { 1033 ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1034 } 1035 else break; 1036 } 1037 imark = l; 1038 for (l=0; l<nzA; l++) { 1039 ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1040 } 1041 for (l=imark; l<nzB; l++) { 1042 ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1043 } 1044 ct2 += ncols; 1045 } 1046 } 1047 ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1048 } 1049 } 1050 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1051 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1052 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1053 1054 /* Form the matrix */ 1055 /* create col map */ 1056 { 1057 int *icol_i; 1058 1059 len = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int); 1060 ierr = PetscMalloc(len,&cmap);CHKERRQ(ierr); 1061 cmap[0] = (int *)(cmap + ismax); 1062 ierr = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr); 1063 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 1064 for (i=0; i<ismax; i++) { 1065 jmax = ncol[i]; 1066 icol_i = icol[i]; 1067 cmap_i = cmap[i]; 1068 for (j=0; j<jmax; j++) { 1069 cmap_i[icol_i[j]] = j+1; 1070 } 1071 } 1072 } 1073 1074 1075 /* Create lens which is required for MatCreate... */ 1076 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1077 len = (1+ismax)*sizeof(int*)+ j*sizeof(int); 1078 ierr = PetscMalloc(len,&lens);CHKERRQ(ierr); 1079 lens[0] = (int *)(lens + ismax); 1080 ierr = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr); 1081 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1082 1083 /* Update lens from local data */ 1084 for (i=0; i<ismax; i++) { 1085 jmax = nrow[i]; 1086 cmap_i = cmap[i]; 1087 irow_i = irow[i]; 1088 lens_i = lens[i]; 1089 for (j=0; j<jmax; j++) { 1090 row = irow_i[j]; 1091 proc = rtable[row]; 1092 if (proc == rank) { 1093 /* Get indices from matA and then from matB */ 1094 row = row - rstart; 1095 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1096 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1097 for (k=0; k<nzA; k++) { 1098 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1099 } 1100 for (k=0; k<nzB; k++) { 1101 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1102 } 1103 } 1104 } 1105 } 1106 1107 /* Create row map*/ 1108 len = (1+ismax)*sizeof(int*)+ ismax*c->Mbs*sizeof(int); 1109 ierr = PetscMalloc(len,&rmap);CHKERRQ(ierr); 1110 rmap[0] = (int *)(rmap + ismax); 1111 ierr = PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));CHKERRQ(ierr); 1112 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;} 1113 for (i=0; i<ismax; i++) { 1114 rmap_i = rmap[i]; 1115 irow_i = irow[i]; 1116 jmax = nrow[i]; 1117 for (j=0; j<jmax; j++) { 1118 rmap_i[irow_i[j]] = j; 1119 } 1120 } 1121 1122 /* Update lens from offproc data */ 1123 { 1124 int *rbuf2_i,*rbuf3_i,*sbuf1_i; 1125 1126 for (tmp2=0; tmp2<nrqs; tmp2++) { 1127 ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr); 1128 index = pa[i]; 1129 sbuf1_i = sbuf1[index]; 1130 jmax = sbuf1_i[0]; 1131 ct1 = 2*jmax+1; 1132 ct2 = 0; 1133 rbuf2_i = rbuf2[i]; 1134 rbuf3_i = rbuf3[i]; 1135 for (j=1; j<=jmax; j++) { 1136 is_no = sbuf1_i[2*j-1]; 1137 max1 = sbuf1_i[2*j]; 1138 lens_i = lens[is_no]; 1139 cmap_i = cmap[is_no]; 1140 rmap_i = rmap[is_no]; 1141 for (k=0; k<max1; k++,ct1++) { 1142 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1143 max2 = rbuf2_i[ct1]; 1144 for (l=0; l<max2; l++,ct2++) { 1145 if (cmap_i[rbuf3_i[ct2]]) { 1146 lens_i[row]++; 1147 } 1148 } 1149 } 1150 } 1151 } 1152 } 1153 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1154 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1155 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1156 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1157 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1158 1159 /* Create the submatrices */ 1160 if (scall == MAT_REUSE_MATRIX) { 1161 /* 1162 Assumes new rows are same length as the old rows, hence bug! 1163 */ 1164 for (i=0; i<ismax; i++) { 1165 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1166 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1167 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1168 } 1169 ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);CHKERRQ(ierr); 1170 if (flag == PETSC_FALSE) { 1171 SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros"); 1172 } 1173 /* Initial matrix as if empty */ 1174 ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr); 1175 submats[i]->factor = C->factor; 1176 } 1177 } else { 1178 for (i=0; i<ismax; i++) { 1179 ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr); 1180 } 1181 } 1182 1183 /* Assemble the matrices */ 1184 /* First assemble the local rows */ 1185 { 1186 int ilen_row,*imat_ilen,*imat_j,*imat_i; 1187 MatScalar *imat_a; 1188 1189 for (i=0; i<ismax; i++) { 1190 mat = (Mat_SeqBAIJ*)submats[i]->data; 1191 imat_ilen = mat->ilen; 1192 imat_j = mat->j; 1193 imat_i = mat->i; 1194 imat_a = mat->a; 1195 cmap_i = cmap[i]; 1196 rmap_i = rmap[i]; 1197 irow_i = irow[i]; 1198 jmax = nrow[i]; 1199 for (j=0; j<jmax; j++) { 1200 row = irow_i[j]; 1201 proc = rtable[row]; 1202 if (proc == rank) { 1203 row = row - rstart; 1204 nzA = a_i[row+1] - a_i[row]; 1205 nzB = b_i[row+1] - b_i[row]; 1206 cworkA = a_j + a_i[row]; 1207 cworkB = b_j + b_i[row]; 1208 vworkA = a_a + a_i[row]*bs2; 1209 vworkB = b_a + b_i[row]*bs2; 1210 1211 row = rmap_i[row + rstart]; 1212 mat_i = imat_i[row]; 1213 mat_a = imat_a + mat_i*bs2; 1214 mat_j = imat_j + mat_i; 1215 ilen_row = imat_ilen[row]; 1216 1217 /* load the column indices for this row into cols*/ 1218 for (l=0; l<nzB; l++) { 1219 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1220 if ((tcol = cmap_i[ctmp])) { 1221 *mat_j++ = tcol - 1; 1222 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1223 mat_a += bs2; 1224 ilen_row++; 1225 } 1226 } else break; 1227 } 1228 imark = l; 1229 for (l=0; l<nzA; l++) { 1230 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1231 *mat_j++ = tcol - 1; 1232 ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1233 mat_a += bs2; 1234 ilen_row++; 1235 } 1236 } 1237 for (l=imark; l<nzB; l++) { 1238 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1239 *mat_j++ = tcol - 1; 1240 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1241 mat_a += bs2; 1242 ilen_row++; 1243 } 1244 } 1245 imat_ilen[row] = ilen_row; 1246 } 1247 } 1248 1249 } 1250 } 1251 1252 /* Now assemble the off proc rows*/ 1253 { 1254 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1255 int *imat_j,*imat_i; 1256 MatScalar *imat_a,*rbuf4_i; 1257 1258 for (tmp2=0; tmp2<nrqs; tmp2++) { 1259 ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr); 1260 index = pa[i]; 1261 sbuf1_i = sbuf1[index]; 1262 jmax = sbuf1_i[0]; 1263 ct1 = 2*jmax + 1; 1264 ct2 = 0; 1265 rbuf2_i = rbuf2[i]; 1266 rbuf3_i = rbuf3[i]; 1267 rbuf4_i = rbuf4[i]; 1268 for (j=1; j<=jmax; j++) { 1269 is_no = sbuf1_i[2*j-1]; 1270 rmap_i = rmap[is_no]; 1271 cmap_i = cmap[is_no]; 1272 mat = (Mat_SeqBAIJ*)submats[is_no]->data; 1273 imat_ilen = mat->ilen; 1274 imat_j = mat->j; 1275 imat_i = mat->i; 1276 imat_a = mat->a; 1277 max1 = sbuf1_i[2*j]; 1278 for (k=0; k<max1; k++,ct1++) { 1279 row = sbuf1_i[ct1]; 1280 row = rmap_i[row]; 1281 ilen = imat_ilen[row]; 1282 mat_i = imat_i[row]; 1283 mat_a = imat_a + mat_i*bs2; 1284 mat_j = imat_j + mat_i; 1285 max2 = rbuf2_i[ct1]; 1286 for (l=0; l<max2; l++,ct2++) { 1287 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1288 *mat_j++ = tcol - 1; 1289 /* *mat_a++= rbuf4_i[ct2]; */ 1290 ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1291 mat_a += bs2; 1292 ilen++; 1293 } 1294 } 1295 imat_ilen[row] = ilen; 1296 } 1297 } 1298 } 1299 } 1300 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1301 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1302 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1303 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1304 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1305 1306 /* Restore the indices */ 1307 for (i=0; i<ismax; i++) { 1308 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1309 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1310 } 1311 1312 /* Destroy allocated memory */ 1313 ierr = PetscFree(irow);CHKERRQ(ierr); 1314 ierr = PetscFree(w1);CHKERRQ(ierr); 1315 ierr = PetscFree(pa);CHKERRQ(ierr); 1316 1317 ierr = PetscFree(sbuf1);CHKERRQ(ierr); 1318 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1319 for (i=0; i<nrqr; ++i) { 1320 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1321 } 1322 for (i=0; i<nrqs; ++i) { 1323 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1324 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1325 } 1326 1327 ierr = PetscFree(sbuf2);CHKERRQ(ierr); 1328 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1329 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1330 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1331 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1332 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1333 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1334 1335 ierr = PetscFree(cmap);CHKERRQ(ierr); 1336 ierr = PetscFree(rmap);CHKERRQ(ierr); 1337 ierr = PetscFree(lens);CHKERRQ(ierr); 1338 1339 for (i=0; i<ismax; i++) { 1340 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1341 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1342 } 1343 1344 PetscFunctionReturn(0); 1345 } 1346 1347 1348