1 #ifndef lint 2 static char vcid[] = "$Id: baijov.c,v 1.6 1996/07/09 22:50:34 balay Exp balay $"; 3 #endif 4 /* 5 Routines to compute overlapping regions of a parallel MPI matrix 6 and to find submatrices that were shared across processors. 7 */ 8 #include "mpibaij.h" 9 #include "src/inline/bitarray.h" 10 11 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *); 12 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**); 13 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* ); 14 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 15 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 16 17 static int MatCompressIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 18 { 19 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 20 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 21 char *table; 22 23 Nbs = baij->Nbs; 24 table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table); 25 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx); 26 27 for ( i=0; i<imax; i++ ) { 28 isz = 0; 29 PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char)); 30 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 31 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 32 for ( j=0; j<n ; ++j){ 33 ival = idx[j]/bs; /* convert the indices into block indices */ 34 if (ival>Nbs) SETERRQ(1,"MatIncreaseOverlap_SeqBAIJ: index greater than mat-dim"); 35 if(!BT_LOOKUP(table, ival)) { nidx[isz++] = ival;} 36 } 37 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 38 ierr = ISCreateSeq(MPI_COMM_SELF, isz, nidx, (is_out+i)); CHKERRQ(ierr); 39 } 40 PetscFree(table); 41 PetscFree(nidx); 42 return 0; 43 } 44 45 static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 46 { 47 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 48 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 49 50 Nbs = baij->Nbs; 51 52 nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx); 53 54 for ( i=0; i<imax; i++ ) { 55 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 56 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 57 for ( j=0; j<n ; ++j){ 58 for ( k=0; k<bs; k++ ) 59 nidx[j*bs+k] = idx[j]*bs+k; 60 } 61 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 62 ierr = ISCreateSeq(MPI_COMM_SELF, n*bs, nidx, (is_out+i)); CHKERRQ(ierr); 63 } 64 PetscFree(nidx); 65 return 0; 66 } 67 68 69 int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov) 70 { 71 int i, ierr; 72 IS *is_new; 73 74 is_new = (IS *)PetscMalloc(imax*sizeof(IS)); CHKPTRQ(is_new); 75 /* Convert the indices into block format */ 76 ierr = MatCompressIndices_MPIBAIJ(C, imax, is,is_new); CHKERRQ(ierr); 77 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIBAIJ:Negative overlap specified\n");} 78 for ( i=0; i<ov; ++i ) { 79 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is_new); CHKERRQ(ierr); 80 } 81 for ( i=0; i<imax; i++ ) ISDestroy(is[i]); 82 ierr = MatExpandIndices_MPIBAIJ(C, imax, is_new,is); CHKERRQ(ierr); 83 for ( i=0; i<imax; i++ ) ISDestroy(is_new[i]); 84 PetscFree(is_new); 85 return 0; 86 } 87 88 /* 89 Sample message format: 90 If a processor A wants processor B to process some elements corresponding 91 to index sets 1s[1], is[5] 92 mesg [0] = 2 ( no of index sets in the mesg) 93 ----------- 94 mesg [1] = 1 => is[1] 95 mesg [2] = sizeof(is[1]); 96 ----------- 97 mesg [5] = 5 => is[5] 98 mesg [6] = sizeof(is[5]); 99 ----------- 100 mesg [7] 101 mesg [n] datas[1] 102 ----------- 103 mesg[n+1] 104 mesg[m] data(is[5]) 105 ----------- 106 107 Notes: 108 nrqs - no of requests sent (or to be sent out) 109 nrqr - no of requests recieved (which have to be or which have been processed 110 */ 111 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is) 112 { 113 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 114 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 115 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 116 int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 117 char **table; 118 MPI_Comm comm; 119 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 120 MPI_Status *s_status,*recv_status; 121 122 comm = C->comm; 123 tag = C->tag; 124 size = c->size; 125 rank = c->rank; 126 Mbs = c->Mbs; 127 128 len = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int); 129 idx = (int **) PetscMalloc(len); CHKPTRQ(idx); 130 n = (int *) (idx + imax); 131 rtable = n + imax; 132 133 for ( i=0; i<imax; i++ ) { 134 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 135 ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr); 136 } 137 138 /* Create hash table for the mapping :row -> proc*/ 139 for ( i=0,j=0; i<size; i++ ) { 140 len = c->rowners[i+1]; 141 for ( ; j<len; j++ ) { 142 rtable[j] = i; 143 } 144 } 145 146 /* evaluate communication - mesg to who, length of mesg, and buffer space 147 required. Based on this, buffers are allocated, and data copied into them*/ 148 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 149 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 150 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 151 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 152 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 153 for ( i=0; i<imax; i++ ) { 154 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 155 idx_i = idx[i]; 156 len = n[i]; 157 for ( j=0; j<len; j++ ) { 158 row = idx_i[j]; 159 proc = rtable[row]; 160 w4[proc]++; 161 } 162 for ( j=0; j<size; j++ ){ 163 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 164 } 165 } 166 167 nrqs = 0; /* no of outgoing messages */ 168 msz = 0; /* total mesg length (for all proc */ 169 w1[rank] = 0; /* no mesg sent to intself */ 170 w3[rank] = 0; 171 for ( i=0; i<size; i++ ) { 172 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 173 } 174 /* pa - is list of processors to communicate with */ 175 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 176 for ( i=0,j=0; i<size; i++ ) { 177 if (w1[i]) {pa[j] = i; j++;} 178 } 179 180 /* Each message would have a header = 1 + 2*(no of IS) + data */ 181 for ( i=0; i<nrqs; i++ ) { 182 j = pa[i]; 183 w1[j] += w2[j] + 2*w3[j]; 184 msz += w1[j]; 185 } 186 187 188 /* Do a global reduction to determine how many messages to expect*/ 189 { 190 int *rw1, *rw2; 191 rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 192 rw2 = rw1+size; 193 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 194 bsz = rw1[rank]; 195 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 196 nrqr = rw2[rank]; 197 PetscFree(rw1); 198 } 199 200 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 201 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 202 rbuf = (int**) PetscMalloc(len); CHKPTRQ(rbuf); 203 rbuf[0] = (int *) (rbuf + nrqr); 204 for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz; 205 206 /* Post the receives */ 207 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 208 CHKPTRQ(r_waits1); 209 for ( i=0; i<nrqr; ++i ){ 210 MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 211 } 212 213 /* Allocate Memory for outgoing messages */ 214 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 215 outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat); 216 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 217 PetscMemzero(outdat,2*size*sizeof(int*)); 218 tmp = (int *) (outdat + 2*size); 219 ctr = tmp + msz; 220 221 { 222 int *iptr = tmp,ict = 0; 223 for ( i=0; i<nrqs; i++ ) { 224 j = pa[i]; 225 iptr += ict; 226 outdat[j] = iptr; 227 ict = w1[j]; 228 } 229 } 230 231 /* Form the outgoing messages */ 232 /*plug in the headers*/ 233 for ( i=0; i<nrqs; i++ ) { 234 j = pa[i]; 235 outdat[j][0] = 0; 236 PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int)); 237 ptr[j] = outdat[j] + 2*w3[j] + 1; 238 } 239 240 /* Memory for doing local proc's work*/ 241 { 242 int *d_p; 243 char *t_p; 244 245 len = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) + 246 (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char); 247 table = (char **)PetscMalloc(len); CHKPTRQ(table); 248 data = (int **)(table + imax); 249 data[0] = (int *)(data + imax); 250 isz = data[0] + (Mbs+1)*imax; 251 table[0] = (char *)(isz + imax); 252 d_p = data[0]; t_p = table[0]; 253 for ( i=1; i<imax; i++ ) { 254 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 255 data[i] = d_p + (Mbs+1)*i; 256 } 257 } 258 PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax); 259 PetscMemzero(isz,imax*sizeof(int)); 260 261 /* Parse the IS and update local tables and the outgoing buf with the data*/ 262 { 263 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 264 char *table_i; 265 266 for ( i=0; i<imax; i++ ) { 267 PetscMemzero(ctr,size*sizeof(int)); 268 n_i = n[i]; 269 table_i = table[i]; 270 idx_i = idx[i]; 271 data_i = data[i]; 272 isz_i = isz[i]; 273 for ( j=0; j<n_i; j++ ) { /* parse the indices of each IS */ 274 row = idx_i[j]; 275 proc = rtable[row]; 276 if (proc != rank) { /* copy to the outgoing buffer */ 277 ctr[proc]++; 278 *ptr[proc] = row; 279 ptr[proc]++; 280 } 281 else { /* Update the local table */ 282 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 283 } 284 } 285 /* Update the headers for the current IS */ 286 for ( j=0; j<size; j++ ) { /* Can Optimise this loop by using pa[] */ 287 if ((ctr_j = ctr[j])) { 288 outdat_j = outdat[j]; 289 k = ++outdat_j[0]; 290 outdat_j[2*k] = ctr_j; 291 outdat_j[2*k-1] = i; 292 } 293 } 294 isz[i] = isz_i; 295 } 296 } 297 298 299 300 /* Now post the sends */ 301 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 302 CHKPTRQ(s_waits1); 303 for ( i=0; i<nrqs; ++i ) { 304 j = pa[i]; 305 MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 306 } 307 308 /* No longer need the original indices*/ 309 for ( i=0; i<imax; ++i ) { 310 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 311 } 312 PetscFree(idx); 313 314 for ( i=0; i<imax; ++i ) { 315 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 316 } 317 318 /* Do Local work*/ 319 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 320 321 /* Receive messages*/ 322 { 323 int index; 324 325 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 326 CHKPTRQ(recv_status); 327 for ( i=0; i<nrqr; ++i ) { 328 MPI_Waitany(nrqr, r_waits1, &index, recv_status+i); 329 } 330 331 s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 332 CHKPTRQ(s_status); 333 MPI_Waitall(nrqs,s_waits1,s_status); 334 } 335 336 /* Phase 1 sends are complete - deallocate buffers */ 337 PetscFree(outdat); 338 PetscFree(w1); 339 340 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 341 isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1); 342 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 343 PetscFree(rbuf); 344 345 /* Send the data back*/ 346 /* Do a global reduction to know the buffer space req for incoming messages*/ 347 { 348 int *rw1, *rw2; 349 350 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 351 PetscMemzero(rw1,2*size*sizeof(int)); 352 rw2 = rw1+size; 353 for ( i=0; i<nrqr; ++i ) { 354 proc = recv_status[i].MPI_SOURCE; 355 rw1[proc] = isz1[i]; 356 } 357 358 MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm); 359 bsz1 = rw2[rank]; 360 PetscFree(rw1); 361 } 362 363 /* Allocate buffers*/ 364 365 /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 366 len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 367 rbuf2 = (int**) PetscMalloc(len); CHKPTRQ(rbuf2); 368 rbuf2[0] = (int *) (rbuf2 + nrqs); 369 for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1; 370 371 /* Post the receives */ 372 r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 373 CHKPTRQ(r_waits2); 374 for ( i=0; i<nrqs; ++i ) { 375 MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i); 376 } 377 378 /* Now post the sends */ 379 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 380 CHKPTRQ(s_waits2); 381 for ( i=0; i<nrqr; ++i ) { 382 j = recv_status[i].MPI_SOURCE; 383 MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i); 384 } 385 386 /* receive work done on other processors*/ 387 { 388 int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 389 char *table_i; 390 MPI_Status *status2; 391 392 status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 393 394 for ( i=0; i<nrqs; ++i ) { 395 MPI_Waitany(nrqs, r_waits2, &index, status2+i); 396 /* Process the message*/ 397 rbuf2_i = rbuf2[index]; 398 ct1 = 2*rbuf2_i[0]+1; 399 jmax = rbuf2[index][0]; 400 for ( j=1; j<=jmax; j++ ) { 401 max = rbuf2_i[2*j]; 402 is_no = rbuf2_i[2*j-1]; 403 isz_i = isz[is_no]; 404 data_i = data[is_no]; 405 table_i = table[is_no]; 406 for ( k=0; k<max; k++,ct1++ ) { 407 row = rbuf2_i[ct1]; 408 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 409 } 410 isz[is_no] = isz_i; 411 } 412 } 413 MPI_Waitall(nrqr,s_waits2,status2); 414 PetscFree(status2); 415 } 416 417 for ( i=0; i<imax; ++i ) { 418 ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 419 } 420 421 PetscFree(pa); 422 PetscFree(rbuf2); 423 PetscFree(s_waits1); 424 PetscFree(r_waits1); 425 PetscFree(s_waits2); 426 PetscFree(r_waits2); 427 PetscFree(table); 428 PetscFree(s_status); 429 PetscFree(recv_status); 430 PetscFree(xdata[0]); 431 PetscFree(xdata); 432 PetscFree(isz1); 433 return 0; 434 } 435 436 /* 437 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 438 the work on the local processor. 439 440 Inputs: 441 C - MAT_MPIBAIJ; 442 imax - total no of index sets processed at a time; 443 table - an array of char - size = Mbs bits. 444 445 Output: 446 isz - array containing the count of the solution elements correspondign 447 to each index set; 448 data - pointer to the solutions 449 */ 450 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz, 451 int **data) 452 { 453 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 454 Mat A = c->A, B = c->B; 455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 456 int start, end, val, max, rstart,cstart,*ai, *aj; 457 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 458 char *table_i; 459 460 rstart = c->rstart; 461 cstart = c->cstart; 462 ai = a->i; 463 aj = a->j; 464 bi = b->i; 465 bj = b->j; 466 garray = c->garray; 467 468 469 for ( i=0; i<imax; i++ ) { 470 data_i = data[i]; 471 table_i = table[i]; 472 isz_i = isz[i]; 473 for ( j=0, max=isz[i]; j<max; j++ ) { 474 row = data_i[j] - rstart; 475 start = ai[row]; 476 end = ai[row+1]; 477 for ( k=start; k<end; k++ ) { /* Amat */ 478 val = aj[k] + cstart; 479 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 480 } 481 start = bi[row]; 482 end = bi[row+1]; 483 for ( k=start; k<end; k++ ) { /* Bmat */ 484 val = garray[bj[k]]; 485 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 486 } 487 } 488 isz[i] = isz_i; 489 } 490 return 0; 491 } 492 /* 493 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 494 and return the output 495 496 Input: 497 C - the matrix 498 nrqr - no of messages being processed. 499 rbuf - an array of pointers to the recieved requests 500 501 Output: 502 xdata - array of messages to be sent back 503 isz1 - size of each message 504 505 For better efficiency perhaps we should malloc seperately each xdata[i], 506 then if a remalloc is required we need only copy the data for that one row 507 rather then all previous rows as it is now where a single large chunck of 508 memory is used. 509 510 */ 511 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf, 512 int **xdata, int * isz1) 513 { 514 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 515 Mat A = c->A, B = c->B; 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 517 int rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k; 518 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 519 int val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr; 520 int *rbuf_i,kmax,rbuf_0; 521 char *xtable; 522 523 rank = c->rank; 524 Mbs = c->Mbs; 525 rstart = c->rstart; 526 cstart = c->cstart; 527 ai = a->i; 528 aj = a->j; 529 bi = b->i; 530 bj = b->j; 531 garray = c->garray; 532 533 534 for ( i=0,ct=0,total_sz=0; i<nrqr; ++i ) { 535 rbuf_i = rbuf[i]; 536 rbuf_0 = rbuf_i[0]; 537 ct += rbuf_0; 538 for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; } 539 } 540 541 max1 = ct*(a->nz +b->nz)/c->Mbs; 542 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 543 xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]); 544 ++no_malloc; 545 xtable = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable); 546 PetscMemzero(isz1,nrqr*sizeof(int)); 547 548 ct3 = 0; 549 for ( i=0; i<nrqr; i++ ) { /* for easch mesg from proc i */ 550 rbuf_i = rbuf[i]; 551 rbuf_0 = rbuf_i[0]; 552 ct1 = 2*rbuf_0+1; 553 ct2 = ct1; 554 ct3 += ct1; 555 for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i*/ 556 PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char)); 557 oct2 = ct2; 558 kmax = rbuf_i[2*j]; 559 for ( k=0; k<kmax; k++, ct1++ ) { 560 row = rbuf_i[ct1]; 561 if (!BT_LOOKUP(xtable,row)) { 562 if (!(ct3 < mem_estimate)) { 563 new_estimate = (int)(1.5*mem_estimate)+1; 564 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 565 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 566 PetscFree(xdata[0]); 567 xdata[0] = tmp; 568 mem_estimate = new_estimate; ++no_malloc; 569 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 570 } 571 xdata[i][ct2++] = row; 572 ct3++; 573 } 574 } 575 for ( k=oct2,max2=ct2; k<max2; k++ ) { 576 row = xdata[i][k] - rstart; 577 start = ai[row]; 578 end = ai[row+1]; 579 for ( l=start; l<end; l++ ) { 580 val = aj[l] + cstart; 581 if (!BT_LOOKUP(xtable,val)) { 582 if (!(ct3 < mem_estimate)) { 583 new_estimate = (int)(1.5*mem_estimate)+1; 584 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 585 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 586 PetscFree(xdata[0]); 587 xdata[0] = tmp; 588 mem_estimate = new_estimate; ++no_malloc; 589 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 590 } 591 xdata[i][ct2++] = val; 592 ct3++; 593 } 594 } 595 start = bi[row]; 596 end = bi[row+1]; 597 for ( l=start; l<end; l++ ) { 598 val = garray[bj[l]]; 599 if (!BT_LOOKUP(xtable,val)) { 600 if (!(ct3 < mem_estimate)) { 601 new_estimate = (int)(1.5*mem_estimate)+1; 602 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 603 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 604 PetscFree(xdata[0]); 605 xdata[0] = tmp; 606 mem_estimate = new_estimate; ++no_malloc; 607 for ( ctr =1; ctr <=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 608 } 609 xdata[i][ct2++] = val; 610 ct3++; 611 } 612 } 613 } 614 /* Update the header*/ 615 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 616 xdata[i][2*j-1] = rbuf_i[2*j-1]; 617 } 618 xdata[i][0] = rbuf_0; 619 xdata[i+1] = xdata[i] + ct2; 620 isz1[i] = ct2; /* size of each message */ 621 } 622 PetscFree(xtable); 623 PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 624 return 0; 625 } 626 627 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **); 628 629 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 630 MatGetSubMatrixCall scall,Mat **submat) 631 { 632 int i,ierr; 633 IS *isrow_new,*iscol_new; 634 635 /* The compression and expansion should be avoided. Dos'nt point 636 out errors might change the indices hence buggey */ 637 638 isrow_new = (IS *)PetscMalloc(2*ismax*sizeof(IS)); CHKPTRQ(isrow_new); 639 iscol_new = isrow_new + ismax; 640 ierr = MatCompressIndices_MPIBAIJ(C, ismax, isrow,isrow_new); CHKERRQ(ierr); 641 ierr = MatCompressIndices_MPIBAIJ(C, ismax, iscol,iscol_new); CHKERRQ(ierr); 642 ierr = MatGetSubMatrices_MPIBAIJ_local(C,ismax,isrow_new,iscol_new,scall,submat); CHKERRQ(ierr); 643 644 for ( i=0; i<ismax; i++) { 645 ISDestroy(isrow_new[i]); 646 ISDestroy(iscol_new[i]); 647 } 648 PetscFree(isrow_new); 649 return 0; 650 } 651 652 /* -------------------------------------------------------------------------*/ 653 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol, 654 MatGetSubMatrixCall scall,Mat **submat) 655 { 656 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 657 Mat A = c->A,*submats = *submat; 658 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat; 659 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 660 int **sbuf1,**sbuf2, rank, Mbs,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 661 int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr; 662 int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 663 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 664 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 665 int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA, *cworkB; 666 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 667 int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3; 668 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 669 MPI_Request *r_waits4,*s_waits3,*s_waits4; 670 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 671 MPI_Status *r_status3,*r_status4,*s_status4; 672 MPI_Comm comm; 673 Scalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 674 Scalar *a_a=a->a,*b_a=b->a; 675 676 comm = C->comm; 677 tag0 = C->tag; 678 size = c->size; 679 rank = c->rank; 680 Mbs = c->Mbs; 681 682 /* Get some new tags to keep the communication clean */ 683 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 684 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 685 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 686 687 /* Check if the col indices are sorted */ 688 for ( i=0; i<ismax; i++ ) { 689 ierr = ISSorted(iscol[i],(PetscTruth*)&j); 690 if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted"); 691 } 692 693 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int); 694 irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 695 icol = irow + ismax; 696 nrow = (int *) (icol + ismax); 697 ncol = nrow + ismax; 698 rtable = ncol + ismax; 699 700 for ( i=0; i<ismax; i++ ) { 701 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 702 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 703 ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 704 ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 705 } 706 707 /* Create hash table for the mapping :row -> proc*/ 708 for ( i=0,j=0; i<size; i++ ) { 709 jmax = c->rowners[i+1]; 710 for ( ; j<jmax; j++ ) { 711 rtable[j] = i; 712 } 713 } 714 715 /* evaluate communication - mesg to who, length of mesg, and buffer space 716 required. Based on this, buffers are allocated, and data copied into them*/ 717 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 718 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 719 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 720 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 721 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 722 for ( i=0; i<ismax; i++ ) { 723 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 724 jmax = nrow[i]; 725 irow_i = irow[i]; 726 for ( j=0; j<jmax; j++ ) { 727 row = irow_i[j]; 728 proc = rtable[row]; 729 w4[proc]++; 730 } 731 for ( j=0; j<size; j++ ) { 732 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 733 } 734 } 735 736 nrqs = 0; /* no of outgoing messages */ 737 msz = 0; /* total mesg length (for all proc */ 738 w1[rank] = 0; /* no mesg sent to intself */ 739 w3[rank] = 0; 740 for ( i=0; i<size; i++ ) { 741 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 742 } 743 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 744 for ( i=0, j=0; i<size; i++ ) { 745 if (w1[i]) { pa[j] = i; j++; } 746 } 747 748 /* Each message would have a header = 1 + 2*(no of IS) + data */ 749 for ( i=0; i<nrqs; i++ ) { 750 j = pa[i]; 751 w1[j] += w2[j] + 2* w3[j]; 752 msz += w1[j]; 753 } 754 /* Do a global reduction to determine how many messages to expect*/ 755 { 756 int *rw1, *rw2; 757 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 758 rw2 = rw1+size; 759 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 760 bsz = rw1[rank]; 761 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 762 nrqr = rw2[rank]; 763 PetscFree(rw1); 764 } 765 766 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 767 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 768 rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 769 rbuf1[0] = (int *) (rbuf1 + nrqr); 770 for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz; 771 772 /* Post the receives */ 773 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 774 CHKPTRQ(r_waits1); 775 for ( i=0; i<nrqr; ++i ) { 776 MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i); 777 } 778 779 /* Allocate Memory for outgoing messages */ 780 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 781 sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 782 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 783 PetscMemzero(sbuf1,2*size*sizeof(int*)); 784 /* allocate memory for outgoing data + buf to receive the first reply */ 785 tmp = (int *) (ptr + size); 786 ctr = tmp + 2*msz; 787 788 { 789 790 int *iptr = tmp,ict = 0; 791 for ( i=0; i<nrqs; i++ ) { 792 j = pa[i]; 793 iptr += ict; 794 sbuf1[j] = iptr; 795 ict = w1[j]; 796 } 797 } 798 799 /* Form the outgoing messages */ 800 /* Initialise the header space */ 801 for ( i=0; i<nrqs; i++ ) { 802 j = pa[i]; 803 sbuf1[j][0] = 0; 804 PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 805 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 806 } 807 808 /* Parse the isrow and copy data into outbuf */ 809 for ( i=0; i<ismax; i++ ) { 810 PetscMemzero(ctr,size*sizeof(int)); 811 irow_i = irow[i]; 812 jmax = nrow[i]; 813 for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */ 814 row = irow_i[j]; 815 proc = rtable[row]; 816 if (proc != rank) { /* copy to the outgoing buf*/ 817 ctr[proc]++; 818 *ptr[proc] = row; 819 ptr[proc]++; 820 } 821 } 822 /* Update the headers for the current IS */ 823 for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */ 824 if ((ctr_j = ctr[j])) { 825 sbuf1_j = sbuf1[j]; 826 k = ++sbuf1_j[0]; 827 sbuf1_j[2*k] = ctr_j; 828 sbuf1_j[2*k-1] = i; 829 } 830 } 831 } 832 833 /* Now post the sends */ 834 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 835 CHKPTRQ(s_waits1); 836 for ( i=0; i<nrqs; ++i ) { 837 j = pa[i]; 838 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 839 MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i); 840 } 841 842 /* Post Recieves to capture the buffer size */ 843 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 844 CHKPTRQ(r_waits2); 845 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 846 rbuf2[0] = tmp + msz; 847 for ( i=1; i<nrqs; ++i ) { 848 j = pa[i]; 849 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 850 } 851 for ( i=0; i<nrqs; ++i ) { 852 j = pa[i]; 853 MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i); 854 } 855 856 /* Send to other procs the buf size they should allocate */ 857 858 859 /* Receive messages*/ 860 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 861 CHKPTRQ(s_waits2); 862 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 863 CHKPTRQ(r_status1); 864 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 865 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 866 req_size = (int *) (sbuf2 + nrqr); 867 req_source = req_size + nrqr; 868 869 { 870 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 871 int *sAi = sA->i, *sBi = sB->i, id; 872 int *sbuf2_i; 873 874 for ( i=0; i<nrqr; ++i ) { 875 MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 876 req_size[index] = 0; 877 rbuf1_i = rbuf1[index]; 878 start = 2*rbuf1_i[0] + 1; 879 MPI_Get_count(r_status1+i,MPI_INT, &end); 880 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 881 sbuf2_i = sbuf2[index]; 882 for ( j=start; j<end; j++ ) { 883 id = rbuf1_i[j] - rstart; 884 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 885 sbuf2_i[j] = ncols; 886 req_size[index] += ncols; 887 } 888 req_source[index] = r_status1[i].MPI_SOURCE; 889 /* form the header */ 890 sbuf2_i[0] = req_size[index]; 891 for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; } 892 MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); 893 } 894 } 895 PetscFree(r_status1); PetscFree(r_waits1); 896 897 /* recv buffer sizes */ 898 /* Receive messages*/ 899 900 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 901 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 902 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 903 CHKPTRQ(r_waits3); 904 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 905 CHKPTRQ(r_waits4); 906 r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 907 CHKPTRQ(r_status2); 908 909 for ( i=0; i<nrqs; ++i ) { 910 MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 911 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 912 CHKPTRQ(rbuf3[index]); 913 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar)); 914 CHKPTRQ(rbuf4[index]); 915 MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 916 r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); 917 MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR, 918 r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); 919 } 920 PetscFree(r_status2); PetscFree(r_waits2); 921 922 /* Wait on sends1 and sends2 */ 923 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 924 CHKPTRQ(s_status1); 925 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 926 CHKPTRQ(s_status2); 927 928 MPI_Waitall(nrqs,s_waits1,s_status1); 929 MPI_Waitall(nrqr,s_waits2,s_status2); 930 PetscFree(s_status1); PetscFree(s_status2); 931 PetscFree(s_waits1); PetscFree(s_waits2); 932 933 /* Now allocate buffers for a->j, and send them off */ 934 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 935 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 936 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 937 for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 938 939 s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 940 CHKPTRQ(s_waits3); 941 { 942 for ( i=0; i<nrqr; i++ ) { 943 rbuf1_i = rbuf1[i]; 944 sbuf_aj_i = sbuf_aj[i]; 945 ct1 = 2*rbuf1_i[0] + 1; 946 ct2 = 0; 947 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 948 kmax = rbuf1[i][2*j]; 949 for ( k=0; k<kmax; k++,ct1++ ) { 950 row = rbuf1_i[ct1] - cstart; 951 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 952 ncols = nzA + nzB; 953 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 954 955 /* load the column indices for this row into cols*/ 956 cols = sbuf_aj_i + ct2; 957 for ( l=0; l<nzB; l++ ) { 958 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 959 else break; 960 } 961 imark = l; 962 for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l]; 963 for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]]; 964 ct2 += ncols; 965 } 966 } 967 MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i); 968 } 969 } 970 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 971 CHKPTRQ(r_status3); 972 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 973 CHKPTRQ(s_status3); 974 975 /* Allocate buffers for a->a, and send them off */ 976 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 977 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 978 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 979 for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 980 981 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 982 CHKPTRQ(s_waits4); 983 { 984 for ( i=0; i<nrqr; i++ ) { 985 rbuf1_i = rbuf1[i]; 986 sbuf_aa_i = sbuf_aa[i]; 987 ct1 = 2*rbuf1_i[0]+1; 988 ct2 = 0; 989 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 990 kmax = rbuf1_i[2*j]; 991 for ( k=0; k<kmax; k++,ct1++ ) { 992 row = rbuf1_i[ct1] - cstart; 993 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 994 ncols = nzA + nzB; 995 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 996 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 997 998 /* load the column values for this row into vals*/ 999 vals = sbuf_aa_i+ct2*bs2; 1000 for ( l=0; l<nzB; l++ ) { 1001 if ((bmap[cworkB[l]]) < cstart) 1002 PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1003 else break; 1004 } 1005 imark = l; 1006 for ( l=0; l<nzA; l++ ) 1007 PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar)); 1008 for ( l=imark; l<nzB; l++ ) 1009 PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1010 ct2 += ncols; 1011 } 1012 } 1013 MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i); 1014 } 1015 } 1016 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1017 CHKPTRQ(r_status4); 1018 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1019 CHKPTRQ(s_status4); 1020 PetscFree(rbuf1); 1021 1022 /* Form the matrix */ 1023 /* create col map */ 1024 { 1025 int *icol_i; 1026 1027 len = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int); 1028 cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 1029 cmap[0] = (int *)(cmap + ismax); 1030 PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int)); 1031 for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->Nbs; } 1032 for ( i=0; i<ismax; i++ ) { 1033 jmax = ncol[i]; 1034 icol_i = icol[i]; 1035 cmap_i = cmap[i]; 1036 for ( j=0; j<jmax; j++ ) { 1037 cmap_i[icol_i[j]] = j+1; 1038 } 1039 } 1040 } 1041 1042 1043 /* Create lens which is required for MatCreate... */ 1044 for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; } 1045 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1046 lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 1047 lens[0] = (int *)(lens + ismax); 1048 PetscMemzero(lens[0], j*sizeof(int)); 1049 for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; } 1050 1051 /* Update lens from local data */ 1052 for ( i=0; i<ismax; i++ ) { 1053 jmax = nrow[i]; 1054 cmap_i = cmap[i]; 1055 irow_i = irow[i]; 1056 lens_i = lens[i]; 1057 for ( j=0; j<jmax; j++ ) { 1058 row = irow_i[j]; 1059 proc = rtable[row]; 1060 if (proc == rank) { 1061 /* Get indices from matA and then from matB */ 1062 row = row - rstart; 1063 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1064 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1065 for ( k=0; k<nzA; k++ ) 1066 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1067 for ( k=0; k<nzB; k++ ) 1068 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1069 } 1070 } 1071 } 1072 1073 /* Create row map*/ 1074 len = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int); 1075 rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1076 rmap[0] = (int *)(rmap + ismax); 1077 PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int)); 1078 for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->Mbs;} 1079 for ( i=0; i<ismax; i++ ) { 1080 rmap_i = rmap[i]; 1081 irow_i = irow[i]; 1082 jmax = nrow[i]; 1083 for ( j=0; j<jmax; j++ ) { 1084 rmap_i[irow_i[j]] = j; 1085 } 1086 } 1087 1088 /* Update lens from offproc data */ 1089 { 1090 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1091 1092 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1093 MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1094 index = pa[i]; 1095 sbuf1_i = sbuf1[index]; 1096 jmax = sbuf1_i[0]; 1097 ct1 = 2*jmax+1; 1098 ct2 = 0; 1099 rbuf2_i = rbuf2[i]; 1100 rbuf3_i = rbuf3[i]; 1101 for ( j=1; j<=jmax; j++ ) { 1102 is_no = sbuf1_i[2*j-1]; 1103 max1 = sbuf1_i[2*j]; 1104 lens_i = lens[is_no]; 1105 cmap_i = cmap[is_no]; 1106 rmap_i = rmap[is_no]; 1107 for ( k=0; k<max1; k++,ct1++ ) { 1108 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1109 max2 = rbuf2_i[ct1]; 1110 for ( l=0; l<max2; l++,ct2++ ) { 1111 if (cmap_i[rbuf3_i[ct2]]) { 1112 lens_i[row]++; 1113 } 1114 } 1115 } 1116 } 1117 } 1118 } 1119 PetscFree(r_status3); PetscFree(r_waits3); 1120 MPI_Waitall(nrqr,s_waits3,s_status3); 1121 PetscFree(s_status3); PetscFree(s_waits3); 1122 1123 /* Create the submatrices */ 1124 if (scall == MAT_REUSE_MATRIX) { 1125 /* 1126 Assumes new rows are same length as the old rows, hence bug! 1127 */ 1128 for ( i=0; i<ismax; i++ ) { 1129 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1130 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1131 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size"); 1132 } 1133 if (PetscMemcmp(mat->ilen,lens[i], mat->mbs *sizeof(int))) { 1134 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1135 } 1136 /* Initial matrix as if empty */ 1137 PetscMemzero(mat->ilen,mat->mbs*sizeof(int)); 1138 } 1139 } 1140 else { 1141 *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1142 for ( i=0; i<ismax; i++ ) { 1143 ierr = MatCreateSeqBAIJ(comm,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr); 1144 } 1145 } 1146 1147 /* Assemble the matrices */ 1148 /* First assemble the local rows */ 1149 { 1150 int ilen_row,*imat_ilen, *imat_j, *imat_i; 1151 Scalar *imat_a; 1152 1153 for ( i=0; i<ismax; i++ ) { 1154 mat = (Mat_SeqBAIJ *) submats[i]->data; 1155 imat_ilen = mat->ilen; 1156 imat_j = mat->j; 1157 imat_i = mat->i; 1158 imat_a = mat->a; 1159 cmap_i = cmap[i]; 1160 rmap_i = rmap[i]; 1161 irow_i = irow[i]; 1162 jmax = nrow[i]; 1163 for ( j=0; j<jmax; j++ ) { 1164 row = irow_i[j]; 1165 proc = rtable[row]; 1166 if (proc == rank) { 1167 row = row - rstart; 1168 nzA = a_i[row+1] - a_i[row]; 1169 nzB = b_i[row+1] - b_i[row]; 1170 cworkA = a_j + a_i[row]; 1171 cworkB = b_j + b_i[row]; 1172 vworkA = a_a + a_i[row]*bs2; 1173 vworkB = b_a + b_i[row]*bs2; 1174 1175 row = rmap_i[row + rstart]; 1176 mat_i = imat_i[row]; 1177 mat_a = imat_a + mat_i*bs2; 1178 mat_j = imat_j + mat_i; 1179 ilen_row = imat_ilen[row]; 1180 1181 /* load the column indices for this row into cols*/ 1182 for ( l=0; l<nzB; l++ ) { 1183 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1184 if ((tcol = cmap_i[ctmp])) { 1185 *mat_j++ = tcol - 1; 1186 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1187 ilen_row++; 1188 } 1189 } 1190 else break; 1191 } 1192 imark = l; 1193 for ( l=0; l<nzA; l++ ) { 1194 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1195 *mat_j++ = tcol - 1; 1196 PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1197 ilen_row++; 1198 } 1199 } 1200 for ( l=imark; l<nzB; l++ ) { 1201 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1202 *mat_j++ = tcol - 1; 1203 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1204 ilen_row++; 1205 } 1206 } 1207 imat_ilen[row] = ilen_row; 1208 } 1209 } 1210 1211 } 1212 } 1213 1214 /* Now assemble the off proc rows*/ 1215 { 1216 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1217 int *imat_j,*imat_i; 1218 Scalar *imat_a,*rbuf4_i; 1219 1220 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1221 MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1222 index = pa[i]; 1223 sbuf1_i = sbuf1[index]; 1224 jmax = sbuf1_i[0]; 1225 ct1 = 2*jmax + 1; 1226 ct2 = 0; 1227 rbuf2_i = rbuf2[i]; 1228 rbuf3_i = rbuf3[i]; 1229 rbuf4_i = rbuf4[i]; 1230 for ( j=1; j<=jmax; j++ ) { 1231 is_no = sbuf1_i[2*j-1]; 1232 rmap_i = rmap[is_no]; 1233 cmap_i = cmap[is_no]; 1234 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1235 imat_ilen = mat->ilen; 1236 imat_j = mat->j; 1237 imat_i = mat->i; 1238 imat_a = mat->a; 1239 max1 = sbuf1_i[2*j]; 1240 for ( k=0; k<max1; k++, ct1++ ) { 1241 row = sbuf1_i[ct1]; 1242 row = rmap_i[row]; 1243 ilen = imat_ilen[row]; 1244 mat_i = imat_i[row]; 1245 mat_a = imat_a + mat_i*bs2; 1246 mat_j = imat_j + mat_i; 1247 max2 = rbuf2_i[ct1]; 1248 for ( l=0; l<max2; l++,ct2++ ) { 1249 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1250 *mat_j++ = tcol - 1; 1251 /* *mat_a++ = rbuf4_i[ct2]; */ 1252 PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1253 ilen++; 1254 } 1255 } 1256 imat_ilen[row] = ilen; 1257 } 1258 } 1259 } 1260 } 1261 PetscFree(r_status4); PetscFree(r_waits4); 1262 MPI_Waitall(nrqr,s_waits4,s_status4); 1263 PetscFree(s_waits4); PetscFree(s_status4); 1264 1265 /* Restore the indices */ 1266 for ( i=0; i<ismax; i++ ) { 1267 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1268 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1269 } 1270 1271 /* Destroy allocated memory */ 1272 PetscFree(irow); 1273 PetscFree(w1); 1274 PetscFree(pa); 1275 1276 PetscFree(sbuf1); 1277 PetscFree(rbuf2); 1278 for ( i=0; i<nrqr; ++i ) { 1279 PetscFree(sbuf2[i]); 1280 } 1281 for ( i=0; i<nrqs; ++i ) { 1282 PetscFree(rbuf3[i]); 1283 PetscFree(rbuf4[i]); 1284 } 1285 1286 PetscFree(sbuf2); 1287 PetscFree(rbuf3); 1288 PetscFree(rbuf4 ); 1289 PetscFree(sbuf_aj[0]); 1290 PetscFree(sbuf_aj); 1291 PetscFree(sbuf_aa[0]); 1292 PetscFree(sbuf_aa); 1293 1294 PetscFree(cmap); 1295 PetscFree(rmap); 1296 PetscFree(lens); 1297 1298 for ( i=0; i<ismax; i++ ) { 1299 ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1300 ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1301 } 1302 1303 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 1304 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 1305 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 1306 1307 return 0; 1308 } 1309