#ifndef lint
static char vcid[] = "$Id: baijov.c,v 1.3 1996/07/04 13:01:01 bsmith Exp balay $";
#endif
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "mpibaij.h"
#include "src/inline/bitarray.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat, int, char **, int *, int **);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat, int, int **, int **, int *);
extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);
extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);

static int MatCompressIndices_MPIBAIJ(Mat C, int imax, IS *is)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
  char        *table;

  Nbs   = baij->Nbs;
  table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table);
  nidx  = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx);

  for ( i=0; i<imax; i++ ) {
    isz = 0;
    PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char));
    ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n); CHKERRQ(ierr);
    for ( j=0; j<n; ++j ) {
      ival = idx[j]/bs; /* convert the indices into block indices */
      if (ival>Nbs) SETERRQ(1,"MatCompressIndices_MPIBAIJ: index greater than mat-dim");
      if (!BT_LOOKUP(table,ival)) { nidx[isz++] = ival; }
    }
    ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr);
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
    ierr = ISCreateSeq(MPI_COMM_SELF, isz, nidx, (is+i)); CHKERRQ(ierr);
  }
  PetscFree(table);
  PetscFree(nidx);
  return 0;
}

static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;

  Nbs  = baij->Nbs;
  nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx);

  for ( i=0; i<imax; i++ ) {
    ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n); CHKERRQ(ierr);
    for ( j=0; j<n; ++j ) {
      for ( k=0; k<bs; k++ )
        nidx[j*bs+k] = idx[j]*bs+k; /* expand each block index into bs point indices */
    }
    ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr);
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
    ierr = ISCreateSeq(MPI_COMM_SELF, n*bs, nidx, (is+i)); CHKERRQ(ierr);
  }
  PetscFree(nidx);
  return 0;
}

int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov)
{
  int i, ierr;

  /* Convert the indices into block format */
  ierr = MatCompressIndices_MPIBAIJ(C, imax, is); CHKERRQ(ierr);
  if (ov < 0) { SETERRQ(1,"MatIncreaseOverlap_MPIBAIJ: Negative overlap specified"); }
  for ( i=0; i<ov; ++i ) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is); CHKERRQ(ierr);
  }
  ierr = MatExpandIndices_MPIBAIJ(C, imax, is); CHKERRQ(ierr);
  return 0;
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5]:

  mesg[0] = 2            (no of index sets in the mesg)
  -----------
  mesg[1] = 1        =>  is[1]
  mesg[2] = size of is[1]
  -----------
  mesg[3] = 5        =>  is[5]
  mesg[4] = size of is[5]
  -----------
  mesg[5]
   ...                   data for is[1]
  mesg[n]
  -----------
  mesg[n+1]
   ...                   data for is[5]
  mesg[m]
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
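/*
  Illustrative walk-through of this format (a sketch only; "mesg", "nis",
  and "process()" are hypothetical names, not part of this file):

      nis = mesg[0];                      number of index sets in the message
      hdr = 1;  off = 1 + 2*nis;          data begins right after the header
      for ( i=0; i<nis; i++ ) {
        is_no = mesg[hdr++];              which index set these rows belong to
        count = mesg[hdr++];              how many row indices follow for it
        for ( j=0; j<count; j++ ) process(is_no, mesg[off++]);
      }
*/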
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, **data, len, *idx_i;
  int         size, rank, Mbs, i, j, k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr;
  int         *ctr, *pa, tag, *tmp, bsz, nrqr, *isz, *isz1, **xdata, bsz1, **rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *s_status, *recv_status;

  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  len    = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int);
  idx    = (int **) PetscMalloc(len); CHKPTRQ(idx);
  n      = (int *) (idx + imax);
  rtable = n + imax;

  for ( i=0; i<imax; i++ ) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for ( i=0,j=0; i<size; i++ ) {
    len = c->rowners[i+1];
    for ( ; j<len; j++ ) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;       /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;       /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;       /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<imax; i++ ) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for ( j=0; j<len; j++ ) {
      row  = idx_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is a list of processors to communicate with */
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa);
  for ( i=0,j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<nrqs; i++ ) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
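  /*
     A note on the two reductions above (explanatory only): entry i of w1 is the
     length of the message this process will send to process i, and w2 holds a
     0/1 flag per destination.  After the MPI_MAX reduction, rw1[rank] is the
     length of the longest message anyone will send to this process, so a single
     receive buffer size (bsz) suffices; after the MPI_SUM reduction, rw2[rank]
     counts how many processes flagged this one, i.e. how many requests (nrqr)
     to expect.  For example, with 3 processes where procs 0 and 2 send 4 and 7
     ints respectively to proc 1, proc 1 ends up with bsz = 7 and nrqr = 2.
  */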
  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  len     = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf    = (int**) PetscMalloc(len); CHKPTRQ(rbuf);
  rbuf[0] = (int *) (rbuf + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ) {
    MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);
  }

  /* Allocate memory for outgoing messages */
  len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat);
  ptr    = outdat + size;   /* Pointers to the data in outgoing buffers */
  PetscMemzero(outdat,2*size*sizeof(int*));
  tmp    = (int *) (outdat + 2*size);
  ctr    = tmp + msz;

  {
    int *iptr = tmp, ict = 0;
    for ( i=0; i<nrqs; i++ ) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Plug in the headers */
  for ( i=0; i<nrqs; i++ ) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    int  *d_p;
    char *t_p;

    len      = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) +
               (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char);
    table    = (char **)PetscMalloc(len); CHKPTRQ(table);
    data     = (int **)(table + imax);
    data[0]  = (int *)(data + imax);
    isz      = data[0] + (Mbs+1)*imax;
    table[0] = (char *)(isz + imax);
    d_p      = data[0]; t_p = table[0];
    for ( i=1; i<imax; i++ ) {
      table[i] = t_p + (Mbs/BITSPERBYTE+1)*i;
      data[i]  = d_p + (Mbs+1)*i;
    }
  }
  PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax);
  PetscMemzero(isz,imax*sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    int  n_i, *data_i, isz_i, *outdat_j, ctr_j;
    char *table_i;

    for ( i=0; i<imax; i++ ) {
      PetscMemzero(ctr,size*sizeof(int));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for ( j=0; j<n_i; j++ ) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) {      /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        }
        else {                   /* Update the local table */
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for ( j=0; j<size; j++ ) { /* Can optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);
  }

  /* No longer need the original indices */
  for ( i=0; i<imax; ++i ) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);

  for ( i=0; i<imax; ++i ) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data); CHKERRQ(ierr);

  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status);
    for ( i=0; i<nrqr; ++i ) {
      MPI_Waitany(nrqr, r_waits1, &index, recv_status+i);
    }

    s_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
    CHKPTRQ(s_status);
    MPI_Waitall(nrqs,s_waits1,s_status);
  }

  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);

  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1);
  ierr  = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1); CHKERRQ(ierr);
  PetscFree(rbuf);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero(rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for ( i=0; i<nrqr; ++i ) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  len      = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int);
  rbuf2    = (int**) PetscMalloc(len); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) (rbuf2 + nrqs);
  for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Post the receives */
  r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits2);
  for ( i=0; i<nrqs; ++i ) {
    MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i);
  }

  /* Now post the sends */
  s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits2);
  for ( i=0; i<nrqr; ++i ) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend(xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i);
  }

  /* Receive work done on other processors */
  {
    int        index, is_no, ct1, max, *rbuf2_i, isz_i, *data_i, jmax;
    char       *table_i;
    MPI_Status *status2;

    status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(status2);

    for ( i=0; i<nrqs; ++i ) {
      MPI_Waitany(nrqs, r_waits2, &index, status2+i);
      /* Process the message */
      rbuf2_i = rbuf2[index];
      ct1     = 2*rbuf2_i[0] + 1;
      jmax    = rbuf2[index][0];
      for ( j=1; j<=jmax; j++ ) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for ( k=0; k<max; k++,ct1++ ) {
          row = rbuf2_i[ct1];
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    MPI_Waitall(nrqr,s_waits2,status2);
    PetscFree(status2);
  }

  for ( i=0; i<imax; ++i ) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }

  PetscFree(pa);
  PetscFree(rbuf2);
  PetscFree(s_waits1);
  PetscFree(r_waits1);
  PetscFree(s_waits2);
  PetscFree(r_waits2);
  PetscFree(table);
  PetscFree(s_status);
  PetscFree(recv_status);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  PetscFree(isz1);
  return 0;
}
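/*
  Usage sketch (illustrative only, not part of this file): a typical caller
  builds one index set per local subdomain and asks for the overlap to be
  grown before extracting submatrices.  The public entry points
  MatIncreaseOverlap() and MatGetSubMatrices(), and the MAT_INITIAL_MATRIX
  value, are assumed names here; within this file they would dispatch to the
  _MPIBAIJ routines.  "nlocal" and "rows" are hypothetical caller data.

      IS  is;
      Mat *submat;
      ierr = ISCreateSeq(MPI_COMM_SELF,nlocal,rows,&is); CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(C,1,&is,2); CHKERRQ(ierr);          grow by 2 levels
      ierr = MatGetSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submat); CHKERRQ(ierr);
      ierr = ISDestroy(is); CHKERRQ(ierr);
*/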

/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C     - MAT_MPIBAIJ;
      imax  - total no of index sets processed at a time;
      table - an array of char - size = Mbs bits.

     Output:
      isz   - array containing the count of the solution elements corresponding
              to each index set;
      data  - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz,
                                            int **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)B->data;
  int         start, end, val, max, rstart, cstart, *ai, *aj;
  int         *bi, *bj, *garray, i, j, k, row, *data_i, isz_i;
  char        *table_i;

  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for ( i=0; i<imax; i++ ) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for ( j=0, max=isz[i]; j<max; j++ ) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for ( k=start; k<end; k++ ) { /* Amat */
        val = aj[k] + cstart;
        if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for ( k=start; k<end; k++ ) { /* Bmat */
        val = garray[bj[k]];
        if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  return 0;
}

/*
   MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
       and return the output

     Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

     Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

     For better efficiency perhaps we should malloc separately each xdata[i],
     then if a re-malloc is required we need only copy the data for that one row
     rather than all previous rows, as it is now where a single large chunk of
     memory is used.

*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,
                                              int **xdata, int *isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)B->data;
  int         rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  int         row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  int         val, max1, max2, rank, Mbs, no_malloc = 0, *tmp, new_estimate, ctr;
  int         *rbuf_i, kmax, rbuf_0;
  char        *xtable;

  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for ( i=0,ct=0,total_sz=0; i<nrqr; ++i ) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; }
  }
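  /*
     Explanatory note (comment only): max1 below estimates the output size by
     assuming each requested block row carries roughly the average number of
     block nonzeros, (a->nz + b->nz)/Mbs, with ct the total number of rows
     requested.  The initial allocation is three times the larger of this
     estimate and the raw request size; if it still proves too small it is
     grown by a factor of 1.5 inside the loop below, and no_malloc counts how
     often that happens.
  */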

  max1         = ct*(a->nz + b->nz)/c->Mbs;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1) + 1);
  xdata[0]     = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  xtable       = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable);
  PetscMemzero(isz1,nrqr*sizeof(int));

  ct3 = 0;
  for ( i=0; i<nrqr; i++ ) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0 + 1;
    ct2    = ct1;
    ct3   += ct1;
    for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i */
      PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char));
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for ( k=0; k<kmax; k++, ct1++ ) {
        row = rbuf_i[ct1];
        if (!BT_LOOKUP(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* out of space - grow xdata[0] and re-establish the row pointers */
            new_estimate = (int)(1.5*mem_estimate) + 1;
            tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
            PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for ( k=oct2, max2=ct2; k<max2; k++ ) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( l=start; l<end; l++ ) { /* Amat */
          val = aj[l] + cstart;
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for ( l=start; l<end; l++ ) { /* Bmat */
          val = garray[bj[l]];
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  PetscFree(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",
           rank,mem_estimate,ct3,no_malloc);
  return 0;
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);

int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,
                              MatGetSubMatrixCall scall,Mat **submat)
{
  int ierr;

  /* The compression and expansion should be avoided: it does not flag
     errors and it may change the indices, hence it is buggy. */
  ierr = MatCompressIndices_MPIBAIJ(C, ismax, isrow); CHKERRQ(ierr);
  ierr = MatCompressIndices_MPIBAIJ(C, ismax, iscol); CHKERRQ(ierr);
  ierr = MatGetSubMatrices_MPIBAIJ_local(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  ierr = MatExpandIndices_MPIBAIJ(C, ismax, isrow); CHKERRQ(ierr);
  ierr = MatExpandIndices_MPIBAIJ(C, ismax, iscol); CHKERRQ(ierr);
  return 0;
}
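
/*
   Worked example of the compress/expand round trip used above (comment only,
   with a hypothetical block size): for bs = 2, MatCompressIndices_MPIBAIJ maps
   the point indices {4,5,9} to the block indices {2,4} (since 9/2 = 4), and
   MatExpandIndices_MPIBAIJ then expands {2,4} back to the point indices
   {4,5,8,9}.  The set has silently grown by the index 8, which is why the
   note above calls this approach buggy: any index set that does not contain
   whole blocks is altered, and no error is raised.
*/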

/* -------------------------------------------------------------------------*/
static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,
                                           MatGetSubMatrixCall scall,Mat **submat)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A, *submats = *submat;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int         **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
  int         nrqs,msz,**ptr,index,*req_size,*ctr,*pa,tag,*tmp,tcol,bsz,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int         *rmap_i,bs=c->bs,bs2=c->bs2;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;

  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  /* Check if the col indices are sorted */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISSorted(iscol[i],(PetscTruth*)&j);
    if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted");
  }

  len    = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int);
  irow   = (int **)PetscMalloc(len); CHKPTRQ(irow);
  icol   = irow + ismax;
  nrow   = (int *) (icol + ismax);
  ncol   = nrow + ismax;
  rtable = ncol + ismax;

  for ( i=0; i<ismax; i++ ) {
    ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr);
    ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr);
    ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for ( i=0,j=0; i<size; i++ ) {
    jmax = c->rowners[i+1];
    for ( ; j<jmax; j++ ) {
      rtable[j] = i;
    }
  }
  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;       /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;       /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;       /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for ( i=0, j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<nrqs; i++ ) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf1    = (int**) PetscMalloc(len); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) (rbuf1 + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ) {
    MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);
  }

  /* Allocate memory for outgoing messages */
  len   = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
  PetscMemzero(sbuf1,2*size*sizeof(int*));
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp   = (int *) (ptr + size);
  ctr   = tmp + 2*msz;

  {
    int *iptr = tmp, ict = 0;
    for ( i=0; i<nrqs; i++ ) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for ( i=0; i<nrqs; i++ ) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(ctr,size*sizeof(int));
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) {  /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) {       /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++ ) {  /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */
    MPI_Isend(sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);
  }

  /* Post receives to capture the buffer size */
  r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits2);
  rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *)); CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for ( i=1; i<nrqs; ++i ) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
  }
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    MPI_Irecv(rbuf2[i], w1[j], MPI_INT, j, tag+1, comm, r_waits2+i);
  }

  /* Send to other procs the buf size they should allocate */
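  /*
     Overview of the exchange from here on (comment only): each request posted
     above on "tag" is answered in three stages - the owning process first
     replies on tag+1 with the number of nonzero blocks in every requested row
     (received into rbuf2), then ships the block column indices on tag+2
     (rbuf3) and the block values on tag+3 (rbuf4).  The requesting side uses
     the tag+1 reply to size the tag+2 and tag+3 receive buffers before
     posting them.
  */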
  /* Receive messages */
  s_waits2   = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits2);
  r_status1  = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status1);
  len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  sbuf2      = (int**) PetscMalloc(len); CHKPTRQ(sbuf2);
  req_size   = (int *) (sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data;
    int         *sAi = sA->i, *sBi = sB->i, id, rstart = c->rstart;
    int         *sbuf2_i;

    for ( i=0; i<nrqr; ++i ) {
      MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);
      req_size[index] = 0;
      rbuf1_i         = rbuf1[index];
      start           = 2*rbuf1_i[0] + 1;
      MPI_Get_count(r_status1+i,MPI_INT,&end);
      sbuf2[index]    = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]);
      sbuf2_i         = sbuf2[index];
      for ( j=start; j<end; j++ ) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[index] += ncols;
      }
      req_source[index] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[index];
      for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; }
      MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag+1,comm,s_waits2+i);
    }
  }
  PetscFree(r_status1); PetscFree(r_waits1);

  /* recv buffer sizes */
  /* Receive messages */
  rbuf3     = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3);
  rbuf4     = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*)); CHKPTRQ(rbuf4);
  r_waits3  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits3);
  r_waits4  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits4);
  r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status2);

  for ( i=0; i<nrqs; ++i ) {
    MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);
    rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));
    CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar));
    CHKPTRQ(rbuf4[index]);
    MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT,
              r_status2[i].MPI_SOURCE, tag+2, comm, r_waits3+index);
    MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR,
              r_status2[i].MPI_SOURCE, tag+3, comm, r_waits4+index);
  }
  PetscFree(r_status2); PetscFree(r_waits2);

  /* Wait on sends1 and sends2 */
  s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status1);
  s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status2);

  MPI_Waitall(nrqs,s_waits1,s_status1);
  MPI_Waitall(nrqr,s_waits2,s_status2);
  PetscFree(s_status1); PetscFree(s_status2);
  PetscFree(s_waits1); PetscFree(s_waits2);

  /* Now allocate buffers for a->j, and send them off */
  sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
  for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i];
  sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
  for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits3);
  {
    int nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
    int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray;
    int *a_j = a->j, *b_j = b->j, ctmp;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1[i][2*j];
        for ( k=0; k<kmax; k++,ct1++ ) {
          row    = rbuf1_i[ct1] - cstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for ( l=0; l<nzB; l++ ) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l];
          for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag+2,comm,s_waits3+i);
    }
  }
  r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status3);
  s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status3);

  /* Allocate buffers for a->a, and send them off */
  sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
  for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i];
  sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
  for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits4);
  {
    int    nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
    int    *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray;
    int    *a_j = a->j, *b_j = b->j;
    Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1_i[2*j];
        for ( k=0; k<kmax; k++,ct1++ ) {
          row    = rbuf1_i[ct1] - cstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2;
          for ( l=0; l<nzB; l++ ) {
            if ((bmap[cworkB[l]]) < cstart)
              PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));
            else break;
          }
          imark = l;
          for ( l=0; l<nzA; l++ )
            PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar));
          for ( l=imark; l<nzB; l++ )
            PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));
          ct2 += ncols;
        }
      }
      MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag+3,comm,s_waits4+i);
    }
  }
  r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status4);
  s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status4);
  PetscFree(rbuf1);

  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;

    len     = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int);
    cmap    = (int **)PetscMalloc(len); CHKPTRQ(cmap);
    cmap[0] = (int *)(cmap + ismax);
    PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
    for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->Nbs; }
    for ( i=0; i<ismax; i++ ) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for ( j=0; j<jmax; j++ ) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }
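  /*
     Illustrative example of the column map built above (comment only): if
     iscol[i] holds the block columns {3,7,12}, then cmap[i][3] = 1,
     cmap[i][7] = 2, cmap[i][12] = 3 and every other entry is 0.  During
     assembly below, a nonzero column c is kept only when cmap_i[c] is
     nonzero, and cmap_i[c] - 1 gives its position in the submatrix.
  */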

  /* Create lens which is required for MatCreate... */
  for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int *) + j*sizeof(int);
  lens    = (int **)PetscMalloc(len); CHKPTRQ(lens);
  lens[0] = (int *)(lens + ismax);
  PetscMemzero(lens[0], j*sizeof(int));
  for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for ( i=0; i<ismax; i++ ) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      if (proc == rank) {
        for ( k=0; k<bs; k++ ) {
          ierr = MatGetRow_MPIBAIJ(C,row*bs+k,&ncols,&cols,0); CHKERRQ(ierr);
          for ( l=0; l<ncols; l++ ) {
            if (cmap_i[cols[l]/bs]) { lens_i[j]++; }
          }
          ierr = MatRestoreRow_MPIBAIJ(C,row*bs+k,&ncols,&cols,0); CHKERRQ(ierr);
        }
      }
    }
  }

  /* Create row map */
  len     = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int);
  rmap    = (int **)PetscMalloc(len); CHKPTRQ(rmap);
  rmap[0] = (int *)(rmap + ismax);
  PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));
  for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->Mbs; }
  for ( i=0; i<ismax; i++ ) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i, *rbuf3_i, *sbuf1_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for ( k=0; k<max1; k++,ct1++ ) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the row in the new matrix to be updated */
          max2 = rbuf2_i[ct1];
          for ( l=0; l<max2; l++,ct2++ ) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  PetscFree(r_status3); PetscFree(r_waits3);
  MPI_Waitall(nrqr,s_waits3,s_status3);
  PetscFree(s_status3); PetscFree(s_waits3);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for ( i=0; i<ismax; i++ ) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->m != nrow[i]) || (mat->n != ncol[i])) {
        SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size");
      }
      if (PetscMemcmp(mat->ilen,lens[i], mat->m*sizeof(int))) {
        SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialise the matrix as if empty */
      PetscMemzero(mat->ilen,mat->m*sizeof(int));
    }
  }
  else {
    *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats);
    for ( i=0; i<ismax; i++ ) {
      ierr = MatCreateSeqBAIJ(comm,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i); CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int    ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
    Scalar *imat_a;

    for ( i=0; i<ismax; i++ ) {
      mat       = (Mat_SeqBAIJ *) submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for ( j=0; j<jmax; j++ ) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIBAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for ( k=0; k<ncols; k++ ) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIBAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off-proc rows */
  {
    int    *sbuf1_i, *rbuf2_i, *rbuf3_i, *imat_ilen, ilen;
    int    *imat_j, *imat_i;
    Scalar *imat_a, *rbuf4_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqBAIJ *) submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for ( k=0; k<max1; k++, ct1++ ) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for ( l=0; l<max2; l++,ct2++ ) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  PetscFree(r_status4); PetscFree(r_waits4);
  MPI_Waitall(nrqr,s_waits4,s_status4);
  PetscFree(s_waits4); PetscFree(s_status4);

  /* Restore the indices */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  PetscFree(irow);
  PetscFree(w1);
  PetscFree(pa);

  PetscFree(sbuf1);
  PetscFree(rbuf2);
  for ( i=0; i<nrqr; ++i ) {
    PetscFree(sbuf2[i]);
  }
  for ( i=0; i<nrqs; ++i ) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree(sbuf2);
  PetscFree(rbuf3);
  PetscFree(rbuf4);
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);

  PetscFree(cmap);
  PetscFree(rmap);
  PetscFree(lens);

  for ( i=0; i<ismax; i++ ) {
    ierr = MatAssemblyBegin(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
  }

  return 0;
}