1 #ifndef lint 2 static char vcid[] = "$Id: baijov.c,v 1.2 1996/07/04 13:00:37 bsmith Exp bsmith $"; 3 #endif 4 /* 5 Routines to compute overlapping regions of a parallel MPI matrix 6 and to find submatrices that were shared across processors. 7 */ 8 #include "mpibaij.h" 9 #include "src/inline/bitarray.h" 10 11 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *); 12 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**); 13 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* ); 14 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 15 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 16 17 static int MatCompressIndices_MPIBAIJ(Mat C, int imax, IS *is) 18 { 19 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 20 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 21 char *table; 22 23 Nbs = baij->Nbs; 24 table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table); 25 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx); 26 27 for ( i=0; i<imax; i++ ) { 28 isz = 0; 29 PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char)); 30 ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr); 31 ierr = ISGetSize(is[i],&n); CHKERRQ(ierr); 32 for ( j=0; j<n ; ++j){ 33 ival = idx[j]/bs; /* convert the indices into block indices */ 34 if (ival>Nbs) SETERRQ(1,"MatIncreaseOverlap_SeqBAIJ: index greater than mat-dim"); 35 if(!BT_LOOKUP(table, ival)) { nidx[isz++] = ival;} 36 } 37 ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr); 38 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 39 ierr = ISCreateSeq(MPI_COMM_SELF, isz, nidx, (is+i)); CHKERRQ(ierr); 40 } 41 PetscFree(table); 42 PetscFree(nidx); 43 return 0; 44 } 45 46 static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is) 47 { 48 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 49 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 50 51 Nbs = baij->Nbs; 52 53 nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx); 54 55 for ( i=0; i<imax; i++ ) { 56 ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr); 57 ierr = ISGetSize(is[i],&n); CHKERRQ(ierr); 58 for ( j=0; j<n ; ++j){ 59 for ( k=0; k<bs; k++ ) 60 nidx[j*bs+k] = idx[j]*bs+k; 61 } 62 ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr); 63 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 64 ierr = ISCreateSeq(MPI_COMM_SELF, n*bs, nidx, (is+i)); CHKERRQ(ierr); 65 } 66 PetscFree(nidx); 67 return 0; 68 } 69 70 71 int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov) 72 { 73 int i, ierr; 74 /* Convert the indices into block format */ 75 ierr = MatCompressIndices_MPIBAIJ(C, imax, is); CHKERRQ(ierr); 76 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIBAIJ:Negative overlap specified\n");} 77 for ( i=0; i<ov; ++i ) { 78 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is); CHKERRQ(ierr); 79 } 80 ierr = MatExpandIndices_MPIBAIJ(C, imax, is); CHKERRQ(ierr); 81 return 0; 82 } 83 84 /* 85 Sample message format: 86 If a processor A wants processor B to process some elements corresponding 87 to index sets 1s[1], is[5] 88 mesg [0] = 2 ( no of index sets in the mesg) 89 ----------- 90 mesg [1] = 1 => is[1] 91 mesg [2] = sizeof(is[1]); 92 ----------- 93 mesg [5] = 5 => is[5] 94 mesg [6] = sizeof(is[5]); 95 ----------- 96 mesg [7] 97 mesg [n] datas[1] 98 ----------- 99 mesg[n+1] 100 mesg[m] data(is[5]) 101 ----------- 102 103 Notes: 104 nrqs - no of requests sent (or to be sent out) 105 nrqr - no of requests recieved (which have to be or which have been processed 106 */ 107 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is) 108 { 109 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 110 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 111 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 112 int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 113 char **table; 114 MPI_Comm comm; 115 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 116 MPI_Status *s_status,*recv_status; 117 118 comm = C->comm; 119 tag = C->tag; 120 size = c->size; 121 rank = c->rank; 122 Mbs = c->Mbs; 123 124 len = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int); 125 idx = (int **) PetscMalloc(len); CHKPTRQ(idx); 126 n = (int *) (idx + imax); 127 rtable = n + imax; 128 129 for ( i=0; i<imax; i++ ) { 130 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 131 ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr); 132 } 133 134 /* Create hash table for the mapping :row -> proc*/ 135 for ( i=0,j=0; i<size; i++ ) { 136 len = c->rowners[i+1]; 137 for ( ; j<len; j++ ) { 138 rtable[j] = i; 139 } 140 } 141 142 /* evaluate communication - mesg to who, length of mesg, and buffer space 143 required. Based on this, buffers are allocated, and data copied into them*/ 144 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 145 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 146 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 147 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 148 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 149 for ( i=0; i<imax; i++ ) { 150 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 151 idx_i = idx[i]; 152 len = n[i]; 153 for ( j=0; j<len; j++ ) { 154 row = idx_i[j]; 155 proc = rtable[row]; 156 w4[proc]++; 157 } 158 for ( j=0; j<size; j++ ){ 159 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 160 } 161 } 162 163 nrqs = 0; /* no of outgoing messages */ 164 msz = 0; /* total mesg length (for all proc */ 165 w1[rank] = 0; /* no mesg sent to intself */ 166 w3[rank] = 0; 167 for ( i=0; i<size; i++ ) { 168 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 169 } 170 /* pa - is list of processors to communicate with */ 171 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 172 for ( i=0,j=0; i<size; i++ ) { 173 if (w1[i]) {pa[j] = i; j++;} 174 } 175 176 /* Each message would have a header = 1 + 2*(no of IS) + data */ 177 for ( i=0; i<nrqs; i++ ) { 178 j = pa[i]; 179 w1[j] += w2[j] + 2*w3[j]; 180 msz += w1[j]; 181 } 182 183 184 /* Do a global reduction to determine how many messages to expect*/ 185 { 186 int *rw1, *rw2; 187 rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 188 rw2 = rw1+size; 189 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 190 bsz = rw1[rank]; 191 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 192 nrqr = rw2[rank]; 193 PetscFree(rw1); 194 } 195 196 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 197 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 198 rbuf = (int**) PetscMalloc(len); CHKPTRQ(rbuf); 199 rbuf[0] = (int *) (rbuf + nrqr); 200 for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz; 201 202 /* Post the receives */ 203 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 204 CHKPTRQ(r_waits1); 205 for ( i=0; i<nrqr; ++i ){ 206 MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 207 } 208 209 /* Allocate Memory for outgoing messages */ 210 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 211 outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat); 212 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 213 PetscMemzero(outdat,2*size*sizeof(int*)); 214 tmp = (int *) (outdat + 2*size); 215 ctr = tmp + msz; 216 217 { 218 int *iptr = tmp,ict = 0; 219 for ( i=0; i<nrqs; i++ ) { 220 j = pa[i]; 221 iptr += ict; 222 outdat[j] = iptr; 223 ict = w1[j]; 224 } 225 } 226 227 /* Form the outgoing messages */ 228 /*plug in the headers*/ 229 for ( i=0; i<nrqs; i++ ) { 230 j = pa[i]; 231 outdat[j][0] = 0; 232 PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int)); 233 ptr[j] = outdat[j] + 2*w3[j] + 1; 234 } 235 236 /* Memory for doing local proc's work*/ 237 { 238 int *d_p; 239 char *t_p; 240 241 len = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) + 242 (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char); 243 table = (char **)PetscMalloc(len); CHKPTRQ(table); 244 data = (int **)(table + imax); 245 data[0] = (int *)(data + imax); 246 isz = data[0] + (Mbs+1)*imax; 247 table[0] = (char *)(isz + imax); 248 d_p = data[0]; t_p = table[0]; 249 for ( i=1; i<imax; i++ ) { 250 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 251 data[i] = d_p + (Mbs+1)*i; 252 } 253 } 254 PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax); 255 PetscMemzero(isz,imax*sizeof(int)); 256 257 /* Parse the IS and update local tables and the outgoing buf with the data*/ 258 { 259 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 260 char *table_i; 261 262 for ( i=0; i<imax; i++ ) { 263 PetscMemzero(ctr,size*sizeof(int)); 264 n_i = n[i]; 265 table_i = table[i]; 266 idx_i = idx[i]; 267 data_i = data[i]; 268 isz_i = isz[i]; 269 for ( j=0; j<n_i; j++ ) { /* parse the indices of each IS */ 270 row = idx_i[j]; 271 proc = rtable[row]; 272 if (proc != rank) { /* copy to the outgoing buffer */ 273 ctr[proc]++; 274 *ptr[proc] = row; 275 ptr[proc]++; 276 } 277 else { /* Update the local table */ 278 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 279 } 280 } 281 /* Update the headers for the current IS */ 282 for ( j=0; j<size; j++ ) { /* Can Optimise this loop by using pa[] */ 283 if ((ctr_j = ctr[j])) { 284 outdat_j = outdat[j]; 285 k = ++outdat_j[0]; 286 outdat_j[2*k] = ctr_j; 287 outdat_j[2*k-1] = i; 288 } 289 } 290 isz[i] = isz_i; 291 } 292 } 293 294 295 296 /* Now post the sends */ 297 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 298 CHKPTRQ(s_waits1); 299 for ( i=0; i<nrqs; ++i ) { 300 j = pa[i]; 301 MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 302 } 303 304 /* No longer need the original indices*/ 305 for ( i=0; i<imax; ++i ) { 306 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 307 } 308 PetscFree(idx); 309 310 for ( i=0; i<imax; ++i ) { 311 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 312 } 313 314 /* Do Local work*/ 315 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 316 317 /* Receive messages*/ 318 { 319 int index; 320 321 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 322 CHKPTRQ(recv_status); 323 for ( i=0; i<nrqr; ++i ) { 324 MPI_Waitany(nrqr, r_waits1, &index, recv_status+i); 325 } 326 327 s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 328 CHKPTRQ(s_status); 329 MPI_Waitall(nrqs,s_waits1,s_status); 330 } 331 332 /* Phase 1 sends are complete - deallocate buffers */ 333 PetscFree(outdat); 334 PetscFree(w1); 335 336 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 337 isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1); 338 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 339 PetscFree(rbuf); 340 341 /* Send the data back*/ 342 /* Do a global reduction to know the buffer space req for incoming messages*/ 343 { 344 int *rw1, *rw2; 345 346 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 347 PetscMemzero(rw1,2*size*sizeof(int)); 348 rw2 = rw1+size; 349 for ( i=0; i<nrqr; ++i ) { 350 proc = recv_status[i].MPI_SOURCE; 351 rw1[proc] = isz1[i]; 352 } 353 354 MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm); 355 bsz1 = rw2[rank]; 356 PetscFree(rw1); 357 } 358 359 /* Allocate buffers*/ 360 361 /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 362 len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 363 rbuf2 = (int**) PetscMalloc(len); CHKPTRQ(rbuf2); 364 rbuf2[0] = (int *) (rbuf2 + nrqs); 365 for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1; 366 367 /* Post the receives */ 368 r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 369 CHKPTRQ(r_waits2); 370 for ( i=0; i<nrqs; ++i ) { 371 MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i); 372 } 373 374 /* Now post the sends */ 375 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 376 CHKPTRQ(s_waits2); 377 for ( i=0; i<nrqr; ++i ) { 378 j = recv_status[i].MPI_SOURCE; 379 MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i); 380 } 381 382 /* receive work done on other processors*/ 383 { 384 int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 385 char *table_i; 386 MPI_Status *status2; 387 388 status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 389 390 for ( i=0; i<nrqs; ++i ) { 391 MPI_Waitany(nrqs, r_waits2, &index, status2+i); 392 /* Process the message*/ 393 rbuf2_i = rbuf2[index]; 394 ct1 = 2*rbuf2_i[0]+1; 395 jmax = rbuf2[index][0]; 396 for ( j=1; j<=jmax; j++ ) { 397 max = rbuf2_i[2*j]; 398 is_no = rbuf2_i[2*j-1]; 399 isz_i = isz[is_no]; 400 data_i = data[is_no]; 401 table_i = table[is_no]; 402 for ( k=0; k<max; k++,ct1++ ) { 403 row = rbuf2_i[ct1]; 404 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 405 } 406 isz[is_no] = isz_i; 407 } 408 } 409 MPI_Waitall(nrqr,s_waits2,status2); 410 PetscFree(status2); 411 } 412 413 for ( i=0; i<imax; ++i ) { 414 ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 415 } 416 417 PetscFree(pa); 418 PetscFree(rbuf2); 419 PetscFree(s_waits1); 420 PetscFree(r_waits1); 421 PetscFree(s_waits2); 422 PetscFree(r_waits2); 423 PetscFree(table); 424 PetscFree(s_status); 425 PetscFree(recv_status); 426 PetscFree(xdata[0]); 427 PetscFree(xdata); 428 PetscFree(isz1); 429 return 0; 430 } 431 432 /* 433 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 434 the work on the local processor. 435 436 Inputs: 437 C - MAT_MPIBAIJ; 438 imax - total no of index sets processed at a time; 439 table - an array of char - size = Mbs bits. 440 441 Output: 442 isz - array containing the count of the solution elements correspondign 443 to each index set; 444 data - pointer to the solutions 445 */ 446 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz, 447 int **data) 448 { 449 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 450 Mat A = c->A, B = c->B; 451 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 452 int start, end, val, max, rstart,cstart,*ai, *aj; 453 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 454 char *table_i; 455 456 rstart = c->rstart; 457 cstart = c->cstart; 458 ai = a->i; 459 aj = a->j; 460 bi = b->i; 461 bj = b->j; 462 garray = c->garray; 463 464 465 for ( i=0; i<imax; i++ ) { 466 data_i = data[i]; 467 table_i = table[i]; 468 isz_i = isz[i]; 469 for ( j=0, max=isz[i]; j<max; j++ ) { 470 row = data_i[j] - rstart; 471 start = ai[row]; 472 end = ai[row+1]; 473 for ( k=start; k<end; k++ ) { /* Amat */ 474 val = aj[k] + cstart; 475 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 476 } 477 start = bi[row]; 478 end = bi[row+1]; 479 for ( k=start; k<end; k++ ) { /* Bmat */ 480 val = garray[bj[k]]; 481 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 482 } 483 } 484 isz[i] = isz_i; 485 } 486 return 0; 487 } 488 /* 489 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 490 and return the output 491 492 Input: 493 C - the matrix 494 nrqr - no of messages being processed. 495 rbuf - an array of pointers to the recieved requests 496 497 Output: 498 xdata - array of messages to be sent back 499 isz1 - size of each message 500 501 For better efficiency perhaps we should malloc seperately each xdata[i], 502 then if a remalloc is required we need only copy the data for that one row 503 rather then all previous rows as it is now where a single large chunck of 504 memory is used. 505 506 */ 507 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf, 508 int **xdata, int * isz1) 509 { 510 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 511 Mat A = c->A, B = c->B; 512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 513 int rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k; 514 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 515 int val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr; 516 int *rbuf_i,kmax,rbuf_0; 517 char *xtable; 518 519 rank = c->rank; 520 Mbs = c->Mbs; 521 rstart = c->rstart; 522 cstart = c->cstart; 523 ai = a->i; 524 aj = a->j; 525 bi = b->i; 526 bj = b->j; 527 garray = c->garray; 528 529 530 for ( i=0,ct=0,total_sz=0; i<nrqr; ++i ) { 531 rbuf_i = rbuf[i]; 532 rbuf_0 = rbuf_i[0]; 533 ct += rbuf_0; 534 for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; } 535 } 536 537 max1 = ct*(a->nz +b->nz)/c->Mbs; 538 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 539 xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]); 540 ++no_malloc; 541 xtable = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable); 542 PetscMemzero(isz1,nrqr*sizeof(int)); 543 544 ct3 = 0; 545 for ( i=0; i<nrqr; i++ ) { /* for easch mesg from proc i */ 546 rbuf_i = rbuf[i]; 547 rbuf_0 = rbuf_i[0]; 548 ct1 = 2*rbuf_0+1; 549 ct2 = ct1; 550 ct3 += ct1; 551 for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i*/ 552 PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char)); 553 oct2 = ct2; 554 kmax = rbuf_i[2*j]; 555 for ( k=0; k<kmax; k++, ct1++ ) { 556 row = rbuf_i[ct1]; 557 if (!BT_LOOKUP(xtable,row)) { 558 if (!(ct3 < mem_estimate)) { 559 new_estimate = (int)(1.5*mem_estimate)+1; 560 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 561 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 562 PetscFree(xdata[0]); 563 xdata[0] = tmp; 564 mem_estimate = new_estimate; ++no_malloc; 565 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 566 } 567 xdata[i][ct2++] = row; 568 ct3++; 569 } 570 } 571 for ( k=oct2,max2=ct2; k<max2; k++ ) { 572 row = xdata[i][k] - rstart; 573 start = ai[row]; 574 end = ai[row+1]; 575 for ( l=start; l<end; l++ ) { 576 val = aj[l] + cstart; 577 if (!BT_LOOKUP(xtable,val)) { 578 if (!(ct3 < mem_estimate)) { 579 new_estimate = (int)(1.5*mem_estimate)+1; 580 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 581 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 582 PetscFree(xdata[0]); 583 xdata[0] = tmp; 584 mem_estimate = new_estimate; ++no_malloc; 585 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 586 } 587 xdata[i][ct2++] = val; 588 ct3++; 589 } 590 } 591 start = bi[row]; 592 end = bi[row+1]; 593 for ( l=start; l<end; l++ ) { 594 val = garray[bj[l]]; 595 if (!BT_LOOKUP(xtable,val)) { 596 if (!(ct3 < mem_estimate)) { 597 new_estimate = (int)(1.5*mem_estimate)+1; 598 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 599 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 600 PetscFree(xdata[0]); 601 xdata[0] = tmp; 602 mem_estimate = new_estimate; ++no_malloc; 603 for ( ctr =1; ctr <=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 604 } 605 xdata[i][ct2++] = val; 606 ct3++; 607 } 608 } 609 } 610 /* Update the header*/ 611 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 612 xdata[i][2*j-1] = rbuf_i[2*j-1]; 613 } 614 xdata[i][0] = rbuf_0; 615 xdata[i+1] = xdata[i] + ct2; 616 isz1[i] = ct2; /* size of each message */ 617 } 618 PetscFree(xtable); 619 PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 620 return 0; 621 } 622 623 /* -------------------------------------------------------------------------*/ 624 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 625 MatGetSubMatrixCall scall,Mat **submat) 626 { 627 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 628 Mat A = c->A,*submats = *submat; 629 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat; 630 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 631 int **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 632 int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,tag,*tmp,tcol,bsz,nrqr; 633 int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 634 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 635 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 636 int *rmap_i; 637 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 638 MPI_Request *r_waits4,*s_waits3,*s_waits4; 639 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 640 MPI_Status *r_status3,*r_status4,*s_status4; 641 MPI_Comm comm; 642 Scalar **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i; 643 644 comm = C->comm; 645 tag = C->tag; 646 size = c->size; 647 rank = c->rank; 648 m = c->M; 649 650 /* Check if the col indices are sorted */ 651 for ( i=0; i<ismax; i++ ) { 652 ierr = ISSorted(iscol[i],(PetscTruth*)&j); 653 if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted"); 654 } 655 656 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int); 657 irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 658 icol = irow + ismax; 659 nrow = (int *) (icol + ismax); 660 ncol = nrow + ismax; 661 rtable = ncol + ismax; 662 663 for ( i=0; i<ismax; i++ ) { 664 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 665 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 666 ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 667 ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 668 } 669 670 /* Create hash table for the mapping :row -> proc*/ 671 for ( i=0,j=0; i<size; i++ ) { 672 jmax = c->rowners[i+1]; 673 for ( ; j<jmax; j++ ) { 674 rtable[j] = i; 675 } 676 } 677 678 /* evaluate communication - mesg to who, length of mesg, and buffer space 679 required. Based on this, buffers are allocated, and data copied into them*/ 680 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 681 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 682 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 683 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 684 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 685 for ( i=0; i<ismax; i++ ) { 686 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 687 jmax = nrow[i]; 688 irow_i = irow[i]; 689 for ( j=0; j<jmax; j++ ) { 690 row = irow_i[j]; 691 proc = rtable[row]; 692 w4[proc]++; 693 } 694 for ( j=0; j<size; j++ ) { 695 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 696 } 697 } 698 699 nrqs = 0; /* no of outgoing messages */ 700 msz = 0; /* total mesg length (for all proc */ 701 w1[rank] = 0; /* no mesg sent to intself */ 702 w3[rank] = 0; 703 for ( i=0; i<size; i++ ) { 704 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 705 } 706 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 707 for ( i=0, j=0; i<size; i++ ) { 708 if (w1[i]) { pa[j] = i; j++; } 709 } 710 711 /* Each message would have a header = 1 + 2*(no of IS) + data */ 712 for ( i=0; i<nrqs; i++ ) { 713 j = pa[i]; 714 w1[j] += w2[j] + 2* w3[j]; 715 msz += w1[j]; 716 } 717 /* Do a global reduction to determine how many messages to expect*/ 718 { 719 int *rw1, *rw2; 720 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 721 rw2 = rw1+size; 722 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 723 bsz = rw1[rank]; 724 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 725 nrqr = rw2[rank]; 726 PetscFree(rw1); 727 } 728 729 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 730 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 731 rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 732 rbuf1[0] = (int *) (rbuf1 + nrqr); 733 for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz; 734 735 /* Post the receives */ 736 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 737 CHKPTRQ(r_waits1); 738 for ( i=0; i<nrqr; ++i ) { 739 MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 740 } 741 742 /* Allocate Memory for outgoing messages */ 743 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 744 sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 745 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 746 PetscMemzero(sbuf1,2*size*sizeof(int*)); 747 /* allocate memory for outgoing data + buf to receive the first reply */ 748 tmp = (int *) (ptr + size); 749 ctr = tmp + 2*msz; 750 751 { 752 int *iptr = tmp,ict = 0; 753 for ( i=0; i<nrqs; i++ ) { 754 j = pa[i]; 755 iptr += ict; 756 sbuf1[j] = iptr; 757 ict = w1[j]; 758 } 759 } 760 761 /* Form the outgoing messages */ 762 /* Initialise the header space */ 763 for ( i=0; i<nrqs; i++ ) { 764 j = pa[i]; 765 sbuf1[j][0] = 0; 766 PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 767 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 768 } 769 770 /* Parse the isrow and copy data into outbuf */ 771 for ( i=0; i<ismax; i++ ) { 772 PetscMemzero(ctr,size*sizeof(int)); 773 irow_i = irow[i]; 774 jmax = nrow[i]; 775 for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */ 776 row = irow_i[j]; 777 proc = rtable[row]; 778 if (proc != rank) { /* copy to the outgoing buf*/ 779 ctr[proc]++; 780 *ptr[proc] = row; 781 ptr[proc]++; 782 } 783 } 784 /* Update the headers for the current IS */ 785 for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */ 786 if ((ctr_j = ctr[j])) { 787 sbuf1_j = sbuf1[j]; 788 k = ++sbuf1_j[0]; 789 sbuf1_j[2*k] = ctr_j; 790 sbuf1_j[2*k-1] = i; 791 } 792 } 793 } 794 795 /* Now post the sends */ 796 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 797 CHKPTRQ(s_waits1); 798 for ( i=0; i<nrqs; ++i ) { 799 j = pa[i]; 800 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 801 MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 802 } 803 804 /* Post Recieves to capture the buffer size */ 805 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 806 CHKPTRQ(r_waits2); 807 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 808 rbuf2[0] = tmp + msz; 809 for ( i=1; i<nrqs; ++i ) { 810 j = pa[i]; 811 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 812 } 813 for ( i=0; i<nrqs; ++i ) { 814 j = pa[i]; 815 MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag+1, comm, r_waits2+i); 816 } 817 818 /* Send to other procs the buf size they should allocate */ 819 820 821 /* Receive messages*/ 822 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 823 CHKPTRQ(s_waits2); 824 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 825 CHKPTRQ(r_status1); 826 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 827 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 828 req_size = (int *) (sbuf2 + nrqr); 829 req_source = req_size + nrqr; 830 831 { 832 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 833 int *sAi = sA->i, *sBi = sB->i, id, rstart = c->rstart; 834 int *sbuf2_i; 835 836 for ( i=0; i<nrqr; ++i ) { 837 MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 838 req_size[index] = 0; 839 rbuf1_i = rbuf1[index]; 840 start = 2*rbuf1_i[0] + 1; 841 MPI_Get_count(r_status1+i,MPI_INT, &end); 842 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 843 sbuf2_i = sbuf2[index]; 844 for ( j=start; j<end; j++ ) { 845 id = rbuf1_i[j] - rstart; 846 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 847 sbuf2_i[j] = ncols; 848 req_size[index] += ncols; 849 } 850 req_source[index] = r_status1[i].MPI_SOURCE; 851 /* form the header */ 852 sbuf2_i[0] = req_size[index]; 853 for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; } 854 MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag+1,comm,s_waits2+i); 855 } 856 } 857 PetscFree(r_status1); PetscFree(r_waits1); 858 859 /* recv buffer sizes */ 860 /* Receive messages*/ 861 862 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 863 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 864 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 865 CHKPTRQ(r_waits3); 866 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 867 CHKPTRQ(r_waits4); 868 r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 869 CHKPTRQ(r_status2); 870 871 for ( i=0; i<nrqs; ++i ) { 872 MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 873 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 874 CHKPTRQ(rbuf3[index]); 875 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar)); 876 CHKPTRQ(rbuf4[index]); 877 MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 878 r_status2[i].MPI_SOURCE, tag+2, comm, r_waits3+index); 879 MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR, 880 r_status2[i].MPI_SOURCE, tag+3, comm, r_waits4+index); 881 } 882 PetscFree(r_status2); PetscFree(r_waits2); 883 884 /* Wait on sends1 and sends2 */ 885 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 886 CHKPTRQ(s_status1); 887 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 888 CHKPTRQ(s_status2); 889 890 MPI_Waitall(nrqs,s_waits1,s_status1); 891 MPI_Waitall(nrqr,s_waits2,s_status2); 892 PetscFree(s_status1); PetscFree(s_status2); 893 PetscFree(s_waits1); PetscFree(s_waits2); 894 895 /* Now allocate buffers for a->j, and send them off */ 896 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 897 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 898 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 899 for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 900 901 s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 902 CHKPTRQ(s_waits3); 903 { 904 int nzA, nzB, *a_i = a->i, *b_i = b->i, imark; 905 int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray; 906 int *a_j = a->j, *b_j = b->j,ctmp; 907 908 for ( i=0; i<nrqr; i++ ) { 909 rbuf1_i = rbuf1[i]; 910 sbuf_aj_i = sbuf_aj[i]; 911 ct1 = 2*rbuf1_i[0] + 1; 912 ct2 = 0; 913 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 914 kmax = rbuf1[i][2*j]; 915 for ( k=0; k<kmax; k++,ct1++ ) { 916 row = rbuf1_i[ct1] - cstart; 917 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 918 ncols = nzA + nzB; 919 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 920 921 /* load the column indices for this row into cols*/ 922 cols = sbuf_aj_i + ct2; 923 for ( l=0; l<nzB; l++ ) { 924 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 925 else break; 926 } 927 imark = l; 928 for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l]; 929 for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]]; 930 ct2 += ncols; 931 } 932 } 933 MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag+2,comm,s_waits3+i); 934 } 935 } 936 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 937 CHKPTRQ(r_status3); 938 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 939 CHKPTRQ(s_status3); 940 941 /* Allocate buffers for a->a, and send them off */ 942 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 943 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 944 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 945 for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 946 947 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 948 CHKPTRQ(s_waits4); 949 { 950 int nzA, nzB, *a_i = a->i, *b_i = b->i, imark; 951 int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray; 952 int *a_j = a->j, *b_j = b->j; 953 Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a; 954 955 for ( i=0; i<nrqr; i++ ) { 956 rbuf1_i = rbuf1[i]; 957 sbuf_aa_i = sbuf_aa[i]; 958 ct1 = 2*rbuf1_i[0]+1; 959 ct2 = 0; 960 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 961 kmax = rbuf1_i[2*j]; 962 for ( k=0; k<kmax; k++,ct1++ ) { 963 row = rbuf1_i[ct1] - cstart; 964 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 965 ncols = nzA + nzB; 966 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 967 vworkA = a_a + a_i[row]; vworkB = b_a + b_i[row]; 968 969 /* load the column values for this row into vals*/ 970 vals = sbuf_aa_i+ct2; 971 for ( l=0; l<nzB; l++ ) { 972 if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l]; 973 else break; 974 } 975 imark = l; 976 for ( l=0; l<nzA; l++ ) vals[imark+l] = vworkA[l]; 977 for ( l=imark; l<nzB; l++ ) vals[nzA+l] = vworkB[l]; 978 ct2 += ncols; 979 } 980 } 981 MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag+3,comm,s_waits4+i); 982 } 983 } 984 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 985 CHKPTRQ(r_status4); 986 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 987 CHKPTRQ(s_status4); 988 PetscFree(rbuf1); 989 990 /* Form the matrix */ 991 /* create col map */ 992 { 993 int *icol_i; 994 995 len = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int); 996 cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 997 cmap[0] = (int *)(cmap + ismax); 998 PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int)); 999 for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; } 1000 for ( i=0; i<ismax; i++ ) { 1001 jmax = ncol[i]; 1002 icol_i = icol[i]; 1003 cmap_i = cmap[i]; 1004 for ( j=0; j<jmax; j++ ) { 1005 cmap_i[icol_i[j]] = j+1; 1006 } 1007 } 1008 } 1009 1010 1011 /* Create lens which is required for MatCreate... */ 1012 for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; } 1013 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1014 lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 1015 lens[0] = (int *)(lens + ismax); 1016 PetscMemzero(lens[0], j*sizeof(int)); 1017 for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; } 1018 1019 /* Update lens from local data */ 1020 for ( i=0; i<ismax; i++ ) { 1021 jmax = nrow[i]; 1022 cmap_i = cmap[i]; 1023 irow_i = irow[i]; 1024 lens_i = lens[i]; 1025 for ( j=0; j<jmax; j++ ) { 1026 row = irow_i[j]; 1027 proc = rtable[row]; 1028 if (proc == rank) { 1029 ierr = MatGetRow_MPIBAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr); 1030 for ( k=0; k<ncols; k++ ) { 1031 if (cmap_i[cols[k]]) { lens_i[j]++;} 1032 } 1033 ierr = MatRestoreRow_MPIBAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr); 1034 } 1035 } 1036 } 1037 1038 /* Create row map*/ 1039 len = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int); 1040 rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1041 rmap[0] = (int *)(rmap + ismax); 1042 PetscMemzero(rmap[0],ismax*c->M*sizeof(int)); 1043 for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M;} 1044 for ( i=0; i<ismax; i++ ) { 1045 rmap_i = rmap[i]; 1046 irow_i = irow[i]; 1047 jmax = nrow[i]; 1048 for ( j=0; j<jmax; j++ ) { 1049 rmap_i[irow_i[j]] = j; 1050 } 1051 } 1052 1053 /* Update lens from offproc data */ 1054 { 1055 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1056 1057 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1058 MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1059 index = pa[i]; 1060 sbuf1_i = sbuf1[index]; 1061 jmax = sbuf1_i[0]; 1062 ct1 = 2*jmax+1; 1063 ct2 = 0; 1064 rbuf2_i = rbuf2[i]; 1065 rbuf3_i = rbuf3[i]; 1066 for ( j=1; j<=jmax; j++ ) { 1067 is_no = sbuf1_i[2*j-1]; 1068 max1 = sbuf1_i[2*j]; 1069 lens_i = lens[is_no]; 1070 cmap_i = cmap[is_no]; 1071 rmap_i = rmap[is_no]; 1072 for ( k=0; k<max1; k++,ct1++ ) { 1073 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1074 max2 = rbuf2_i[ct1]; 1075 for ( l=0; l<max2; l++,ct2++ ) { 1076 if (cmap_i[rbuf3_i[ct2]]) { 1077 lens_i[row]++; 1078 } 1079 } 1080 } 1081 } 1082 } 1083 } 1084 PetscFree(r_status3); PetscFree(r_waits3); 1085 MPI_Waitall(nrqr,s_waits3,s_status3); 1086 PetscFree(s_status3); PetscFree(s_waits3); 1087 1088 /* Create the submatrices */ 1089 if (scall == MAT_REUSE_MATRIX) { 1090 /* 1091 Assumes new rows are same length as the old rows, hence bug! 1092 */ 1093 for ( i=0; i<ismax; i++ ) { 1094 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1095 if ((mat->m != nrow[i]) || (mat->n != ncol[i])) { 1096 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size"); 1097 } 1098 if (PetscMemcmp(mat->ilen,lens[i], mat->m *sizeof(int))) { 1099 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1100 } 1101 /* Initial matrix as if empty */ 1102 PetscMemzero(mat->ilen,mat->m*sizeof(int)); 1103 } 1104 } 1105 else { 1106 *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1107 for ( i=0; i<ismax; i++ ) { 1108 ierr = MatCreateSeqBAIJ(comm,a->bs,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr); 1109 } 1110 } 1111 1112 /* Assemble the matrices */ 1113 /* First assemble the local rows */ 1114 { 1115 int ilen_row,*imat_ilen, *imat_j, *imat_i,old_row; 1116 Scalar *imat_a; 1117 1118 for ( i=0; i<ismax; i++ ) { 1119 mat = (Mat_SeqBAIJ *) submats[i]->data; 1120 imat_ilen = mat->ilen; 1121 imat_j = mat->j; 1122 imat_i = mat->i; 1123 imat_a = mat->a; 1124 cmap_i = cmap[i]; 1125 rmap_i = rmap[i]; 1126 irow_i = irow[i]; 1127 jmax = nrow[i]; 1128 for ( j=0; j<jmax; j++ ) { 1129 row = irow_i[j]; 1130 proc = rtable[row]; 1131 if (proc == rank) { 1132 old_row = row; 1133 row = rmap_i[row]; 1134 ilen_row = imat_ilen[row]; 1135 ierr = MatGetRow_MPIBAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr); 1136 mat_i = imat_i[row]; 1137 mat_a = imat_a + mat_i; 1138 mat_j = imat_j + mat_i; 1139 for ( k=0; k<ncols; k++ ) { 1140 if ((tcol = cmap_i[cols[k]])) { 1141 *mat_j++ = tcol - 1; 1142 *mat_a++ = vals[k]; 1143 ilen_row++; 1144 } 1145 } 1146 ierr = MatRestoreRow_MPIBAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr); 1147 imat_ilen[row] = ilen_row; 1148 } 1149 1150 } 1151 } 1152 } 1153 1154 /* Now assemble the off proc rows*/ 1155 { 1156 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1157 int *imat_j,*imat_i; 1158 Scalar *imat_a,*rbuf4_i; 1159 1160 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1161 MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1162 index = pa[i]; 1163 sbuf1_i = sbuf1[index]; 1164 jmax = sbuf1_i[0]; 1165 ct1 = 2*jmax + 1; 1166 ct2 = 0; 1167 rbuf2_i = rbuf2[i]; 1168 rbuf3_i = rbuf3[i]; 1169 rbuf4_i = rbuf4[i]; 1170 for ( j=1; j<=jmax; j++ ) { 1171 is_no = sbuf1_i[2*j-1]; 1172 rmap_i = rmap[is_no]; 1173 cmap_i = cmap[is_no]; 1174 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1175 imat_ilen = mat->ilen; 1176 imat_j = mat->j; 1177 imat_i = mat->i; 1178 imat_a = mat->a; 1179 max1 = sbuf1_i[2*j]; 1180 for ( k=0; k<max1; k++, ct1++ ) { 1181 row = sbuf1_i[ct1]; 1182 row = rmap_i[row]; 1183 ilen = imat_ilen[row]; 1184 mat_i = imat_i[row]; 1185 mat_a = imat_a + mat_i; 1186 mat_j = imat_j + mat_i; 1187 max2 = rbuf2_i[ct1]; 1188 for ( l=0; l<max2; l++,ct2++ ) { 1189 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1190 *mat_j++ = tcol - 1; 1191 *mat_a++ = rbuf4_i[ct2]; 1192 ilen++; 1193 } 1194 } 1195 imat_ilen[row] = ilen; 1196 } 1197 } 1198 } 1199 } 1200 PetscFree(r_status4); PetscFree(r_waits4); 1201 MPI_Waitall(nrqr,s_waits4,s_status4); 1202 PetscFree(s_waits4); PetscFree(s_status4); 1203 1204 /* Restore the indices */ 1205 for ( i=0; i<ismax; i++ ) { 1206 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1207 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1208 } 1209 1210 /* Destroy allocated memory */ 1211 PetscFree(irow); 1212 PetscFree(w1); 1213 PetscFree(pa); 1214 1215 PetscFree(sbuf1); 1216 PetscFree(rbuf2); 1217 for ( i=0; i<nrqr; ++i ) { 1218 PetscFree(sbuf2[i]); 1219 } 1220 for ( i=0; i<nrqs; ++i ) { 1221 PetscFree(rbuf3[i]); 1222 PetscFree(rbuf4[i]); 1223 } 1224 1225 PetscFree(sbuf2); 1226 PetscFree(rbuf3); 1227 PetscFree(rbuf4 ); 1228 PetscFree(sbuf_aj[0]); 1229 PetscFree(sbuf_aj); 1230 PetscFree(sbuf_aa[0]); 1231 PetscFree(sbuf_aa); 1232 1233 PetscFree(cmap); 1234 PetscFree(rmap); 1235 PetscFree(lens); 1236 1237 for ( i=0; i<ismax; i++ ) { 1238 ierr = MatAssemblyBegin(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1239 ierr = MatAssemblyEnd(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1240 } 1241 1242 return 0; 1243 } 1244