1 #ifndef lint 2 static char vcid[] = "$Id: baijov.c,v 1.5 1996/07/09 19:02:16 balay Exp balay $"; 3 #endif 4 /* 5 Routines to compute overlapping regions of a parallel MPI matrix 6 and to find submatrices that were shared across processors. 7 */ 8 #include "mpibaij.h" 9 #include "src/inline/bitarray.h" 10 11 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *); 12 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**); 13 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* ); 14 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 15 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 16 17 static int MatCompressIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 18 { 19 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 20 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 21 char *table; 22 23 Nbs = baij->Nbs; 24 table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table); 25 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx); 26 27 for ( i=0; i<imax; i++ ) { 28 isz = 0; 29 PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char)); 30 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 31 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 32 for ( j=0; j<n ; ++j){ 33 ival = idx[j]/bs; /* convert the indices into block indices */ 34 if (ival>Nbs) SETERRQ(1,"MatIncreaseOverlap_SeqBAIJ: index greater than mat-dim"); 35 if(!BT_LOOKUP(table, ival)) { nidx[isz++] = ival;} 36 } 37 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 38 ierr = ISCreateSeq(MPI_COMM_SELF, isz, nidx, (is_out+i)); CHKERRQ(ierr); 39 } 40 PetscFree(table); 41 PetscFree(nidx); 42 return 0; 43 } 44 45 static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 46 { 47 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 48 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 49 50 Nbs = baij->Nbs; 51 52 nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx); 53 54 for ( i=0; i<imax; i++ ) { 55 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 56 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 57 for ( j=0; j<n ; ++j){ 58 for ( k=0; k<bs; k++ ) 59 nidx[j*bs+k] = idx[j]*bs+k; 60 } 61 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 62 ierr = ISCreateSeq(MPI_COMM_SELF, n*bs, nidx, (is_out+i)); CHKERRQ(ierr); 63 } 64 PetscFree(nidx); 65 return 0; 66 } 67 68 69 int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov) 70 { 71 int i, ierr; 72 IS *is_new; 73 74 is_new = (IS *)PetscMalloc(imax*sizeof(IS)); CHKPTRQ(is_new); 75 /* Convert the indices into block format */ 76 ierr = MatCompressIndices_MPIBAIJ(C, imax, is,is_new); CHKERRQ(ierr); 77 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIBAIJ:Negative overlap specified\n");} 78 for ( i=0; i<ov; ++i ) { 79 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is_new); CHKERRQ(ierr); 80 } 81 for ( i=0; i<imax; i++ ) ISDestroy(is[i]); 82 ierr = MatExpandIndices_MPIBAIJ(C, imax, is_new,is); CHKERRQ(ierr); 83 for ( i=0; i<imax; i++ ) ISDestroy(is_new[i]); 84 PetscFree(is_new); 85 return 0; 86 } 87 88 /* 89 Sample message format: 90 If a processor A wants processor B to process some elements corresponding 91 to index sets 1s[1], is[5] 92 mesg [0] = 2 ( no of index sets in the mesg) 93 ----------- 94 mesg [1] = 1 => is[1] 95 mesg [2] = sizeof(is[1]); 96 ----------- 97 mesg [5] = 5 => is[5] 98 mesg [6] = sizeof(is[5]); 99 ----------- 100 mesg [7] 101 mesg [n] datas[1] 102 ----------- 103 mesg[n+1] 104 mesg[m] data(is[5]) 105 ----------- 106 107 Notes: 108 nrqs - no of requests sent (or to be sent out) 109 nrqr - no of requests recieved (which have to be or which have been processed 110 */ 111 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is) 112 { 113 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 114 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 115 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 116 int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 117 char **table; 118 MPI_Comm comm; 119 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 120 MPI_Status *s_status,*recv_status; 121 122 comm = C->comm; 123 tag = C->tag; 124 size = c->size; 125 rank = c->rank; 126 Mbs = c->Mbs; 127 128 len = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int); 129 idx = (int **) PetscMalloc(len); CHKPTRQ(idx); 130 n = (int *) (idx + imax); 131 rtable = n + imax; 132 133 for ( i=0; i<imax; i++ ) { 134 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 135 ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr); 136 } 137 138 /* Create hash table for the mapping :row -> proc*/ 139 for ( i=0,j=0; i<size; i++ ) { 140 len = c->rowners[i+1]; 141 for ( ; j<len; j++ ) { 142 rtable[j] = i; 143 } 144 } 145 146 /* evaluate communication - mesg to who, length of mesg, and buffer space 147 required. Based on this, buffers are allocated, and data copied into them*/ 148 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 149 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 150 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 151 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 152 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 153 for ( i=0; i<imax; i++ ) { 154 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 155 idx_i = idx[i]; 156 len = n[i]; 157 for ( j=0; j<len; j++ ) { 158 row = idx_i[j]; 159 proc = rtable[row]; 160 w4[proc]++; 161 } 162 for ( j=0; j<size; j++ ){ 163 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 164 } 165 } 166 167 nrqs = 0; /* no of outgoing messages */ 168 msz = 0; /* total mesg length (for all proc */ 169 w1[rank] = 0; /* no mesg sent to intself */ 170 w3[rank] = 0; 171 for ( i=0; i<size; i++ ) { 172 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 173 } 174 /* pa - is list of processors to communicate with */ 175 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 176 for ( i=0,j=0; i<size; i++ ) { 177 if (w1[i]) {pa[j] = i; j++;} 178 } 179 180 /* Each message would have a header = 1 + 2*(no of IS) + data */ 181 for ( i=0; i<nrqs; i++ ) { 182 j = pa[i]; 183 w1[j] += w2[j] + 2*w3[j]; 184 msz += w1[j]; 185 } 186 187 188 /* Do a global reduction to determine how many messages to expect*/ 189 { 190 int *rw1, *rw2; 191 rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 192 rw2 = rw1+size; 193 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 194 bsz = rw1[rank]; 195 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 196 nrqr = rw2[rank]; 197 PetscFree(rw1); 198 } 199 200 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 201 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 202 rbuf = (int**) PetscMalloc(len); CHKPTRQ(rbuf); 203 rbuf[0] = (int *) (rbuf + nrqr); 204 for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz; 205 206 /* Post the receives */ 207 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 208 CHKPTRQ(r_waits1); 209 for ( i=0; i<nrqr; ++i ){ 210 MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 211 } 212 213 /* Allocate Memory for outgoing messages */ 214 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 215 outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat); 216 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 217 PetscMemzero(outdat,2*size*sizeof(int*)); 218 tmp = (int *) (outdat + 2*size); 219 ctr = tmp + msz; 220 221 { 222 int *iptr = tmp,ict = 0; 223 for ( i=0; i<nrqs; i++ ) { 224 j = pa[i]; 225 iptr += ict; 226 outdat[j] = iptr; 227 ict = w1[j]; 228 } 229 } 230 231 /* Form the outgoing messages */ 232 /*plug in the headers*/ 233 for ( i=0; i<nrqs; i++ ) { 234 j = pa[i]; 235 outdat[j][0] = 0; 236 PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int)); 237 ptr[j] = outdat[j] + 2*w3[j] + 1; 238 } 239 240 /* Memory for doing local proc's work*/ 241 { 242 int *d_p; 243 char *t_p; 244 245 len = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) + 246 (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char); 247 table = (char **)PetscMalloc(len); CHKPTRQ(table); 248 data = (int **)(table + imax); 249 data[0] = (int *)(data + imax); 250 isz = data[0] + (Mbs+1)*imax; 251 table[0] = (char *)(isz + imax); 252 d_p = data[0]; t_p = table[0]; 253 for ( i=1; i<imax; i++ ) { 254 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 255 data[i] = d_p + (Mbs+1)*i; 256 } 257 } 258 PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax); 259 PetscMemzero(isz,imax*sizeof(int)); 260 261 /* Parse the IS and update local tables and the outgoing buf with the data*/ 262 { 263 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 264 char *table_i; 265 266 for ( i=0; i<imax; i++ ) { 267 PetscMemzero(ctr,size*sizeof(int)); 268 n_i = n[i]; 269 table_i = table[i]; 270 idx_i = idx[i]; 271 data_i = data[i]; 272 isz_i = isz[i]; 273 for ( j=0; j<n_i; j++ ) { /* parse the indices of each IS */ 274 row = idx_i[j]; 275 proc = rtable[row]; 276 if (proc != rank) { /* copy to the outgoing buffer */ 277 ctr[proc]++; 278 *ptr[proc] = row; 279 ptr[proc]++; 280 } 281 else { /* Update the local table */ 282 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 283 } 284 } 285 /* Update the headers for the current IS */ 286 for ( j=0; j<size; j++ ) { /* Can Optimise this loop by using pa[] */ 287 if ((ctr_j = ctr[j])) { 288 outdat_j = outdat[j]; 289 k = ++outdat_j[0]; 290 outdat_j[2*k] = ctr_j; 291 outdat_j[2*k-1] = i; 292 } 293 } 294 isz[i] = isz_i; 295 } 296 } 297 298 299 300 /* Now post the sends */ 301 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 302 CHKPTRQ(s_waits1); 303 for ( i=0; i<nrqs; ++i ) { 304 j = pa[i]; 305 MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 306 } 307 308 /* No longer need the original indices*/ 309 for ( i=0; i<imax; ++i ) { 310 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 311 } 312 PetscFree(idx); 313 314 for ( i=0; i<imax; ++i ) { 315 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 316 } 317 318 /* Do Local work*/ 319 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 320 321 /* Receive messages*/ 322 { 323 int index; 324 325 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 326 CHKPTRQ(recv_status); 327 for ( i=0; i<nrqr; ++i ) { 328 MPI_Waitany(nrqr, r_waits1, &index, recv_status+i); 329 } 330 331 s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 332 CHKPTRQ(s_status); 333 MPI_Waitall(nrqs,s_waits1,s_status); 334 } 335 336 /* Phase 1 sends are complete - deallocate buffers */ 337 PetscFree(outdat); 338 PetscFree(w1); 339 340 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 341 isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1); 342 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 343 PetscFree(rbuf); 344 345 /* Send the data back*/ 346 /* Do a global reduction to know the buffer space req for incoming messages*/ 347 { 348 int *rw1, *rw2; 349 350 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 351 PetscMemzero(rw1,2*size*sizeof(int)); 352 rw2 = rw1+size; 353 for ( i=0; i<nrqr; ++i ) { 354 proc = recv_status[i].MPI_SOURCE; 355 rw1[proc] = isz1[i]; 356 } 357 358 MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm); 359 bsz1 = rw2[rank]; 360 PetscFree(rw1); 361 } 362 363 /* Allocate buffers*/ 364 365 /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 366 len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 367 rbuf2 = (int**) PetscMalloc(len); CHKPTRQ(rbuf2); 368 rbuf2[0] = (int *) (rbuf2 + nrqs); 369 for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1; 370 371 /* Post the receives */ 372 r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 373 CHKPTRQ(r_waits2); 374 for ( i=0; i<nrqs; ++i ) { 375 MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i); 376 } 377 378 /* Now post the sends */ 379 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 380 CHKPTRQ(s_waits2); 381 for ( i=0; i<nrqr; ++i ) { 382 j = recv_status[i].MPI_SOURCE; 383 MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i); 384 } 385 386 /* receive work done on other processors*/ 387 { 388 int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 389 char *table_i; 390 MPI_Status *status2; 391 392 status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 393 394 for ( i=0; i<nrqs; ++i ) { 395 MPI_Waitany(nrqs, r_waits2, &index, status2+i); 396 /* Process the message*/ 397 rbuf2_i = rbuf2[index]; 398 ct1 = 2*rbuf2_i[0]+1; 399 jmax = rbuf2[index][0]; 400 for ( j=1; j<=jmax; j++ ) { 401 max = rbuf2_i[2*j]; 402 is_no = rbuf2_i[2*j-1]; 403 isz_i = isz[is_no]; 404 data_i = data[is_no]; 405 table_i = table[is_no]; 406 for ( k=0; k<max; k++,ct1++ ) { 407 row = rbuf2_i[ct1]; 408 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 409 } 410 isz[is_no] = isz_i; 411 } 412 } 413 MPI_Waitall(nrqr,s_waits2,status2); 414 PetscFree(status2); 415 } 416 417 for ( i=0; i<imax; ++i ) { 418 ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 419 } 420 421 PetscFree(pa); 422 PetscFree(rbuf2); 423 PetscFree(s_waits1); 424 PetscFree(r_waits1); 425 PetscFree(s_waits2); 426 PetscFree(r_waits2); 427 PetscFree(table); 428 PetscFree(s_status); 429 PetscFree(recv_status); 430 PetscFree(xdata[0]); 431 PetscFree(xdata); 432 PetscFree(isz1); 433 return 0; 434 } 435 436 /* 437 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 438 the work on the local processor. 439 440 Inputs: 441 C - MAT_MPIBAIJ; 442 imax - total no of index sets processed at a time; 443 table - an array of char - size = Mbs bits. 444 445 Output: 446 isz - array containing the count of the solution elements correspondign 447 to each index set; 448 data - pointer to the solutions 449 */ 450 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz, 451 int **data) 452 { 453 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 454 Mat A = c->A, B = c->B; 455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 456 int start, end, val, max, rstart,cstart,*ai, *aj; 457 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 458 char *table_i; 459 460 rstart = c->rstart; 461 cstart = c->cstart; 462 ai = a->i; 463 aj = a->j; 464 bi = b->i; 465 bj = b->j; 466 garray = c->garray; 467 468 469 for ( i=0; i<imax; i++ ) { 470 data_i = data[i]; 471 table_i = table[i]; 472 isz_i = isz[i]; 473 for ( j=0, max=isz[i]; j<max; j++ ) { 474 row = data_i[j] - rstart; 475 start = ai[row]; 476 end = ai[row+1]; 477 for ( k=start; k<end; k++ ) { /* Amat */ 478 val = aj[k] + cstart; 479 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 480 } 481 start = bi[row]; 482 end = bi[row+1]; 483 for ( k=start; k<end; k++ ) { /* Bmat */ 484 val = garray[bj[k]]; 485 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 486 } 487 } 488 isz[i] = isz_i; 489 } 490 return 0; 491 } 492 /* 493 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 494 and return the output 495 496 Input: 497 C - the matrix 498 nrqr - no of messages being processed. 499 rbuf - an array of pointers to the recieved requests 500 501 Output: 502 xdata - array of messages to be sent back 503 isz1 - size of each message 504 505 For better efficiency perhaps we should malloc seperately each xdata[i], 506 then if a remalloc is required we need only copy the data for that one row 507 rather then all previous rows as it is now where a single large chunck of 508 memory is used. 509 510 */ 511 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf, 512 int **xdata, int * isz1) 513 { 514 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 515 Mat A = c->A, B = c->B; 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 517 int rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k; 518 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 519 int val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr; 520 int *rbuf_i,kmax,rbuf_0; 521 char *xtable; 522 523 rank = c->rank; 524 Mbs = c->Mbs; 525 rstart = c->rstart; 526 cstart = c->cstart; 527 ai = a->i; 528 aj = a->j; 529 bi = b->i; 530 bj = b->j; 531 garray = c->garray; 532 533 534 for ( i=0,ct=0,total_sz=0; i<nrqr; ++i ) { 535 rbuf_i = rbuf[i]; 536 rbuf_0 = rbuf_i[0]; 537 ct += rbuf_0; 538 for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; } 539 } 540 541 max1 = ct*(a->nz +b->nz)/c->Mbs; 542 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 543 xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]); 544 ++no_malloc; 545 xtable = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable); 546 PetscMemzero(isz1,nrqr*sizeof(int)); 547 548 ct3 = 0; 549 for ( i=0; i<nrqr; i++ ) { /* for easch mesg from proc i */ 550 rbuf_i = rbuf[i]; 551 rbuf_0 = rbuf_i[0]; 552 ct1 = 2*rbuf_0+1; 553 ct2 = ct1; 554 ct3 += ct1; 555 for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i*/ 556 PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char)); 557 oct2 = ct2; 558 kmax = rbuf_i[2*j]; 559 for ( k=0; k<kmax; k++, ct1++ ) { 560 row = rbuf_i[ct1]; 561 if (!BT_LOOKUP(xtable,row)) { 562 if (!(ct3 < mem_estimate)) { 563 new_estimate = (int)(1.5*mem_estimate)+1; 564 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 565 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 566 PetscFree(xdata[0]); 567 xdata[0] = tmp; 568 mem_estimate = new_estimate; ++no_malloc; 569 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 570 } 571 xdata[i][ct2++] = row; 572 ct3++; 573 } 574 } 575 for ( k=oct2,max2=ct2; k<max2; k++ ) { 576 row = xdata[i][k] - rstart; 577 start = ai[row]; 578 end = ai[row+1]; 579 for ( l=start; l<end; l++ ) { 580 val = aj[l] + cstart; 581 if (!BT_LOOKUP(xtable,val)) { 582 if (!(ct3 < mem_estimate)) { 583 new_estimate = (int)(1.5*mem_estimate)+1; 584 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 585 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 586 PetscFree(xdata[0]); 587 xdata[0] = tmp; 588 mem_estimate = new_estimate; ++no_malloc; 589 for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 590 } 591 xdata[i][ct2++] = val; 592 ct3++; 593 } 594 } 595 start = bi[row]; 596 end = bi[row+1]; 597 for ( l=start; l<end; l++ ) { 598 val = garray[bj[l]]; 599 if (!BT_LOOKUP(xtable,val)) { 600 if (!(ct3 < mem_estimate)) { 601 new_estimate = (int)(1.5*mem_estimate)+1; 602 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 603 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 604 PetscFree(xdata[0]); 605 xdata[0] = tmp; 606 mem_estimate = new_estimate; ++no_malloc; 607 for ( ctr =1; ctr <=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 608 } 609 xdata[i][ct2++] = val; 610 ct3++; 611 } 612 } 613 } 614 /* Update the header*/ 615 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 616 xdata[i][2*j-1] = rbuf_i[2*j-1]; 617 } 618 xdata[i][0] = rbuf_0; 619 xdata[i+1] = xdata[i] + ct2; 620 isz1[i] = ct2; /* size of each message */ 621 } 622 PetscFree(xtable); 623 PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 624 return 0; 625 } 626 627 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **); 628 629 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 630 MatGetSubMatrixCall scall,Mat **submat) 631 { 632 int i,ierr; 633 IS *isrow_new,*iscol_new; 634 635 /* The compression and expansion should be avoided. Dos'nt point 636 out errors might change the indices hence buggey */ 637 638 isrow_new = (IS *)PetscMalloc(2*ismax*sizeof(IS)); CHKPTRQ(isrow_new); 639 iscol_new = isrow_new + ismax; 640 ierr = MatCompressIndices_MPIBAIJ(C, ismax, isrow,isrow_new); CHKERRQ(ierr); 641 ierr = MatCompressIndices_MPIBAIJ(C, ismax, iscol,iscol_new); CHKERRQ(ierr); 642 ierr = MatGetSubMatrices_MPIBAIJ_local(C,ismax,isrow_new,iscol_new,scall,submat); CHKERRQ(ierr); 643 644 for ( i=0; i<ismax; i++) { 645 ISDestroy(isrow_new[i]); 646 ISDestroy(iscol_new[i]); 647 } 648 PetscFree(isrow_new); 649 return 0; 650 } 651 652 /* -------------------------------------------------------------------------*/ 653 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol, 654 MatGetSubMatrixCall scall,Mat **submat) 655 { 656 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 657 Mat A = c->A,*submats = *submat; 658 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat; 659 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 660 int **sbuf1,**sbuf2, rank, Mbs,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 661 int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,tag,*tmp,tcol,bsz,nrqr; 662 int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 663 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 664 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 665 int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA, *cworkB; 666 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 667 int *bmap = c->garray,ctmp,rstart=c->rstart; 668 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 669 MPI_Request *r_waits4,*s_waits3,*s_waits4; 670 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 671 MPI_Status *r_status3,*r_status4,*s_status4; 672 MPI_Comm comm; 673 Scalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 674 Scalar *a_a=a->a,*b_a=b->a; 675 676 comm = C->comm; 677 tag = C->tag; 678 size = c->size; 679 rank = c->rank; 680 Mbs = c->Mbs; 681 682 /* Check if the col indices are sorted */ 683 for ( i=0; i<ismax; i++ ) { 684 ierr = ISSorted(iscol[i],(PetscTruth*)&j); 685 if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted"); 686 } 687 688 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int); 689 irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 690 icol = irow + ismax; 691 nrow = (int *) (icol + ismax); 692 ncol = nrow + ismax; 693 rtable = ncol + ismax; 694 695 for ( i=0; i<ismax; i++ ) { 696 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 697 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 698 ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 699 ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 700 } 701 702 /* Create hash table for the mapping :row -> proc*/ 703 for ( i=0,j=0; i<size; i++ ) { 704 jmax = c->rowners[i+1]; 705 for ( ; j<jmax; j++ ) { 706 rtable[j] = i; 707 } 708 } 709 710 /* evaluate communication - mesg to who, length of mesg, and buffer space 711 required. Based on this, buffers are allocated, and data copied into them*/ 712 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 713 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 714 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 715 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 716 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 717 for ( i=0; i<ismax; i++ ) { 718 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 719 jmax = nrow[i]; 720 irow_i = irow[i]; 721 for ( j=0; j<jmax; j++ ) { 722 row = irow_i[j]; 723 proc = rtable[row]; 724 w4[proc]++; 725 } 726 for ( j=0; j<size; j++ ) { 727 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 728 } 729 } 730 731 nrqs = 0; /* no of outgoing messages */ 732 msz = 0; /* total mesg length (for all proc */ 733 w1[rank] = 0; /* no mesg sent to intself */ 734 w3[rank] = 0; 735 for ( i=0; i<size; i++ ) { 736 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 737 } 738 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 739 for ( i=0, j=0; i<size; i++ ) { 740 if (w1[i]) { pa[j] = i; j++; } 741 } 742 743 /* Each message would have a header = 1 + 2*(no of IS) + data */ 744 for ( i=0; i<nrqs; i++ ) { 745 j = pa[i]; 746 w1[j] += w2[j] + 2* w3[j]; 747 msz += w1[j]; 748 } 749 /* Do a global reduction to determine how many messages to expect*/ 750 { 751 int *rw1, *rw2; 752 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 753 rw2 = rw1+size; 754 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 755 bsz = rw1[rank]; 756 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 757 nrqr = rw2[rank]; 758 PetscFree(rw1); 759 } 760 761 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 762 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 763 rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 764 rbuf1[0] = (int *) (rbuf1 + nrqr); 765 for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz; 766 767 /* Post the receives */ 768 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 769 CHKPTRQ(r_waits1); 770 for ( i=0; i<nrqr; ++i ) { 771 MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 772 } 773 774 /* Allocate Memory for outgoing messages */ 775 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 776 sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 777 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 778 PetscMemzero(sbuf1,2*size*sizeof(int*)); 779 /* allocate memory for outgoing data + buf to receive the first reply */ 780 tmp = (int *) (ptr + size); 781 ctr = tmp + 2*msz; 782 783 { 784 785 int *iptr = tmp,ict = 0; 786 for ( i=0; i<nrqs; i++ ) { 787 j = pa[i]; 788 iptr += ict; 789 sbuf1[j] = iptr; 790 ict = w1[j]; 791 } 792 } 793 794 /* Form the outgoing messages */ 795 /* Initialise the header space */ 796 for ( i=0; i<nrqs; i++ ) { 797 j = pa[i]; 798 sbuf1[j][0] = 0; 799 PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 800 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 801 } 802 803 /* Parse the isrow and copy data into outbuf */ 804 for ( i=0; i<ismax; i++ ) { 805 PetscMemzero(ctr,size*sizeof(int)); 806 irow_i = irow[i]; 807 jmax = nrow[i]; 808 for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */ 809 row = irow_i[j]; 810 proc = rtable[row]; 811 if (proc != rank) { /* copy to the outgoing buf*/ 812 ctr[proc]++; 813 *ptr[proc] = row; 814 ptr[proc]++; 815 } 816 } 817 /* Update the headers for the current IS */ 818 for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */ 819 if ((ctr_j = ctr[j])) { 820 sbuf1_j = sbuf1[j]; 821 k = ++sbuf1_j[0]; 822 sbuf1_j[2*k] = ctr_j; 823 sbuf1_j[2*k-1] = i; 824 } 825 } 826 } 827 828 /* Now post the sends */ 829 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 830 CHKPTRQ(s_waits1); 831 for ( i=0; i<nrqs; ++i ) { 832 j = pa[i]; 833 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 834 MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 835 } 836 837 /* Post Recieves to capture the buffer size */ 838 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 839 CHKPTRQ(r_waits2); 840 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 841 rbuf2[0] = tmp + msz; 842 for ( i=1; i<nrqs; ++i ) { 843 j = pa[i]; 844 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 845 } 846 for ( i=0; i<nrqs; ++i ) { 847 j = pa[i]; 848 MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag+1, comm, r_waits2+i); 849 } 850 851 /* Send to other procs the buf size they should allocate */ 852 853 854 /* Receive messages*/ 855 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 856 CHKPTRQ(s_waits2); 857 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 858 CHKPTRQ(r_status1); 859 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 860 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 861 req_size = (int *) (sbuf2 + nrqr); 862 req_source = req_size + nrqr; 863 864 { 865 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 866 int *sAi = sA->i, *sBi = sB->i, id; 867 int *sbuf2_i; 868 869 for ( i=0; i<nrqr; ++i ) { 870 MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 871 req_size[index] = 0; 872 rbuf1_i = rbuf1[index]; 873 start = 2*rbuf1_i[0] + 1; 874 MPI_Get_count(r_status1+i,MPI_INT, &end); 875 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 876 sbuf2_i = sbuf2[index]; 877 for ( j=start; j<end; j++ ) { 878 id = rbuf1_i[j] - rstart; 879 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 880 sbuf2_i[j] = ncols; 881 req_size[index] += ncols; 882 } 883 req_source[index] = r_status1[i].MPI_SOURCE; 884 /* form the header */ 885 sbuf2_i[0] = req_size[index]; 886 for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; } 887 MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag+1,comm,s_waits2+i); 888 } 889 } 890 PetscFree(r_status1); PetscFree(r_waits1); 891 892 /* recv buffer sizes */ 893 /* Receive messages*/ 894 895 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 896 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 897 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 898 CHKPTRQ(r_waits3); 899 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 900 CHKPTRQ(r_waits4); 901 r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 902 CHKPTRQ(r_status2); 903 904 for ( i=0; i<nrqs; ++i ) { 905 MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 906 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 907 CHKPTRQ(rbuf3[index]); 908 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar)); 909 CHKPTRQ(rbuf4[index]); 910 MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 911 r_status2[i].MPI_SOURCE, tag+2, comm, r_waits3+index); 912 MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR, 913 r_status2[i].MPI_SOURCE, tag+3, comm, r_waits4+index); 914 } 915 PetscFree(r_status2); PetscFree(r_waits2); 916 917 /* Wait on sends1 and sends2 */ 918 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 919 CHKPTRQ(s_status1); 920 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 921 CHKPTRQ(s_status2); 922 923 MPI_Waitall(nrqs,s_waits1,s_status1); 924 MPI_Waitall(nrqr,s_waits2,s_status2); 925 PetscFree(s_status1); PetscFree(s_status2); 926 PetscFree(s_waits1); PetscFree(s_waits2); 927 928 /* Now allocate buffers for a->j, and send them off */ 929 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 930 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 931 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 932 for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 933 934 s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 935 CHKPTRQ(s_waits3); 936 { 937 for ( i=0; i<nrqr; i++ ) { 938 rbuf1_i = rbuf1[i]; 939 sbuf_aj_i = sbuf_aj[i]; 940 ct1 = 2*rbuf1_i[0] + 1; 941 ct2 = 0; 942 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 943 kmax = rbuf1[i][2*j]; 944 for ( k=0; k<kmax; k++,ct1++ ) { 945 row = rbuf1_i[ct1] - cstart; 946 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 947 ncols = nzA + nzB; 948 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 949 950 /* load the column indices for this row into cols*/ 951 cols = sbuf_aj_i + ct2; 952 for ( l=0; l<nzB; l++ ) { 953 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 954 else break; 955 } 956 imark = l; 957 for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l]; 958 for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]]; 959 ct2 += ncols; 960 } 961 } 962 MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag+2,comm,s_waits3+i); 963 } 964 } 965 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 966 CHKPTRQ(r_status3); 967 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 968 CHKPTRQ(s_status3); 969 970 /* Allocate buffers for a->a, and send them off */ 971 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 972 for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 973 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 974 for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 975 976 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 977 CHKPTRQ(s_waits4); 978 { 979 for ( i=0; i<nrqr; i++ ) { 980 rbuf1_i = rbuf1[i]; 981 sbuf_aa_i = sbuf_aa[i]; 982 ct1 = 2*rbuf1_i[0]+1; 983 ct2 = 0; 984 for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 985 kmax = rbuf1_i[2*j]; 986 for ( k=0; k<kmax; k++,ct1++ ) { 987 row = rbuf1_i[ct1] - cstart; 988 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 989 ncols = nzA + nzB; 990 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 991 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 992 993 /* load the column values for this row into vals*/ 994 vals = sbuf_aa_i+ct2*bs2; 995 for ( l=0; l<nzB; l++ ) { 996 if ((bmap[cworkB[l]]) < cstart) 997 PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 998 else break; 999 } 1000 imark = l; 1001 for ( l=0; l<nzA; l++ ) 1002 PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar)); 1003 for ( l=imark; l<nzB; l++ ) 1004 PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1005 ct2 += ncols; 1006 } 1007 } 1008 MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag+3,comm,s_waits4+i); 1009 } 1010 } 1011 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1012 CHKPTRQ(r_status4); 1013 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1014 CHKPTRQ(s_status4); 1015 PetscFree(rbuf1); 1016 1017 /* Form the matrix */ 1018 /* create col map */ 1019 { 1020 int *icol_i; 1021 1022 len = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int); 1023 cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 1024 cmap[0] = (int *)(cmap + ismax); 1025 PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int)); 1026 for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->Nbs; } 1027 for ( i=0; i<ismax; i++ ) { 1028 jmax = ncol[i]; 1029 icol_i = icol[i]; 1030 cmap_i = cmap[i]; 1031 for ( j=0; j<jmax; j++ ) { 1032 cmap_i[icol_i[j]] = j+1; 1033 } 1034 } 1035 } 1036 1037 1038 /* Create lens which is required for MatCreate... */ 1039 for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; } 1040 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1041 lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 1042 lens[0] = (int *)(lens + ismax); 1043 PetscMemzero(lens[0], j*sizeof(int)); 1044 for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; } 1045 1046 /* Update lens from local data */ 1047 for ( i=0; i<ismax; i++ ) { 1048 jmax = nrow[i]; 1049 cmap_i = cmap[i]; 1050 irow_i = irow[i]; 1051 lens_i = lens[i]; 1052 for ( j=0; j<jmax; j++ ) { 1053 row = irow_i[j]; 1054 proc = rtable[row]; 1055 if (proc == rank) { 1056 /* Get indices from matA and then from matB */ 1057 row = row - rstart; 1058 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1059 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1060 for ( k=0; k<nzA; k++ ) 1061 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1062 for ( k=0; k<nzB; k++ ) 1063 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1064 } 1065 } 1066 } 1067 1068 /* Create row map*/ 1069 len = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int); 1070 rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1071 rmap[0] = (int *)(rmap + ismax); 1072 PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int)); 1073 for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->Mbs;} 1074 for ( i=0; i<ismax; i++ ) { 1075 rmap_i = rmap[i]; 1076 irow_i = irow[i]; 1077 jmax = nrow[i]; 1078 for ( j=0; j<jmax; j++ ) { 1079 rmap_i[irow_i[j]] = j; 1080 } 1081 } 1082 1083 /* Update lens from offproc data */ 1084 { 1085 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1086 1087 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1088 MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1089 index = pa[i]; 1090 sbuf1_i = sbuf1[index]; 1091 jmax = sbuf1_i[0]; 1092 ct1 = 2*jmax+1; 1093 ct2 = 0; 1094 rbuf2_i = rbuf2[i]; 1095 rbuf3_i = rbuf3[i]; 1096 for ( j=1; j<=jmax; j++ ) { 1097 is_no = sbuf1_i[2*j-1]; 1098 max1 = sbuf1_i[2*j]; 1099 lens_i = lens[is_no]; 1100 cmap_i = cmap[is_no]; 1101 rmap_i = rmap[is_no]; 1102 for ( k=0; k<max1; k++,ct1++ ) { 1103 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1104 max2 = rbuf2_i[ct1]; 1105 for ( l=0; l<max2; l++,ct2++ ) { 1106 if (cmap_i[rbuf3_i[ct2]]) { 1107 lens_i[row]++; 1108 } 1109 } 1110 } 1111 } 1112 } 1113 } 1114 PetscFree(r_status3); PetscFree(r_waits3); 1115 MPI_Waitall(nrqr,s_waits3,s_status3); 1116 PetscFree(s_status3); PetscFree(s_waits3); 1117 1118 /* Create the submatrices */ 1119 if (scall == MAT_REUSE_MATRIX) { 1120 /* 1121 Assumes new rows are same length as the old rows, hence bug! 1122 */ 1123 for ( i=0; i<ismax; i++ ) { 1124 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1125 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1126 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size"); 1127 } 1128 if (PetscMemcmp(mat->ilen,lens[i], mat->mbs *sizeof(int))) { 1129 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1130 } 1131 /* Initial matrix as if empty */ 1132 PetscMemzero(mat->ilen,mat->mbs*sizeof(int)); 1133 } 1134 } 1135 else { 1136 *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1137 for ( i=0; i<ismax; i++ ) { 1138 ierr = MatCreateSeqBAIJ(comm,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr); 1139 } 1140 } 1141 1142 /* Assemble the matrices */ 1143 /* First assemble the local rows */ 1144 { 1145 int ilen_row,*imat_ilen, *imat_j, *imat_i; 1146 Scalar *imat_a; 1147 1148 for ( i=0; i<ismax; i++ ) { 1149 mat = (Mat_SeqBAIJ *) submats[i]->data; 1150 imat_ilen = mat->ilen; 1151 imat_j = mat->j; 1152 imat_i = mat->i; 1153 imat_a = mat->a; 1154 cmap_i = cmap[i]; 1155 rmap_i = rmap[i]; 1156 irow_i = irow[i]; 1157 jmax = nrow[i]; 1158 for ( j=0; j<jmax; j++ ) { 1159 row = irow_i[j]; 1160 proc = rtable[row]; 1161 if (proc == rank) { 1162 row = row - rstart; 1163 nzA = a_i[row+1] - a_i[row]; 1164 nzB = b_i[row+1] - b_i[row]; 1165 cworkA = a_j + a_i[row]; 1166 cworkB = b_j + b_i[row]; 1167 vworkA = a_a + a_i[row]*bs2; 1168 vworkB = b_a + b_i[row]*bs2; 1169 1170 row = rmap_i[row + rstart]; 1171 mat_i = imat_i[row]; 1172 mat_a = imat_a + mat_i*bs2; 1173 mat_j = imat_j + mat_i; 1174 ilen_row = imat_ilen[row]; 1175 1176 /* load the column indices for this row into cols*/ 1177 for ( l=0; l<nzB; l++ ) { 1178 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1179 if ((tcol = cmap_i[ctmp])) { 1180 *mat_j++ = tcol - 1; 1181 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1182 ilen_row++; 1183 } 1184 } 1185 else break; 1186 } 1187 imark = l; 1188 for ( l=0; l<nzA; l++ ) { 1189 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1190 *mat_j++ = tcol - 1; 1191 PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1192 ilen_row++; 1193 } 1194 } 1195 for ( l=imark; l<nzB; l++ ) { 1196 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1197 *mat_j++ = tcol - 1; 1198 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1199 ilen_row++; 1200 } 1201 } 1202 imat_ilen[row] = ilen_row; 1203 } 1204 } 1205 1206 } 1207 } 1208 1209 /* Now assemble the off proc rows*/ 1210 { 1211 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1212 int *imat_j,*imat_i; 1213 Scalar *imat_a,*rbuf4_i; 1214 1215 for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1216 MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1217 index = pa[i]; 1218 sbuf1_i = sbuf1[index]; 1219 jmax = sbuf1_i[0]; 1220 ct1 = 2*jmax + 1; 1221 ct2 = 0; 1222 rbuf2_i = rbuf2[i]; 1223 rbuf3_i = rbuf3[i]; 1224 rbuf4_i = rbuf4[i]; 1225 for ( j=1; j<=jmax; j++ ) { 1226 is_no = sbuf1_i[2*j-1]; 1227 rmap_i = rmap[is_no]; 1228 cmap_i = cmap[is_no]; 1229 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1230 imat_ilen = mat->ilen; 1231 imat_j = mat->j; 1232 imat_i = mat->i; 1233 imat_a = mat->a; 1234 max1 = sbuf1_i[2*j]; 1235 for ( k=0; k<max1; k++, ct1++ ) { 1236 row = sbuf1_i[ct1]; 1237 row = rmap_i[row]; 1238 ilen = imat_ilen[row]; 1239 mat_i = imat_i[row]; 1240 mat_a = imat_a + mat_i*bs2; 1241 mat_j = imat_j + mat_i; 1242 max2 = rbuf2_i[ct1]; 1243 for ( l=0; l<max2; l++,ct2++ ) { 1244 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1245 *mat_j++ = tcol - 1; 1246 /* *mat_a++ = rbuf4_i[ct2]; */ 1247 PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1248 ilen++; 1249 } 1250 } 1251 imat_ilen[row] = ilen; 1252 } 1253 } 1254 } 1255 } 1256 PetscFree(r_status4); PetscFree(r_waits4); 1257 MPI_Waitall(nrqr,s_waits4,s_status4); 1258 PetscFree(s_waits4); PetscFree(s_status4); 1259 1260 /* Restore the indices */ 1261 for ( i=0; i<ismax; i++ ) { 1262 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1263 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1264 } 1265 1266 /* Destroy allocated memory */ 1267 PetscFree(irow); 1268 PetscFree(w1); 1269 PetscFree(pa); 1270 1271 PetscFree(sbuf1); 1272 PetscFree(rbuf2); 1273 for ( i=0; i<nrqr; ++i ) { 1274 PetscFree(sbuf2[i]); 1275 } 1276 for ( i=0; i<nrqs; ++i ) { 1277 PetscFree(rbuf3[i]); 1278 PetscFree(rbuf4[i]); 1279 } 1280 1281 PetscFree(sbuf2); 1282 PetscFree(rbuf3); 1283 PetscFree(rbuf4 ); 1284 PetscFree(sbuf_aj[0]); 1285 PetscFree(sbuf_aj); 1286 PetscFree(sbuf_aa[0]); 1287 PetscFree(sbuf_aa); 1288 1289 PetscFree(cmap); 1290 PetscFree(rmap); 1291 PetscFree(lens); 1292 1293 for ( i=0; i<ismax; i++ ) { 1294 ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1295 ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1296 } 1297 return 0; 1298 } 1299