1 #ifndef lint 2 static char vcid[] = "$Id: baijov.c,v 1.12 1996/09/14 03:08:37 bsmith Exp balay $"; 3 #endif 4 /* 5 Routines to compute overlapping regions of a parallel MPI matrix 6 and to find submatrices that were shared across processors. 7 */ 8 #include "src/mat/impls/baij/mpi/mpibaij.h" 9 #include "src/inline/bitarray.h" 10 11 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *); 12 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**); 13 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* ); 14 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 15 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 16 17 18 #undef __FUNCTION__ 19 #define __FUNCTION__ "MatCompressIndicesGeneral_MPIBAIJ" 20 static int MatCompressIndicesGeneral_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 21 { 22 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 23 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 24 char *table; 25 26 Nbs = baij->Nbs; 27 table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table); 28 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx); 29 30 for (i=0; i<imax; i++) { 31 isz = 0; 32 PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char)); 33 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 34 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 35 for (j=0; j<n ; j++) { 36 ival = idx[j]/bs; /* convert the indices into block indices */ 37 if (ival>Nbs) SETERRQ(1,"MatCompressIndicesGeneral_MPIBAIJ: index greater than mat-dim"); 38 if(!BT_LOOKUP(table, ival)) { nidx[isz++] = ival;} 39 } 40 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 41 ierr = ISCreateGeneral(MPI_COMM_SELF, isz, nidx, (is_out+i)); CHKERRQ(ierr); 42 } 43 PetscFree(table); 44 PetscFree(nidx); 45 return 0; 46 } 47 48 #undef __FUNCTION__ 49 #define __FUNCTION__ "MatCompressIndicesSorted_MPIBAIJ" 50 static int MatCompressIndicesSorted_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 51 { 52 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 53 int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local; 54 PetscTruth flg; 55 56 for (i=0; i<imax; i++) { 57 ierr = ISSorted(is_in[i],&flg); CHKERRQ(ierr); 58 if (!flg) SETERRQ(1,"MatCompressIndicesSorted_MPIBAIJ: Indices are not sorted"); 59 } 60 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx); 61 /* Now chech if the indices are in block order */ 62 for (i=0; i<imax; i++) { 63 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 64 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 65 if ( n%bs !=0 ) SETERRA(1,"MatCompressIndicesSorted_MPIBAIJ: Indices are not block ordered"); 66 67 n = n/bs; /* The reduced index size */ 68 idx_local = idx; 69 for (j=0; j<n ; j++) { 70 val = idx_local[0]; 71 if(val%bs != 0) SETERRA(1,"MatCompressIncicesSorted_MPIBAIJ: Indices are not block ordered"); 72 for (k=0; k<bs; k++) { 73 if ( val+k != idx_local[k]) SETERRA(1,"MatCompressIncicesSorted_MPIBAIJ: Indices are not block ordered"); 74 } 75 nidx[j] = val/bs; 76 idx_local +=bs; 77 } 78 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 79 ierr = ISCreateGeneral(MPI_COMM_SELF,n,nidx,(is_out+i)); CHKERRQ(ierr); 80 } 81 PetscFree(nidx); 82 return 0; 83 } 84 85 #undef __FUNCTION__ 86 #define __FUNCTION__ "MatExpandIndices_MPIBAIJ" 87 static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 88 { 89 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 90 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 91 92 Nbs = baij->Nbs; 93 94 nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx); 95 96 for ( i=0; i<imax; i++ ) { 97 ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr); 98 ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr); 99 for (j=0; j<n ; ++j){ 100 for (k=0; k<bs; k++) 101 nidx[j*bs+k] = idx[j]*bs+k; 102 } 103 ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr); 104 ierr = ISCreateGeneral(MPI_COMM_SELF, n*bs, nidx, (is_out+i)); CHKERRQ(ierr); 105 } 106 PetscFree(nidx); 107 return 0; 108 } 109 110 111 #undef __FUNCTION__ 112 #define __FUNCTION__ "MatIncreaseOverlap_MPIBAIJ" 113 int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov) 114 { 115 int i, ierr; 116 IS *is_new; 117 118 is_new = (IS *)PetscMalloc(imax*sizeof(IS)); CHKPTRQ(is_new); 119 /* Convert the indices into block format */ 120 ierr = MatCompressIndicesGeneral_MPIBAIJ(C, imax, is,is_new); CHKERRQ(ierr); 121 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIBAIJ:Negative overlap specified\n");} 122 for (i=0; i<ov; ++i) { 123 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is_new); CHKERRQ(ierr); 124 } 125 for (i=0; i<imax; i++) ISDestroy(is[i]); 126 ierr = MatExpandIndices_MPIBAIJ(C, imax, is_new,is); CHKERRQ(ierr); 127 for (i=0; i<imax; i++) ISDestroy(is_new[i]); 128 PetscFree(is_new); 129 return 0; 130 } 131 132 /* 133 Sample message format: 134 If a processor A wants processor B to process some elements corresponding 135 to index sets 1s[1], is[5] 136 mesg [0] = 2 ( no of index sets in the mesg) 137 ----------- 138 mesg [1] = 1 => is[1] 139 mesg [2] = sizeof(is[1]); 140 ----------- 141 mesg [5] = 5 => is[5] 142 mesg [6] = sizeof(is[5]); 143 ----------- 144 mesg [7] 145 mesg [n] datas[1] 146 ----------- 147 mesg[n+1] 148 mesg[m] data(is[5]) 149 ----------- 150 151 Notes: 152 nrqs - no of requests sent (or to be sent out) 153 nrqr - no of requests recieved (which have to be or which have been processed 154 */ 155 #undef __FUNCTION__ 156 #define __FUNCTION__ "MatIncreaseOverlap_MPIBAIJ_Once" 157 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is) 158 { 159 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 160 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 161 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 162 int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 163 char **table; 164 MPI_Comm comm; 165 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 166 MPI_Status *s_status,*recv_status; 167 168 comm = C->comm; 169 tag = C->tag; 170 size = c->size; 171 rank = c->rank; 172 Mbs = c->Mbs; 173 174 len = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int); 175 idx = (int **) PetscMalloc(len); CHKPTRQ(idx); 176 n = (int *) (idx + imax); 177 rtable = n + imax; 178 179 for (i=0; i<imax; i++) { 180 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 181 ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr); 182 } 183 184 /* Create hash table for the mapping :row -> proc*/ 185 for (i=0,j=0; i<size; i++) { 186 len = c->rowners[i+1]; 187 for (; j<len; j++) { 188 rtable[j] = i; 189 } 190 } 191 192 /* evaluate communication - mesg to who, length of mesg, and buffer space 193 required. Based on this, buffers are allocated, and data copied into them*/ 194 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 195 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 196 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 197 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 198 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 199 for (i=0; i<imax; i++) { 200 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 201 idx_i = idx[i]; 202 len = n[i]; 203 for (j=0; j<len; j++) { 204 row = idx_i[j]; 205 proc = rtable[row]; 206 w4[proc]++; 207 } 208 for (j=0; j<size; j++){ 209 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 210 } 211 } 212 213 nrqs = 0; /* no of outgoing messages */ 214 msz = 0; /* total mesg length (for all proc */ 215 w1[rank] = 0; /* no mesg sent to intself */ 216 w3[rank] = 0; 217 for ( i=0; i<size; i++) { 218 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 219 } 220 /* pa - is list of processors to communicate with */ 221 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 222 for (i=0,j=0; i<size; i++) { 223 if (w1[i]) {pa[j] = i; j++;} 224 } 225 226 /* Each message would have a header = 1 + 2*(no of IS) + data */ 227 for (i=0; i<nrqs; i++) { 228 j = pa[i]; 229 w1[j] += w2[j] + 2*w3[j]; 230 msz += w1[j]; 231 } 232 233 234 /* Do a global reduction to determine how many messages to expect*/ 235 { 236 int *rw1, *rw2; 237 rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 238 rw2 = rw1+size; 239 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 240 bsz = rw1[rank]; 241 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 242 nrqr = rw2[rank]; 243 PetscFree(rw1); 244 } 245 246 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 247 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 248 rbuf = (int**) PetscMalloc(len); CHKPTRQ(rbuf); 249 rbuf[0] = (int *) (rbuf + nrqr); 250 for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz; 251 252 /* Post the receives */ 253 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 254 CHKPTRQ(r_waits1); 255 for (i=0; i<nrqr; ++i) { 256 MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 257 } 258 259 /* Allocate Memory for outgoing messages */ 260 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 261 outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat); 262 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 263 PetscMemzero(outdat,2*size*sizeof(int*)); 264 tmp = (int *) (outdat + 2*size); 265 ctr = tmp + msz; 266 267 { 268 int *iptr = tmp,ict = 0; 269 for (i=0; i<nrqs; i++) { 270 j = pa[i]; 271 iptr += ict; 272 outdat[j] = iptr; 273 ict = w1[j]; 274 } 275 } 276 277 /* Form the outgoing messages */ 278 /*plug in the headers*/ 279 for (i=0; i<nrqs; i++) { 280 j = pa[i]; 281 outdat[j][0] = 0; 282 PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int)); 283 ptr[j] = outdat[j] + 2*w3[j] + 1; 284 } 285 286 /* Memory for doing local proc's work*/ 287 { 288 int *d_p; 289 char *t_p; 290 291 len = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) + 292 (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char); 293 table = (char **)PetscMalloc(len); CHKPTRQ(table); 294 data = (int **)(table + imax); 295 data[0] = (int *)(data + imax); 296 isz = data[0] + (Mbs+1)*imax; 297 table[0] = (char *)(isz + imax); 298 d_p = data[0]; t_p = table[0]; 299 for (i=1; i<imax; i++) { 300 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 301 data[i] = d_p + (Mbs+1)*i; 302 } 303 } 304 PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax); 305 PetscMemzero(isz,imax*sizeof(int)); 306 307 /* Parse the IS and update local tables and the outgoing buf with the data*/ 308 { 309 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 310 char *table_i; 311 312 for (i=0; i<imax; i++) { 313 PetscMemzero(ctr,size*sizeof(int)); 314 n_i = n[i]; 315 table_i = table[i]; 316 idx_i = idx[i]; 317 data_i = data[i]; 318 isz_i = isz[i]; 319 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 320 row = idx_i[j]; 321 proc = rtable[row]; 322 if (proc != rank) { /* copy to the outgoing buffer */ 323 ctr[proc]++; 324 *ptr[proc] = row; 325 ptr[proc]++; 326 } 327 else { /* Update the local table */ 328 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 329 } 330 } 331 /* Update the headers for the current IS */ 332 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 333 if ((ctr_j = ctr[j])) { 334 outdat_j = outdat[j]; 335 k = ++outdat_j[0]; 336 outdat_j[2*k] = ctr_j; 337 outdat_j[2*k-1] = i; 338 } 339 } 340 isz[i] = isz_i; 341 } 342 } 343 344 345 346 /* Now post the sends */ 347 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 348 CHKPTRQ(s_waits1); 349 for (i=0; i<nrqs; ++i) { 350 j = pa[i]; 351 MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 352 } 353 354 /* No longer need the original indices*/ 355 for (i=0; i<imax; ++i) { 356 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 357 } 358 PetscFree(idx); 359 360 for (i=0; i<imax; ++i) { 361 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 362 } 363 364 /* Do Local work*/ 365 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 366 367 /* Receive messages*/ 368 { 369 int index; 370 371 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 372 CHKPTRQ(recv_status); 373 for (i=0; i<nrqr; ++i) { 374 MPI_Waitany(nrqr, r_waits1, &index, recv_status+i); 375 } 376 377 s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 378 CHKPTRQ(s_status); 379 MPI_Waitall(nrqs,s_waits1,s_status); 380 } 381 382 /* Phase 1 sends are complete - deallocate buffers */ 383 PetscFree(outdat); 384 PetscFree(w1); 385 386 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 387 isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1); 388 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 389 PetscFree(rbuf); 390 391 /* Send the data back*/ 392 /* Do a global reduction to know the buffer space req for incoming messages*/ 393 { 394 int *rw1, *rw2; 395 396 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 397 PetscMemzero(rw1,2*size*sizeof(int)); 398 rw2 = rw1+size; 399 for (i=0; i<nrqr; ++i) { 400 proc = recv_status[i].MPI_SOURCE; 401 rw1[proc] = isz1[i]; 402 } 403 404 MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm); 405 bsz1 = rw2[rank]; 406 PetscFree(rw1); 407 } 408 409 /* Allocate buffers*/ 410 411 /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 412 len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 413 rbuf2 = (int**) PetscMalloc(len); CHKPTRQ(rbuf2); 414 rbuf2[0] = (int *) (rbuf2 + nrqs); 415 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1; 416 417 /* Post the receives */ 418 r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 419 CHKPTRQ(r_waits2); 420 for (i=0; i<nrqs; ++i) { 421 MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i); 422 } 423 424 /* Now post the sends */ 425 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 426 CHKPTRQ(s_waits2); 427 for (i=0; i<nrqr; ++i) { 428 j = recv_status[i].MPI_SOURCE; 429 MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i); 430 } 431 432 /* receive work done on other processors*/ 433 { 434 int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 435 char *table_i; 436 MPI_Status *status2; 437 438 status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 439 440 for (i=0; i<nrqs; ++i) { 441 MPI_Waitany(nrqs, r_waits2, &index, status2+i); 442 /* Process the message*/ 443 rbuf2_i = rbuf2[index]; 444 ct1 = 2*rbuf2_i[0]+1; 445 jmax = rbuf2[index][0]; 446 for (j=1; j<=jmax; j++) { 447 max = rbuf2_i[2*j]; 448 is_no = rbuf2_i[2*j-1]; 449 isz_i = isz[is_no]; 450 data_i = data[is_no]; 451 table_i = table[is_no]; 452 for (k=0; k<max; k++,ct1++) { 453 row = rbuf2_i[ct1]; 454 if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 455 } 456 isz[is_no] = isz_i; 457 } 458 } 459 MPI_Waitall(nrqr,s_waits2,status2); 460 PetscFree(status2); 461 } 462 463 for (i=0; i<imax; ++i) { 464 ierr = ISCreateGeneral(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 465 } 466 467 PetscFree(pa); 468 PetscFree(rbuf2); 469 PetscFree(s_waits1); 470 PetscFree(r_waits1); 471 PetscFree(s_waits2); 472 PetscFree(r_waits2); 473 PetscFree(table); 474 PetscFree(s_status); 475 PetscFree(recv_status); 476 PetscFree(xdata[0]); 477 PetscFree(xdata); 478 PetscFree(isz1); 479 return 0; 480 } 481 482 #undef __FUNCTION__ 483 #define __FUNCTION__ "MatIncreaseOverlap_MPIBAIJ_Local" 484 /* 485 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 486 the work on the local processor. 487 488 Inputs: 489 C - MAT_MPIBAIJ; 490 imax - total no of index sets processed at a time; 491 table - an array of char - size = Mbs bits. 492 493 Output: 494 isz - array containing the count of the solution elements correspondign 495 to each index set; 496 data - pointer to the solutions 497 */ 498 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz, 499 int **data) 500 { 501 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 502 Mat A = c->A, B = c->B; 503 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 504 int start, end, val, max, rstart,cstart,*ai, *aj; 505 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 506 char *table_i; 507 508 rstart = c->rstart; 509 cstart = c->cstart; 510 ai = a->i; 511 aj = a->j; 512 bi = b->i; 513 bj = b->j; 514 garray = c->garray; 515 516 517 for (i=0; i<imax; i++) { 518 data_i = data[i]; 519 table_i = table[i]; 520 isz_i = isz[i]; 521 for (j=0, max=isz[i]; j<max; j++) { 522 row = data_i[j] - rstart; 523 start = ai[row]; 524 end = ai[row+1]; 525 for (k=start; k<end; k++) { /* Amat */ 526 val = aj[k] + cstart; 527 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 528 } 529 start = bi[row]; 530 end = bi[row+1]; 531 for (k=start; k<end; k++) { /* Bmat */ 532 val = garray[bj[k]]; 533 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 534 } 535 } 536 isz[i] = isz_i; 537 } 538 return 0; 539 } 540 #undef __FUNCTION__ 541 #define __FUNCTION__ "MatIncreaseOverlap_MPIBAIJ_Receive" 542 /* 543 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 544 and return the output 545 546 Input: 547 C - the matrix 548 nrqr - no of messages being processed. 549 rbuf - an array of pointers to the recieved requests 550 551 Output: 552 xdata - array of messages to be sent back 553 isz1 - size of each message 554 555 For better efficiency perhaps we should malloc seperately each xdata[i], 556 then if a remalloc is required we need only copy the data for that one row 557 rather then all previous rows as it is now where a single large chunck of 558 memory is used. 559 560 */ 561 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf, 562 int **xdata, int * isz1) 563 { 564 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 565 Mat A = c->A, B = c->B; 566 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 567 int rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k; 568 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 569 int val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr; 570 int *rbuf_i,kmax,rbuf_0; 571 char *xtable; 572 573 rank = c->rank; 574 Mbs = c->Mbs; 575 rstart = c->rstart; 576 cstart = c->cstart; 577 ai = a->i; 578 aj = a->j; 579 bi = b->i; 580 bj = b->j; 581 garray = c->garray; 582 583 584 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 585 rbuf_i = rbuf[i]; 586 rbuf_0 = rbuf_i[0]; 587 ct += rbuf_0; 588 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 589 } 590 591 max1 = ct*(a->nz +b->nz)/c->Mbs; 592 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 593 xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]); 594 ++no_malloc; 595 xtable = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable); 596 PetscMemzero(isz1,nrqr*sizeof(int)); 597 598 ct3 = 0; 599 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 600 rbuf_i = rbuf[i]; 601 rbuf_0 = rbuf_i[0]; 602 ct1 = 2*rbuf_0+1; 603 ct2 = ct1; 604 ct3 += ct1; 605 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 606 PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char)); 607 oct2 = ct2; 608 kmax = rbuf_i[2*j]; 609 for (k=0; k<kmax; k++, ct1++) { 610 row = rbuf_i[ct1]; 611 if (!BT_LOOKUP(xtable,row)) { 612 if (!(ct3 < mem_estimate)) { 613 new_estimate = (int)(1.5*mem_estimate)+1; 614 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 615 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 616 PetscFree(xdata[0]); 617 xdata[0] = tmp; 618 mem_estimate = new_estimate; ++no_malloc; 619 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 620 } 621 xdata[i][ct2++] = row; 622 ct3++; 623 } 624 } 625 for (k=oct2,max2=ct2; k<max2; k++) { 626 row = xdata[i][k] - rstart; 627 start = ai[row]; 628 end = ai[row+1]; 629 for (l=start; l<end; l++) { 630 val = aj[l] + cstart; 631 if (!BT_LOOKUP(xtable,val)) { 632 if (!(ct3 < mem_estimate)) { 633 new_estimate = (int)(1.5*mem_estimate)+1; 634 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 635 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 636 PetscFree(xdata[0]); 637 xdata[0] = tmp; 638 mem_estimate = new_estimate; ++no_malloc; 639 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 640 } 641 xdata[i][ct2++] = val; 642 ct3++; 643 } 644 } 645 start = bi[row]; 646 end = bi[row+1]; 647 for (l=start; l<end; l++) { 648 val = garray[bj[l]]; 649 if (!BT_LOOKUP(xtable,val)) { 650 if (!(ct3 < mem_estimate)) { 651 new_estimate = (int)(1.5*mem_estimate)+1; 652 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 653 PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 654 PetscFree(xdata[0]); 655 xdata[0] = tmp; 656 mem_estimate = new_estimate; ++no_malloc; 657 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 658 } 659 xdata[i][ct2++] = val; 660 ct3++; 661 } 662 } 663 } 664 /* Update the header*/ 665 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 666 xdata[i][2*j-1] = rbuf_i[2*j-1]; 667 } 668 xdata[i][0] = rbuf_0; 669 xdata[i+1] = xdata[i] + ct2; 670 isz1[i] = ct2; /* size of each message */ 671 } 672 PetscFree(xtable); 673 PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 674 return 0; 675 } 676 677 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **); 678 679 #undef __FUNCTION__ 680 #define __FUNCTION__ "MatGetSubMatrices_MPIBAIJ" 681 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 682 MatGetSubMatrixCall scall,Mat **submat) 683 { 684 int i,ierr; 685 IS *isrow_new,*iscol_new; 686 687 /* The compression and expansion should be avoided. Dos'nt point 688 out errors might change the indices hence buggey */ 689 690 isrow_new = (IS *)PetscMalloc(2*ismax*sizeof(IS)); CHKPTRQ(isrow_new); 691 iscol_new = isrow_new + ismax; 692 ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, isrow,isrow_new); CHKERRQ(ierr); 693 ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, iscol,iscol_new); CHKERRQ(ierr); 694 ierr = MatGetSubMatrices_MPIBAIJ_local(C,ismax,isrow_new,iscol_new,scall,submat); CHKERRQ(ierr); 695 696 for (i=0; i<ismax; i++) { 697 ISDestroy(isrow_new[i]); 698 ISDestroy(iscol_new[i]); 699 } 700 PetscFree(isrow_new); 701 return 0; 702 } 703 704 /* -------------------------------------------------------------------------*/ 705 #undef __FUNCTION__ 706 #define __FUNCTION__ "MatGetSubMatrices_MPIBAIJ_local" 707 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol, 708 MatGetSubMatrixCall scall,Mat **submat) 709 { 710 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 711 Mat A = c->A,*submats = *submat; 712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat; 713 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 714 int **sbuf1,**sbuf2, rank, Mbs,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 715 int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr; 716 int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 717 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 718 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 719 int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA, *cworkB; 720 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 721 int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3; 722 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 723 MPI_Request *r_waits4,*s_waits3,*s_waits4; 724 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 725 MPI_Status *r_status3,*r_status4,*s_status4; 726 MPI_Comm comm; 727 Scalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 728 Scalar *a_a=a->a,*b_a=b->a; 729 730 comm = C->comm; 731 tag0 = C->tag; 732 size = c->size; 733 rank = c->rank; 734 Mbs = c->Mbs; 735 736 /* Get some new tags to keep the communication clean */ 737 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 738 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 739 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 740 741 /* Check if the col indices are sorted */ 742 for (i=0; i<ismax; i++) { 743 ierr = ISSorted(iscol[i],(PetscTruth*)&j); 744 if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted"); 745 } 746 747 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int); 748 irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 749 icol = irow + ismax; 750 nrow = (int *) (icol + ismax); 751 ncol = nrow + ismax; 752 rtable = ncol + ismax; 753 754 for (i=0; i<ismax; i++) { 755 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 756 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 757 ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 758 ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 759 } 760 761 /* Create hash table for the mapping :row -> proc*/ 762 for (i=0,j=0; i<size; i++) { 763 jmax = c->rowners[i+1]; 764 for (; j<jmax; j++) { 765 rtable[j] = i; 766 } 767 } 768 769 /* evaluate communication - mesg to who, length of mesg, and buffer space 770 required. Based on this, buffers are allocated, and data copied into them*/ 771 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 772 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 773 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 774 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 775 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 776 for (i=0; i<ismax; i++) { 777 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 778 jmax = nrow[i]; 779 irow_i = irow[i]; 780 for (j=0; j<jmax; j++) { 781 row = irow_i[j]; 782 proc = rtable[row]; 783 w4[proc]++; 784 } 785 for (j=0; j<size; j++) { 786 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 787 } 788 } 789 790 nrqs = 0; /* no of outgoing messages */ 791 msz = 0; /* total mesg length for all proc */ 792 w1[rank] = 0; /* no mesg sent to intself */ 793 w3[rank] = 0; 794 for (i=0; i<size; i++) { 795 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 796 } 797 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 798 for (i=0, j=0; i<size; i++) { 799 if (w1[i]) { pa[j] = i; j++; } 800 } 801 802 /* Each message would have a header = 1 + 2*(no of IS) + data */ 803 for (i=0; i<nrqs; i++) { 804 j = pa[i]; 805 w1[j] += w2[j] + 2* w3[j]; 806 msz += w1[j]; 807 } 808 /* Do a global reduction to determine how many messages to expect*/ 809 { 810 int *rw1, *rw2; 811 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 812 rw2 = rw1+size; 813 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 814 bsz = rw1[rank]; 815 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 816 nrqr = rw2[rank]; 817 PetscFree(rw1); 818 } 819 820 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 821 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 822 rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 823 rbuf1[0] = (int *) (rbuf1 + nrqr); 824 for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 825 826 /* Post the receives */ 827 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 828 CHKPTRQ(r_waits1); 829 for (i=0; i<nrqr; ++i) { 830 MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i); 831 } 832 833 /* Allocate Memory for outgoing messages */ 834 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 835 sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 836 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 837 PetscMemzero(sbuf1,2*size*sizeof(int*)); 838 /* allocate memory for outgoing data + buf to receive the first reply */ 839 tmp = (int *) (ptr + size); 840 ctr = tmp + 2*msz; 841 842 { 843 844 int *iptr = tmp,ict = 0; 845 for (i=0; i<nrqs; i++) { 846 j = pa[i]; 847 iptr += ict; 848 sbuf1[j] = iptr; 849 ict = w1[j]; 850 } 851 } 852 853 /* Form the outgoing messages */ 854 /* Initialise the header space */ 855 for (i=0; i<nrqs; i++) { 856 j = pa[i]; 857 sbuf1[j][0] = 0; 858 PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 859 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 860 } 861 862 /* Parse the isrow and copy data into outbuf */ 863 for (i=0; i<ismax; i++) { 864 PetscMemzero(ctr,size*sizeof(int)); 865 irow_i = irow[i]; 866 jmax = nrow[i]; 867 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 868 row = irow_i[j]; 869 proc = rtable[row]; 870 if (proc != rank) { /* copy to the outgoing buf*/ 871 ctr[proc]++; 872 *ptr[proc] = row; 873 ptr[proc]++; 874 } 875 } 876 /* Update the headers for the current IS */ 877 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 878 if ((ctr_j = ctr[j])) { 879 sbuf1_j = sbuf1[j]; 880 k = ++sbuf1_j[0]; 881 sbuf1_j[2*k] = ctr_j; 882 sbuf1_j[2*k-1] = i; 883 } 884 } 885 } 886 887 /* Now post the sends */ 888 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 889 CHKPTRQ(s_waits1); 890 for (i=0; i<nrqs; ++i) { 891 j = pa[i]; 892 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 893 MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i); 894 } 895 896 /* Post Recieves to capture the buffer size */ 897 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 898 CHKPTRQ(r_waits2); 899 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 900 rbuf2[0] = tmp + msz; 901 for (i=1; i<nrqs; ++i) { 902 j = pa[i]; 903 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 904 } 905 for (i=0; i<nrqs; ++i) { 906 j = pa[i]; 907 MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i); 908 } 909 910 /* Send to other procs the buf size they should allocate */ 911 912 913 /* Receive messages*/ 914 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 915 CHKPTRQ(s_waits2); 916 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 917 CHKPTRQ(r_status1); 918 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 919 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 920 req_size = (int *) (sbuf2 + nrqr); 921 req_source = req_size + nrqr; 922 923 { 924 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 925 int *sAi = sA->i, *sBi = sB->i, id; 926 int *sbuf2_i; 927 928 for (i=0; i<nrqr; ++i) { 929 MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 930 req_size[index] = 0; 931 rbuf1_i = rbuf1[index]; 932 start = 2*rbuf1_i[0] + 1; 933 MPI_Get_count(r_status1+i,MPI_INT, &end); 934 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 935 sbuf2_i = sbuf2[index]; 936 for (j=start; j<end; j++) { 937 id = rbuf1_i[j] - rstart; 938 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 939 sbuf2_i[j] = ncols; 940 req_size[index] += ncols; 941 } 942 req_source[index] = r_status1[i].MPI_SOURCE; 943 /* form the header */ 944 sbuf2_i[0] = req_size[index]; 945 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 946 MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); 947 } 948 } 949 PetscFree(r_status1); PetscFree(r_waits1); 950 951 /* recv buffer sizes */ 952 /* Receive messages*/ 953 954 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 955 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 956 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 957 CHKPTRQ(r_waits3); 958 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 959 CHKPTRQ(r_waits4); 960 r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 961 CHKPTRQ(r_status2); 962 963 for (i=0; i<nrqs; ++i) { 964 MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 965 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 966 CHKPTRQ(rbuf3[index]); 967 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar)); 968 CHKPTRQ(rbuf4[index]); 969 MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 970 r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); 971 MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR, 972 r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); 973 } 974 PetscFree(r_status2); PetscFree(r_waits2); 975 976 /* Wait on sends1 and sends2 */ 977 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 978 CHKPTRQ(s_status1); 979 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 980 CHKPTRQ(s_status2); 981 982 MPI_Waitall(nrqs,s_waits1,s_status1); 983 MPI_Waitall(nrqr,s_waits2,s_status2); 984 PetscFree(s_status1); PetscFree(s_status2); 985 PetscFree(s_waits1); PetscFree(s_waits2); 986 987 /* Now allocate buffers for a->j, and send them off */ 988 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 989 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 990 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 991 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 992 993 s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 994 CHKPTRQ(s_waits3); 995 { 996 for (i=0; i<nrqr; i++) { 997 rbuf1_i = rbuf1[i]; 998 sbuf_aj_i = sbuf_aj[i]; 999 ct1 = 2*rbuf1_i[0] + 1; 1000 ct2 = 0; 1001 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1002 kmax = rbuf1[i][2*j]; 1003 for (k=0; k<kmax; k++,ct1++) { 1004 row = rbuf1_i[ct1] - rstart; 1005 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1006 ncols = nzA + nzB; 1007 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1008 1009 /* load the column indices for this row into cols*/ 1010 cols = sbuf_aj_i + ct2; 1011 for (l=0; l<nzB; l++) { 1012 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 1013 else break; 1014 } 1015 imark = l; 1016 for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 1017 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 1018 ct2 += ncols; 1019 } 1020 } 1021 MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i); 1022 } 1023 } 1024 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1025 CHKPTRQ(r_status3); 1026 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1027 CHKPTRQ(s_status3); 1028 1029 /* Allocate buffers for a->a, and send them off */ 1030 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 1031 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1032 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 1033 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1034 1035 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 1036 CHKPTRQ(s_waits4); 1037 { 1038 for (i=0; i<nrqr; i++) { 1039 rbuf1_i = rbuf1[i]; 1040 sbuf_aa_i = sbuf_aa[i]; 1041 ct1 = 2*rbuf1_i[0]+1; 1042 ct2 = 0; 1043 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1044 kmax = rbuf1_i[2*j]; 1045 for (k=0; k<kmax; k++,ct1++) { 1046 row = rbuf1_i[ct1] - rstart; 1047 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1048 ncols = nzA + nzB; 1049 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1050 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 1051 1052 /* load the column values for this row into vals*/ 1053 vals = sbuf_aa_i+ct2*bs2; 1054 for (l=0; l<nzB; l++) { 1055 if ((bmap[cworkB[l]]) < cstart) 1056 PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1057 else break; 1058 } 1059 imark = l; 1060 for (l=0; l<nzA; l++) 1061 PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar)); 1062 for (l=imark; l<nzB; l++) 1063 PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1064 ct2 += ncols; 1065 } 1066 } 1067 MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i); 1068 } 1069 } 1070 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1071 CHKPTRQ(r_status4); 1072 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1073 CHKPTRQ(s_status4); 1074 PetscFree(rbuf1); 1075 1076 /* Form the matrix */ 1077 /* create col map */ 1078 { 1079 int *icol_i; 1080 1081 len = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int); 1082 cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 1083 cmap[0] = (int *)(cmap + ismax); 1084 PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int)); 1085 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 1086 for (i=0; i<ismax; i++) { 1087 jmax = ncol[i]; 1088 icol_i = icol[i]; 1089 cmap_i = cmap[i]; 1090 for (j=0; j<jmax; j++) { 1091 cmap_i[icol_i[j]] = j+1; 1092 } 1093 } 1094 } 1095 1096 1097 /* Create lens which is required for MatCreate... */ 1098 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1099 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1100 lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 1101 lens[0] = (int *)(lens + ismax); 1102 PetscMemzero(lens[0], j*sizeof(int)); 1103 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1104 1105 /* Update lens from local data */ 1106 for (i=0; i<ismax; i++) { 1107 jmax = nrow[i]; 1108 cmap_i = cmap[i]; 1109 irow_i = irow[i]; 1110 lens_i = lens[i]; 1111 for (j=0; j<jmax; j++) { 1112 row = irow_i[j]; 1113 proc = rtable[row]; 1114 if (proc == rank) { 1115 /* Get indices from matA and then from matB */ 1116 row = row - rstart; 1117 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1118 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1119 for (k=0; k<nzA; k++) 1120 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1121 for (k=0; k<nzB; k++) 1122 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1123 } 1124 } 1125 } 1126 1127 /* Create row map*/ 1128 len = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int); 1129 rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1130 rmap[0] = (int *)(rmap + ismax); 1131 PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int)); 1132 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;} 1133 for (i=0; i<ismax; i++) { 1134 rmap_i = rmap[i]; 1135 irow_i = irow[i]; 1136 jmax = nrow[i]; 1137 for (j=0; j<jmax; j++) { 1138 rmap_i[irow_i[j]] = j; 1139 } 1140 } 1141 1142 /* Update lens from offproc data */ 1143 { 1144 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1145 1146 for (tmp2=0; tmp2<nrqs; tmp2++) { 1147 MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1148 index = pa[i]; 1149 sbuf1_i = sbuf1[index]; 1150 jmax = sbuf1_i[0]; 1151 ct1 = 2*jmax+1; 1152 ct2 = 0; 1153 rbuf2_i = rbuf2[i]; 1154 rbuf3_i = rbuf3[i]; 1155 for (j=1; j<=jmax; j++) { 1156 is_no = sbuf1_i[2*j-1]; 1157 max1 = sbuf1_i[2*j]; 1158 lens_i = lens[is_no]; 1159 cmap_i = cmap[is_no]; 1160 rmap_i = rmap[is_no]; 1161 for (k=0; k<max1; k++,ct1++) { 1162 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1163 max2 = rbuf2_i[ct1]; 1164 for (l=0; l<max2; l++,ct2++) { 1165 if (cmap_i[rbuf3_i[ct2]]) { 1166 lens_i[row]++; 1167 } 1168 } 1169 } 1170 } 1171 } 1172 } 1173 PetscFree(r_status3); PetscFree(r_waits3); 1174 MPI_Waitall(nrqr,s_waits3,s_status3); 1175 PetscFree(s_status3); PetscFree(s_waits3); 1176 1177 /* Create the submatrices */ 1178 if (scall == MAT_REUSE_MATRIX) { 1179 /* 1180 Assumes new rows are same length as the old rows, hence bug! 1181 */ 1182 for (i=0; i<ismax; i++) { 1183 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1184 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1185 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size"); 1186 } 1187 if (PetscMemcmp(mat->ilen,lens[i], mat->mbs *sizeof(int))) { 1188 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1189 } 1190 /* Initial matrix as if empty */ 1191 PetscMemzero(mat->ilen,mat->mbs*sizeof(int)); 1192 } 1193 } 1194 else { 1195 *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1196 for (i=0; i<ismax; i++) { 1197 ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i); 1198 CHKERRQ(ierr); 1199 } 1200 } 1201 1202 /* Assemble the matrices */ 1203 /* First assemble the local rows */ 1204 { 1205 int ilen_row,*imat_ilen, *imat_j, *imat_i; 1206 Scalar *imat_a; 1207 1208 for (i=0; i<ismax; i++) { 1209 mat = (Mat_SeqBAIJ *) submats[i]->data; 1210 imat_ilen = mat->ilen; 1211 imat_j = mat->j; 1212 imat_i = mat->i; 1213 imat_a = mat->a; 1214 cmap_i = cmap[i]; 1215 rmap_i = rmap[i]; 1216 irow_i = irow[i]; 1217 jmax = nrow[i]; 1218 for (j=0; j<jmax; j++) { 1219 row = irow_i[j]; 1220 proc = rtable[row]; 1221 if (proc == rank) { 1222 row = row - rstart; 1223 nzA = a_i[row+1] - a_i[row]; 1224 nzB = b_i[row+1] - b_i[row]; 1225 cworkA = a_j + a_i[row]; 1226 cworkB = b_j + b_i[row]; 1227 vworkA = a_a + a_i[row]*bs2; 1228 vworkB = b_a + b_i[row]*bs2; 1229 1230 row = rmap_i[row + rstart]; 1231 mat_i = imat_i[row]; 1232 mat_a = imat_a + mat_i*bs2; 1233 mat_j = imat_j + mat_i; 1234 ilen_row = imat_ilen[row]; 1235 1236 /* load the column indices for this row into cols*/ 1237 for (l=0; l<nzB; l++) { 1238 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1239 if ((tcol = cmap_i[ctmp])) { 1240 *mat_j++ = tcol - 1; 1241 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1242 ilen_row++; 1243 } 1244 } 1245 else break; 1246 } 1247 imark = l; 1248 for (l=0; l<nzA; l++) { 1249 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1250 *mat_j++ = tcol - 1; 1251 PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1252 ilen_row++; 1253 } 1254 } 1255 for (l=imark; l<nzB; l++) { 1256 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1257 *mat_j++ = tcol - 1; 1258 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1259 ilen_row++; 1260 } 1261 } 1262 imat_ilen[row] = ilen_row; 1263 } 1264 } 1265 1266 } 1267 } 1268 1269 /* Now assemble the off proc rows*/ 1270 { 1271 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1272 int *imat_j,*imat_i; 1273 Scalar *imat_a,*rbuf4_i; 1274 1275 for (tmp2=0; tmp2<nrqs; tmp2++) { 1276 MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1277 index = pa[i]; 1278 sbuf1_i = sbuf1[index]; 1279 jmax = sbuf1_i[0]; 1280 ct1 = 2*jmax + 1; 1281 ct2 = 0; 1282 rbuf2_i = rbuf2[i]; 1283 rbuf3_i = rbuf3[i]; 1284 rbuf4_i = rbuf4[i]; 1285 for (j=1; j<=jmax; j++) { 1286 is_no = sbuf1_i[2*j-1]; 1287 rmap_i = rmap[is_no]; 1288 cmap_i = cmap[is_no]; 1289 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1290 imat_ilen = mat->ilen; 1291 imat_j = mat->j; 1292 imat_i = mat->i; 1293 imat_a = mat->a; 1294 max1 = sbuf1_i[2*j]; 1295 for (k=0; k<max1; k++, ct1++) { 1296 row = sbuf1_i[ct1]; 1297 row = rmap_i[row]; 1298 ilen = imat_ilen[row]; 1299 mat_i = imat_i[row]; 1300 mat_a = imat_a + mat_i*bs2; 1301 mat_j = imat_j + mat_i; 1302 max2 = rbuf2_i[ct1]; 1303 for (l=0; l<max2; l++,ct2++) { 1304 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1305 *mat_j++ = tcol - 1; 1306 /* *mat_a++ = rbuf4_i[ct2]; */ 1307 PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1308 ilen++; 1309 } 1310 } 1311 imat_ilen[row] = ilen; 1312 } 1313 } 1314 } 1315 } 1316 PetscFree(r_status4); PetscFree(r_waits4); 1317 MPI_Waitall(nrqr,s_waits4,s_status4); 1318 PetscFree(s_waits4); PetscFree(s_status4); 1319 1320 /* Restore the indices */ 1321 for (i=0; i<ismax; i++) { 1322 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1323 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1324 } 1325 1326 /* Destroy allocated memory */ 1327 PetscFree(irow); 1328 PetscFree(w1); 1329 PetscFree(pa); 1330 1331 PetscFree(sbuf1); 1332 PetscFree(rbuf2); 1333 for (i=0; i<nrqr; ++i) { 1334 PetscFree(sbuf2[i]); 1335 } 1336 for (i=0; i<nrqs; ++i) { 1337 PetscFree(rbuf3[i]); 1338 PetscFree(rbuf4[i]); 1339 } 1340 1341 PetscFree(sbuf2); 1342 PetscFree(rbuf3); 1343 PetscFree(rbuf4 ); 1344 PetscFree(sbuf_aj[0]); 1345 PetscFree(sbuf_aj); 1346 PetscFree(sbuf_aa[0]); 1347 PetscFree(sbuf_aa); 1348 1349 PetscFree(cmap); 1350 PetscFree(rmap); 1351 PetscFree(lens); 1352 1353 for (i=0; i<ismax; i++) { 1354 ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1355 ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1356 } 1357 1358 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 1359 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 1360 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 1361 1362 return 0; 1363 } 1364