#ifndef lint
static char vcid[] = "$Id: baijov.c,v 1.11 1996/08/15 12:48:12 bsmith Exp bsmith $";
#endif
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "src/inline/bitarray.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* );
extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);
extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);

/*
   MatCompressIndicesGeneral_MPIBAIJ - Convert imax index sets of point
   (scalar) indices into index sets of block indices, removing duplicates.

   Each point index idx[j] maps to block index idx[j]/bs; a bit table of
   Nbs bits is used so each block index appears at most once per output IS.
   The output sets are created on MPI_COMM_SELF; caller destroys them.
*/
static int MatCompressIndicesGeneral_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
  char        *table;

  Nbs   = baij->Nbs;
  /* one bit per global block row, shared (re-zeroed) across the imax sets */
  table = (char *) PetscMalloc((Nbs/BITSPERBYTE +1)*sizeof(char)); CHKPTRQ(table);
  nidx  = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx);

  for (i=0; i<imax; i++) {
    isz = 0;
    PetscMemzero(table,(Nbs/BITSPERBYTE +1)*sizeof(char));
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    for (j=0; j<n ; j++) {
      ival = idx[j]/bs; /* convert the indices into block indices */
      /* NOTE(review): valid block indices are 0..Nbs-1, so this bound check
         arguably should be ival >= Nbs -- confirm before changing */
      if (ival>Nbs) SETERRQ(1,"MatCompressIndicesGeneral_MPIBAIJ: index greater than mat-dim");
      /* BT_LOOKUP tests AND sets the bit, so each block index is kept once */
      if(!BT_LOOKUP(table, ival)) { nidx[isz++] = ival;}
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(MPI_COMM_SELF, isz, nidx, (is_out+i)); CHKERRQ(ierr);
  }
  PetscFree(table);
  PetscFree(nidx);
  return 0;
}

/*
   MatCompressIndicesSorted_MPIBAIJ - Convert imax SORTED index sets of
   point indices into block index sets, verifying that every block is
   fully represented: the indices must come in runs val, val+1, ..,
   val+bs-1 with val a multiple of bs.  Errors otherwise.

   NOTE(review): the block-order violations use SETERRA (abort) while the
   unsorted case uses SETERRQ (error return) -- presumably intentional for
   this vintage, but the inconsistency is worth confirming.
*/
static int MatCompressIndicesSorted_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local;
  PetscTruth  flg;

  for (i=0; i<imax; i++) {
    ierr = ISSorted(is_in[i],&flg); CHKERRQ(ierr);
    if (!flg) SETERRQ(1,"MatCompressIndicesSorted_MPIBAIJ: Indices are not sorted");
  }
  nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx);
  /* Now check if the indices are in block order */
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    if ( n%bs !=0 ) SETERRA(1,"MatCompressIndicesSorted_MPIBAIJ: Indices are not block ordered");

    n = n/bs; /* The reduced index size */
    idx_local = idx;
    for (j=0; j<n ; j++) {
      val = idx_local[0];
      if(val%bs != 0) SETERRA(1,"MatCompressIncicesSorted_MPIBAIJ: Indices are not block ordered");
      /* the bs point indices of a block must be consecutive: val..val+bs-1 */
      for (k=0; k<bs; k++) {
        if ( val+k != idx_local[k]) SETERRA(1,"MatCompressIncicesSorted_MPIBAIJ: Indices are not block ordered");
      }
      nidx[j] = val/bs;
      idx_local +=bs;
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(MPI_COMM_SELF,n,nidx,(is_out+i)); CHKERRQ(ierr);
  }
  PetscFree(nidx);
  return 0;
}

/*
   MatExpandIndices_MPIBAIJ - Inverse of the compression above: convert
   imax sets of block indices back to point indices, each block index b
   expanding to b*bs, .., b*bs+bs-1.
*/
static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;

  Nbs  = baij->Nbs;
  nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx);

  for ( i=0; i<imax; i++ ) {
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    for (j=0; j<n ; ++j){
      for (k=0; k<bs; k++)
        nidx[j*bs+k] = idx[j]*bs+k;
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(MPI_COMM_SELF, n*bs, nidx, (is_out+i)); CHKERRQ(ierr);
  }
  PetscFree(nidx);
  return 0;
}

/*
   MatIncreaseOverlap_MPIBAIJ - Public entry: grow each index set in is[]
   by ov levels of overlap.  Works in block-index space: compress the
   point indices to block indices, apply the overlap pass ov times, then
   expand back to point indices in place of the original sets.

   NOTE(review): the ov < 0 check happens after the compression has
   already allocated and built is_new -- harmless but could be hoisted.
*/
int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov)
{
  int i, ierr;
  IS  *is_new;

  is_new = (IS *)PetscMalloc(imax*sizeof(IS)); CHKPTRQ(is_new);
  /* Convert the indices into block format */
  ierr =
MatCompressIndicesGeneral_MPIBAIJ(C, imax, is,is_new); CHKERRQ(ierr);
  if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIBAIJ:Negative overlap specified\n");}
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is_new); CHKERRQ(ierr);
  }
  /* the original sets are replaced by the expanded overlap sets */
  for (i=0; i<imax; i++) ISDestroy(is[i]);
  ierr = MatExpandIndices_MPIBAIJ(C, imax, is_new,is); CHKERRQ(ierr);
  for (i=0; i<imax; i++) ISDestroy(is_new[i]);
  PetscFree(is_new);
  return 0;
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
  mesg [0] = 2   ( no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/

/*
   MatIncreaseOverlap_MPIBAIJ_Once - One level of overlap growth, in block
   indices.  Each IS entry owned locally is expanded against the local
   diagonal/off-diagonal blocks; entries owned elsewhere are shipped to the
   owning process, expanded there (MatIncreaseOverlap_MPIBAIJ_Receive), and
   the results are shipped back and merged.  The incoming is[] sets are
   destroyed and replaced with the grown sets.
*/
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i;
  int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  comm = C->comm;
  tag  = C->tag;  /* same tag for both phases; safe because phase-1 traffic
                     fully completes before phase-2 messages are posted */
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  /* single allocation carved into: idx (per-IS index arrays), n (per-IS
     sizes), rtable (global block row -> owning proc lookup) */
  len    = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int);
  idx    = (int **) PetscMalloc(len); CHKPTRQ(idx);
  n      = (int *) (idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc*/
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */
  w2 = w1 + size;      /* if w2[i] marked, then a message to proc i*/
  w3 = w2 + size;      /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j       = pa[i];
    w1[j]  += w2[j] + 2*w3[j];
    msz    += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect*/
  {
    int *rw1, *rw2;
    rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    /* MAX over column rank of w1 = largest message anyone sends me */
    MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    /* SUM over column rank of w2 = number of messages I will receive */
    MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */
  len     = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf    = (int**) PetscMalloc(len); CHKPTRQ(rbuf);
  rbuf[0] = (int *) (rbuf + nrqr);
  for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits1);
  for (i=0; i<nrqr; ++i) {
    MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);
  }

  /* Allocate Memory for outgoing messages */
  len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat);
  ptr    = outdat + size;     /* Pointers to the data in outgoing buffers */
  PetscMemzero(outdat,2*size*sizeof(int*));
  tmp    = (int *) (outdat + 2*size);
  ctr    = tmp + msz;

  {
    int *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work*/
  {
    int  *d_p;
    char *t_p;

    /* one slab carved into: table (per-IS bit tables), data (per-IS result
       arrays, up to Mbs entries each), isz (per-IS result counts) */
    len      = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) +
               (Mbs+1)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char);
    table    = (char **)PetscMalloc(len); CHKPTRQ(table);
    data     = (int **)(table + imax);
    data[0]  = (int *)(data + imax);
    isz      = data[0] + (Mbs+1)*imax;
    table[0] = (char *)(isz + imax);
    d_p = data[0]; t_p = table[0];
    for (i=1; i<imax; i++) {
      table[i] = t_p + (Mbs/BITSPERBYTE+1)*i;
      data[i]  = d_p + (Mbs+1)*i;
    }
  }
  PetscMemzero(*table,(Mbs/BITSPERBYTE+1)*imax);
  PetscMemzero(isz,imax*sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    int  n_i,*data_i,isz_i,*outdat_j,ctr_j;
    char *table_i;

    for (i=0; i<imax; i++) {
      PetscMemzero(ctr,size*sizeof(int));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        }
        else { /* Update the local table */
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;  /* count of rows for IS i sent to proc j */
          outdat_j[2*k-1] = i;      /* which IS the rows belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now  post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits1);
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do Local work - overlap local requests while messages are in flight */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    /* NOTE(review): recv_status[i] is the status of the i-th COMPLETED
       receive, whose buffer is rbuf[index] (not necessarily rbuf[i]).
       The send-back phase below pairs recv_status[i].MPI_SOURCE with
       isz1[i]/xdata[i], which are in posting order -- verify these match */
    for (i=0; i<nrqr; ++i) {
      MPI_Waitany(nrqr, r_waits1, &index, recv_status+i);
    }

    s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
    CHKPTRQ(s_status);
    MPI_Waitall(nrqs,s_waits1,s_status);
  }

  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);

  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1);
  ierr  = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  PetscFree(rbuf);

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero(rw1,2*size*sizeof(int));
    rw2 = rw1+size;
    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers*/

  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  len      = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int);
  rbuf2    = (int**) PetscMalloc(len); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) (rbuf2 + nrqs);
  for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Post the receives */
  r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits2);
  for (i=0; i<nrqs; ++i) {
    MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i);
  }

  /* Now  post the sends */
  s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits2);
  for (i=0; i<nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i);
  }

  /* receive work done on other processors*/
  {
    int        index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax;
    char       *table_i;
    MPI_Status *status2;

    status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2);

    for (i=0; i<nrqs; ++i) {
      MPI_Waitany(nrqs, r_waits2, &index, status2+i);
      /* Process the message: merge the returned rows into the local
         per-IS result arrays, deduplicating through the bit tables */
      rbuf2_i = rbuf2[index];
      ct1     = 2*rbuf2_i[0]+1;  /* skip over the header */
      jmax    = rbuf2[index][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }
    /* NOTE(review): status2 is sized nrqs+1 but this waits on nrqr
       requests; if nrqr > nrqs+1 this overruns -- verify */
    MPI_Waitall(nrqr,s_waits2,status2);
    PetscFree(status2);
  }

  /* replace is[] with the grown index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }

  PetscFree(pa);
  PetscFree(rbuf2);
  PetscFree(s_waits1);
  PetscFree(r_waits1);
  PetscFree(s_waits2);
  PetscFree(r_waits2);
  PetscFree(table);
  PetscFree(s_status);
  PetscFree(recv_status);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  PetscFree(isz1);
  return 0;
}

/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once,
   to do the work on the local processor.

     Inputs:
      C     - MAT_MPIBAIJ;
      imax  - total no of index sets processed at a time;
      table - an array of char - size = Mbs bits.
480 481 Output: 482 isz - array containing the count of the solution elements correspondign 483 to each index set; 484 data - pointer to the solutions 485 */ 486 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,char **table,int *isz, 487 int **data) 488 { 489 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 490 Mat A = c->A, B = c->B; 491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 492 int start, end, val, max, rstart,cstart,*ai, *aj; 493 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 494 char *table_i; 495 496 rstart = c->rstart; 497 cstart = c->cstart; 498 ai = a->i; 499 aj = a->j; 500 bi = b->i; 501 bj = b->j; 502 garray = c->garray; 503 504 505 for (i=0; i<imax; i++) { 506 data_i = data[i]; 507 table_i = table[i]; 508 isz_i = isz[i]; 509 for (j=0, max=isz[i]; j<max; j++) { 510 row = data_i[j] - rstart; 511 start = ai[row]; 512 end = ai[row+1]; 513 for (k=start; k<end; k++) { /* Amat */ 514 val = aj[k] + cstart; 515 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 516 } 517 start = bi[row]; 518 end = bi[row+1]; 519 for (k=start; k<end; k++) { /* Bmat */ 520 val = garray[bj[k]]; 521 if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 522 } 523 } 524 isz[i] = isz_i; 525 } 526 return 0; 527 } 528 /* 529 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 530 and return the output 531 532 Input: 533 C - the matrix 534 nrqr - no of messages being processed. 535 rbuf - an array of pointers to the recieved requests 536 537 Output: 538 xdata - array of messages to be sent back 539 isz1 - size of each message 540 541 For better efficiency perhaps we should malloc seperately each xdata[i], 542 then if a remalloc is required we need only copy the data for that one row 543 rather then all previous rows as it is now where a single large chunck of 544 memory is used. 
*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,
                                              int **xdata, int * isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k;
  int         row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end;
  int         val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr;
  int         *rbuf_i,kmax,rbuf_0;
  char        *xtable;

  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total no of IS headers; total_sz = total no of requested rows */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* heuristic size guess; the buffer is grown (realloc-style) below when
     the guess proves too small */
  max1 = ct*(a->nz +b->nz)/c->Mbs;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  xtable = (char *)PetscMalloc((Mbs/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable);
  PetscMemzero(isz1,nrqr*sizeof(int));

  ct3 = 0;  /* total ints consumed in xdata[0] across all replies */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor past the header */
    ct2    =  ct1;         /* write cursor in xdata[i] (header filled later) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh dedup table per IS */
      PetscMemzero(xtable,(Mbs/BITSPERBYTE+1)*sizeof(char));
      oct2 = ct2;  /* remember where this IS's output starts */
      kmax = rbuf_i[2*j];
      /* first, copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!BT_LOOKUP(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared slab and re-point every xdata[ctr];
               relative offsets are preserved so ct2 stays valid */
            new_estimate = (int)(1.5*mem_estimate)+1;
            tmp          = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
            PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each accepted row against the local matrix rows */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat */
          val = aj[l] + cstart;
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat */
          val = garray[bj[l]];
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    /* next reply starts right after this one in the shared slab */
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  PetscFree(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
  return 0;
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);

/*
   MatGetSubMatrices_MPIBAIJ - Public entry for extracting submatrices:
   compress the (sorted, block-aligned) point index sets into block index
   sets, extract with the _local worker, then clean up the temporaries.
*/
int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,
                              MatGetSubMatrixCall scall,Mat **submat)
{
  int i,ierr;
  IS  *isrow_new,*iscol_new;

  /* NOTE(review, from original): the compression and expansion should be
     avoided -- the round-trip may change/reorder indices, hence buggy */

  /* single allocation: isrow_new followed by iscol_new */
  isrow_new = (IS *)PetscMalloc(2*ismax*sizeof(IS)); CHKPTRQ(isrow_new);
  iscol_new = isrow_new + ismax;
  ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, isrow,isrow_new); CHKERRQ(ierr);
  ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, iscol,iscol_new); CHKERRQ(ierr);
  ierr = MatGetSubMatrices_MPIBAIJ_local(C,ismax,isrow_new,iscol_new,scall,submat); CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ISDestroy(isrow_new[i]);
    ISDestroy(iscol_new[i]);
  }
  PetscFree(isrow_new); /* frees iscol_new too (same allocation) */
  return 0;
}

/* -------------------------------------------------------------------------*/
/* (continues beyond this view) */
static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,
                                           MatGetSubMatrixCall scall,Mat **submat)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A,*submats = *submat;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int         **sbuf1,**sbuf2, rank, Mbs,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc;
  int         nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap;
  int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int         *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA, *cworkB;
  int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
  int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
Scalar *a_a=a->a,*b_a=b->a; 711 712 comm = C->comm; 713 tag0 = C->tag; 714 size = c->size; 715 rank = c->rank; 716 Mbs = c->Mbs; 717 718 /* Get some new tags to keep the communication clean */ 719 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 720 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 721 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 722 723 /* Check if the col indices are sorted */ 724 for (i=0; i<ismax; i++) { 725 ierr = ISSorted(iscol[i],(PetscTruth*)&j); 726 if (!j) SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:IS is not sorted"); 727 } 728 729 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int); 730 irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 731 icol = irow + ismax; 732 nrow = (int *) (icol + ismax); 733 ncol = nrow + ismax; 734 rtable = ncol + ismax; 735 736 for (i=0; i<ismax; i++) { 737 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 738 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 739 ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 740 ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 741 } 742 743 /* Create hash table for the mapping :row -> proc*/ 744 for (i=0,j=0; i<size; i++) { 745 jmax = c->rowners[i+1]; 746 for (; j<jmax; j++) { 747 rtable[j] = i; 748 } 749 } 750 751 /* evaluate communication - mesg to who, length of mesg, and buffer space 752 required. 
Based on this, buffers are allocated, and data copied into them*/ 753 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 754 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 755 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 756 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 757 PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 758 for (i=0; i<ismax; i++) { 759 PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 760 jmax = nrow[i]; 761 irow_i = irow[i]; 762 for (j=0; j<jmax; j++) { 763 row = irow_i[j]; 764 proc = rtable[row]; 765 w4[proc]++; 766 } 767 for (j=0; j<size; j++) { 768 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 769 } 770 } 771 772 nrqs = 0; /* no of outgoing messages */ 773 msz = 0; /* total mesg length for all proc */ 774 w1[rank] = 0; /* no mesg sent to intself */ 775 w3[rank] = 0; 776 for (i=0; i<size; i++) { 777 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 778 } 779 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 780 for (i=0, j=0; i<size; i++) { 781 if (w1[i]) { pa[j] = i; j++; } 782 } 783 784 /* Each message would have a header = 1 + 2*(no of IS) + data */ 785 for (i=0; i<nrqs; i++) { 786 j = pa[i]; 787 w1[j] += w2[j] + 2* w3[j]; 788 msz += w1[j]; 789 } 790 /* Do a global reduction to determine how many messages to expect*/ 791 { 792 int *rw1, *rw2; 793 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 794 rw2 = rw1+size; 795 MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 796 bsz = rw1[rank]; 797 MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 798 nrqr = rw2[rank]; 799 PetscFree(rw1); 800 } 801 802 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? 
*/ 803 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 804 rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 805 rbuf1[0] = (int *) (rbuf1 + nrqr); 806 for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 807 808 /* Post the receives */ 809 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 810 CHKPTRQ(r_waits1); 811 for (i=0; i<nrqr; ++i) { 812 MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i); 813 } 814 815 /* Allocate Memory for outgoing messages */ 816 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 817 sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 818 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 819 PetscMemzero(sbuf1,2*size*sizeof(int*)); 820 /* allocate memory for outgoing data + buf to receive the first reply */ 821 tmp = (int *) (ptr + size); 822 ctr = tmp + 2*msz; 823 824 { 825 826 int *iptr = tmp,ict = 0; 827 for (i=0; i<nrqs; i++) { 828 j = pa[i]; 829 iptr += ict; 830 sbuf1[j] = iptr; 831 ict = w1[j]; 832 } 833 } 834 835 /* Form the outgoing messages */ 836 /* Initialise the header space */ 837 for (i=0; i<nrqs; i++) { 838 j = pa[i]; 839 sbuf1[j][0] = 0; 840 PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 841 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 842 } 843 844 /* Parse the isrow and copy data into outbuf */ 845 for (i=0; i<ismax; i++) { 846 PetscMemzero(ctr,size*sizeof(int)); 847 irow_i = irow[i]; 848 jmax = nrow[i]; 849 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 850 row = irow_i[j]; 851 proc = rtable[row]; 852 if (proc != rank) { /* copy to the outgoing buf*/ 853 ctr[proc]++; 854 *ptr[proc] = row; 855 ptr[proc]++; 856 } 857 } 858 /* Update the headers for the current IS */ 859 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 860 if ((ctr_j = ctr[j])) { 861 sbuf1_j = sbuf1[j]; 862 k = ++sbuf1_j[0]; 863 sbuf1_j[2*k] = ctr_j; 864 sbuf1_j[2*k-1] = i; 865 } 866 } 867 } 868 869 /* Now post the sends */ 870 s_waits1 = (MPI_Request *) 
PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 871 CHKPTRQ(s_waits1); 872 for (i=0; i<nrqs; ++i) { 873 j = pa[i]; 874 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 875 MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i); 876 } 877 878 /* Post Recieves to capture the buffer size */ 879 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 880 CHKPTRQ(r_waits2); 881 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 882 rbuf2[0] = tmp + msz; 883 for (i=1; i<nrqs; ++i) { 884 j = pa[i]; 885 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 886 } 887 for (i=0; i<nrqs; ++i) { 888 j = pa[i]; 889 MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i); 890 } 891 892 /* Send to other procs the buf size they should allocate */ 893 894 895 /* Receive messages*/ 896 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 897 CHKPTRQ(s_waits2); 898 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 899 CHKPTRQ(r_status1); 900 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 901 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 902 req_size = (int *) (sbuf2 + nrqr); 903 req_source = req_size + nrqr; 904 905 { 906 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 907 int *sAi = sA->i, *sBi = sB->i, id; 908 int *sbuf2_i; 909 910 for (i=0; i<nrqr; ++i) { 911 MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 912 req_size[index] = 0; 913 rbuf1_i = rbuf1[index]; 914 start = 2*rbuf1_i[0] + 1; 915 MPI_Get_count(r_status1+i,MPI_INT, &end); 916 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 917 sbuf2_i = sbuf2[index]; 918 for (j=start; j<end; j++) { 919 id = rbuf1_i[j] - rstart; 920 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 921 sbuf2_i[j] = ncols; 922 req_size[index] += ncols; 923 } 924 req_source[index] = r_status1[i].MPI_SOURCE; 925 /* form the header */ 926 sbuf2_i[0] = req_size[index]; 927 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; 
} 928 MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); 929 } 930 } 931 PetscFree(r_status1); PetscFree(r_waits1); 932 933 /* recv buffer sizes */ 934 /* Receive messages*/ 935 936 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 937 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 938 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 939 CHKPTRQ(r_waits3); 940 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 941 CHKPTRQ(r_waits4); 942 r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 943 CHKPTRQ(r_status2); 944 945 for (i=0; i<nrqs; ++i) { 946 MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 947 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 948 CHKPTRQ(rbuf3[index]); 949 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar)); 950 CHKPTRQ(rbuf4[index]); 951 MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 952 r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); 953 MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR, 954 r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); 955 } 956 PetscFree(r_status2); PetscFree(r_waits2); 957 958 /* Wait on sends1 and sends2 */ 959 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 960 CHKPTRQ(s_status1); 961 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 962 CHKPTRQ(s_status2); 963 964 MPI_Waitall(nrqs,s_waits1,s_status1); 965 MPI_Waitall(nrqr,s_waits2,s_status2); 966 PetscFree(s_status1); PetscFree(s_status2); 967 PetscFree(s_waits1); PetscFree(s_waits2); 968 969 /* Now allocate buffers for a->j, and send them off */ 970 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 971 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 972 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 973 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 974 975 s_waits3 = (MPI_Request *) 
PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 976 CHKPTRQ(s_waits3); 977 { 978 for (i=0; i<nrqr; i++) { 979 rbuf1_i = rbuf1[i]; 980 sbuf_aj_i = sbuf_aj[i]; 981 ct1 = 2*rbuf1_i[0] + 1; 982 ct2 = 0; 983 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 984 kmax = rbuf1[i][2*j]; 985 for (k=0; k<kmax; k++,ct1++) { 986 row = rbuf1_i[ct1] - rstart; 987 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 988 ncols = nzA + nzB; 989 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 990 991 /* load the column indices for this row into cols*/ 992 cols = sbuf_aj_i + ct2; 993 for (l=0; l<nzB; l++) { 994 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 995 else break; 996 } 997 imark = l; 998 for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 999 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 1000 ct2 += ncols; 1001 } 1002 } 1003 MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i); 1004 } 1005 } 1006 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1007 CHKPTRQ(r_status3); 1008 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1009 CHKPTRQ(s_status3); 1010 1011 /* Allocate buffers for a->a, and send them off */ 1012 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 1013 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1014 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 1015 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1016 1017 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 1018 CHKPTRQ(s_waits4); 1019 { 1020 for (i=0; i<nrqr; i++) { 1021 rbuf1_i = rbuf1[i]; 1022 sbuf_aa_i = sbuf_aa[i]; 1023 ct1 = 2*rbuf1_i[0]+1; 1024 ct2 = 0; 1025 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1026 kmax = rbuf1_i[2*j]; 1027 for (k=0; k<kmax; k++,ct1++) { 1028 row = rbuf1_i[ct1] - rstart; 1029 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1030 ncols = nzA + nzB; 1031 cworkA = a_j + a_i[row]; cworkB = b_j 
+ b_i[row]; 1032 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 1033 1034 /* load the column values for this row into vals*/ 1035 vals = sbuf_aa_i+ct2*bs2; 1036 for (l=0; l<nzB; l++) { 1037 if ((bmap[cworkB[l]]) < cstart) 1038 PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1039 else break; 1040 } 1041 imark = l; 1042 for (l=0; l<nzA; l++) 1043 PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar)); 1044 for (l=imark; l<nzB; l++) 1045 PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar)); 1046 ct2 += ncols; 1047 } 1048 } 1049 MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i); 1050 } 1051 } 1052 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 1053 CHKPTRQ(r_status4); 1054 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 1055 CHKPTRQ(s_status4); 1056 PetscFree(rbuf1); 1057 1058 /* Form the matrix */ 1059 /* create col map */ 1060 { 1061 int *icol_i; 1062 1063 len = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int); 1064 cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 1065 cmap[0] = (int *)(cmap + ismax); 1066 PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int)); 1067 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 1068 for (i=0; i<ismax; i++) { 1069 jmax = ncol[i]; 1070 icol_i = icol[i]; 1071 cmap_i = cmap[i]; 1072 for (j=0; j<jmax; j++) { 1073 cmap_i[icol_i[j]] = j+1; 1074 } 1075 } 1076 } 1077 1078 1079 /* Create lens which is required for MatCreate... 
*/ 1080 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1081 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1082 lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 1083 lens[0] = (int *)(lens + ismax); 1084 PetscMemzero(lens[0], j*sizeof(int)); 1085 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1086 1087 /* Update lens from local data */ 1088 for (i=0; i<ismax; i++) { 1089 jmax = nrow[i]; 1090 cmap_i = cmap[i]; 1091 irow_i = irow[i]; 1092 lens_i = lens[i]; 1093 for (j=0; j<jmax; j++) { 1094 row = irow_i[j]; 1095 proc = rtable[row]; 1096 if (proc == rank) { 1097 /* Get indices from matA and then from matB */ 1098 row = row - rstart; 1099 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1100 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1101 for (k=0; k<nzA; k++) 1102 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1103 for (k=0; k<nzB; k++) 1104 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1105 } 1106 } 1107 } 1108 1109 /* Create row map*/ 1110 len = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int); 1111 rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1112 rmap[0] = (int *)(rmap + ismax); 1113 PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int)); 1114 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;} 1115 for (i=0; i<ismax; i++) { 1116 rmap_i = rmap[i]; 1117 irow_i = irow[i]; 1118 jmax = nrow[i]; 1119 for (j=0; j<jmax; j++) { 1120 rmap_i[irow_i[j]] = j; 1121 } 1122 } 1123 1124 /* Update lens from offproc data */ 1125 { 1126 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1127 1128 for (tmp2=0; tmp2<nrqs; tmp2++) { 1129 MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1130 index = pa[i]; 1131 sbuf1_i = sbuf1[index]; 1132 jmax = sbuf1_i[0]; 1133 ct1 = 2*jmax+1; 1134 ct2 = 0; 1135 rbuf2_i = rbuf2[i]; 1136 rbuf3_i = rbuf3[i]; 1137 for (j=1; j<=jmax; j++) { 1138 is_no = sbuf1_i[2*j-1]; 1139 max1 = sbuf1_i[2*j]; 1140 lens_i = lens[is_no]; 1141 cmap_i = cmap[is_no]; 1142 rmap_i = rmap[is_no]; 1143 for (k=0; k<max1; k++,ct1++) { 1144 row = rmap_i[sbuf1_i[ct1]]; 
/* the val in the new matrix to be */ 1145 max2 = rbuf2_i[ct1]; 1146 for (l=0; l<max2; l++,ct2++) { 1147 if (cmap_i[rbuf3_i[ct2]]) { 1148 lens_i[row]++; 1149 } 1150 } 1151 } 1152 } 1153 } 1154 } 1155 PetscFree(r_status3); PetscFree(r_waits3); 1156 MPI_Waitall(nrqr,s_waits3,s_status3); 1157 PetscFree(s_status3); PetscFree(s_waits3); 1158 1159 /* Create the submatrices */ 1160 if (scall == MAT_REUSE_MATRIX) { 1161 /* 1162 Assumes new rows are same length as the old rows, hence bug! 1163 */ 1164 for (i=0; i<ismax; i++) { 1165 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1166 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1167 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong size"); 1168 } 1169 if (PetscMemcmp(mat->ilen,lens[i], mat->mbs *sizeof(int))) { 1170 SETERRQ(1,"MatGetSubmatrices_MPIBAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1171 } 1172 /* Initial matrix as if empty */ 1173 PetscMemzero(mat->ilen,mat->mbs*sizeof(int)); 1174 } 1175 } 1176 else { 1177 *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1178 for (i=0; i<ismax; i++) { 1179 ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i); 1180 CHKERRQ(ierr); 1181 } 1182 } 1183 1184 /* Assemble the matrices */ 1185 /* First assemble the local rows */ 1186 { 1187 int ilen_row,*imat_ilen, *imat_j, *imat_i; 1188 Scalar *imat_a; 1189 1190 for (i=0; i<ismax; i++) { 1191 mat = (Mat_SeqBAIJ *) submats[i]->data; 1192 imat_ilen = mat->ilen; 1193 imat_j = mat->j; 1194 imat_i = mat->i; 1195 imat_a = mat->a; 1196 cmap_i = cmap[i]; 1197 rmap_i = rmap[i]; 1198 irow_i = irow[i]; 1199 jmax = nrow[i]; 1200 for (j=0; j<jmax; j++) { 1201 row = irow_i[j]; 1202 proc = rtable[row]; 1203 if (proc == rank) { 1204 row = row - rstart; 1205 nzA = a_i[row+1] - a_i[row]; 1206 nzB = b_i[row+1] - b_i[row]; 1207 cworkA = a_j + a_i[row]; 1208 cworkB = b_j + b_i[row]; 1209 vworkA = a_a + a_i[row]*bs2; 1210 vworkB = b_a + b_i[row]*bs2; 1211 1212 
row = rmap_i[row + rstart]; 1213 mat_i = imat_i[row]; 1214 mat_a = imat_a + mat_i*bs2; 1215 mat_j = imat_j + mat_i; 1216 ilen_row = imat_ilen[row]; 1217 1218 /* load the column indices for this row into cols*/ 1219 for (l=0; l<nzB; l++) { 1220 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1221 if ((tcol = cmap_i[ctmp])) { 1222 *mat_j++ = tcol - 1; 1223 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1224 ilen_row++; 1225 } 1226 } 1227 else break; 1228 } 1229 imark = l; 1230 for (l=0; l<nzA; l++) { 1231 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1232 *mat_j++ = tcol - 1; 1233 PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1234 ilen_row++; 1235 } 1236 } 1237 for (l=imark; l<nzB; l++) { 1238 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1239 *mat_j++ = tcol - 1; 1240 PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1241 ilen_row++; 1242 } 1243 } 1244 imat_ilen[row] = ilen_row; 1245 } 1246 } 1247 1248 } 1249 } 1250 1251 /* Now assemble the off proc rows*/ 1252 { 1253 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1254 int *imat_j,*imat_i; 1255 Scalar *imat_a,*rbuf4_i; 1256 1257 for (tmp2=0; tmp2<nrqs; tmp2++) { 1258 MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1259 index = pa[i]; 1260 sbuf1_i = sbuf1[index]; 1261 jmax = sbuf1_i[0]; 1262 ct1 = 2*jmax + 1; 1263 ct2 = 0; 1264 rbuf2_i = rbuf2[i]; 1265 rbuf3_i = rbuf3[i]; 1266 rbuf4_i = rbuf4[i]; 1267 for (j=1; j<=jmax; j++) { 1268 is_no = sbuf1_i[2*j-1]; 1269 rmap_i = rmap[is_no]; 1270 cmap_i = cmap[is_no]; 1271 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1272 imat_ilen = mat->ilen; 1273 imat_j = mat->j; 1274 imat_i = mat->i; 1275 imat_a = mat->a; 1276 max1 = sbuf1_i[2*j]; 1277 for (k=0; k<max1; k++, ct1++) { 1278 row = sbuf1_i[ct1]; 1279 row = rmap_i[row]; 1280 ilen = imat_ilen[row]; 1281 mat_i = imat_i[row]; 1282 mat_a = imat_a + mat_i*bs2; 1283 mat_j = imat_j + mat_i; 1284 max2 = rbuf2_i[ct1]; 1285 for (l=0; l<max2; l++,ct2++) { 1286 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 
1287 *mat_j++ = tcol - 1; 1288 /* *mat_a++ = rbuf4_i[ct2]; */ 1289 PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar)); mat_a += bs2; 1290 ilen++; 1291 } 1292 } 1293 imat_ilen[row] = ilen; 1294 } 1295 } 1296 } 1297 } 1298 PetscFree(r_status4); PetscFree(r_waits4); 1299 MPI_Waitall(nrqr,s_waits4,s_status4); 1300 PetscFree(s_waits4); PetscFree(s_status4); 1301 1302 /* Restore the indices */ 1303 for (i=0; i<ismax; i++) { 1304 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1305 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1306 } 1307 1308 /* Destroy allocated memory */ 1309 PetscFree(irow); 1310 PetscFree(w1); 1311 PetscFree(pa); 1312 1313 PetscFree(sbuf1); 1314 PetscFree(rbuf2); 1315 for (i=0; i<nrqr; ++i) { 1316 PetscFree(sbuf2[i]); 1317 } 1318 for (i=0; i<nrqs; ++i) { 1319 PetscFree(rbuf3[i]); 1320 PetscFree(rbuf4[i]); 1321 } 1322 1323 PetscFree(sbuf2); 1324 PetscFree(rbuf3); 1325 PetscFree(rbuf4 ); 1326 PetscFree(sbuf_aj[0]); 1327 PetscFree(sbuf_aj); 1328 PetscFree(sbuf_aa[0]); 1329 PetscFree(sbuf_aa); 1330 1331 PetscFree(cmap); 1332 PetscFree(rmap); 1333 PetscFree(lens); 1334 1335 for (i=0; i<ismax; i++) { 1336 ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1337 ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1338 } 1339 1340 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 1341 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 1342 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 1343 1344 return 0; 1345 } 1346