#ifdef PETSC_RCS_HEADER
static char vcid[] = "$Id: baijov.c,v 1.27 1997/11/03 04:46:15 bsmith Exp bsmith $";
#endif

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "src/inline/bitarray.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int *,int **);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int **,int *);
extern int MatGetRow_MPIBAIJ(Mat,int,int *,int **,Scalar **);
extern int MatRestoreRow_MPIBAIJ(Mat,int,int *,int **,Scalar **);

#undef __FUNC__
#define __FUNC__ "MatCompressIndicesGeneral_MPIBAIJ"
static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
  BT          table;

  PetscFunctionBegin;
  Nbs  = baij->Nbs;
  nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx);
  ierr = BTCreate(Nbs,table); CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    isz = 0;
    BTMemzero(Nbs,table);
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    for (j=0; j<n; j++) {
      ival = idx[j]/bs; /* convert the indices into block indices */
      if (ival > Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
      if (!BTLookupSet(table,ival)) { nidx[isz++] = ival; }
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i)); CHKERRQ(ierr);
  }
  BTDestroy(table);
  PetscFree(nidx);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "MatCompressIndicesSorted_MPIBAIJ"
static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,bs = baij->bs,i,j,k,val,n,*idx,*nidx,Nbs = baij->Nbs,*idx_local;
  PetscTruth  flg;

  PetscFunctionBegin;
  for (i=0; i<imax; i++) {
    ierr = ISSorted(is_in[i],&flg); CHKERRQ(ierr);
    if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Indices are not sorted");
  }
  nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int)); CHKPTRQ(nidx);
  /* Now check if the indices are in block order */
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    if (n%bs != 0) SETERRA(1,0,"Indices are not block ordered");

    n         = n/bs; /* The reduced index size */
    idx_local = idx;
    for (j=0; j<n; j++) {
      val = idx_local[0];
      if (val%bs != 0) SETERRA(1,0,"Indices are not block ordered");
      for (k=0; k<bs; k++) {
        if (val+k != idx_local[k]) SETERRA(1,0,"Indices are not block ordered");
      }
      nidx[j]    = val/bs;
      idx_local += bs;
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i)); CHKERRQ(ierr);
  }
  PetscFree(nidx);
  PetscFunctionReturn(0);
}
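/*
   Worked example (added for illustration; not in the original source): with
   block size bs = 2, the point indices {4,5,10,11} compress to the block
   indices {2,5} -- each idx[j]/bs, with duplicates removed through the bit
   array -- and MatExpandIndices_MPIBAIJ below applies the inverse map,
   expanding the blocks {2,5} back to the points {4,5,10,11}.
*/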
#undef __FUNC__
#define __FUNC__ "MatExpandIndices_MPIBAIJ"
static int MatExpandIndices_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data;
  int         ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;

  PetscFunctionBegin;
  Nbs = baij->Nbs;

  nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int)); CHKPTRQ(nidx);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n); CHKERRQ(ierr);
    for (j=0; j<n; ++j) {
      for (k=0; k<bs; k++) nidx[j*bs+k] = idx[j]*bs+k;
    }
    ierr = ISRestoreIndices(is_in[i],&idx); CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,(is_out+i)); CHKERRQ(ierr);
  }
  PetscFree(nidx);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "MatIncreaseOverlap_MPIBAIJ"
int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS *is,int ov)
{
  int i,ierr;
  IS  *is_new;

  PetscFunctionBegin;
  is_new = (IS *) PetscMalloc(imax*sizeof(IS)); CHKPTRQ(is_new);
  /* Convert the indices into block format */
  ierr = MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new); CHKERRQ(ierr);
  if (ov < 0) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified\n"); }
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new); CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) { ierr = ISDestroy(is[i]); CHKERRQ(ierr); }
  ierr = MatExpandIndices_MPIBAIJ(C,imax,is_new,is); CHKERRQ(ierr);
  for (i=0; i<imax; i++) { ierr = ISDestroy(is_new[i]); CHKERRQ(ierr); }
  PetscFree(is_new);
  PetscFunctionReturn(0);
}

/*
   Sample message format:
   If a processor A wants processor B to process some elements corresponding
   to index sets is[1] and is[5]:

   mesg [0] = 2    (no of index sets in the mesg)
   -----------
   mesg [1] = 1    => is[1]
   mesg [2] = sizeof(is[1]);
   -----------
   mesg [3] = 5    => is[5]
   mesg [4] = sizeof(is[5]);
   -----------
   mesg [5]
    ...            data(is[1])
   mesg [n]
   -----------
   mesg [n+1]
    ...            data(is[5])
   mesg [m]
   -----------

   Notes:
   nrqs - no of requests sent (or to be sent out)
   nrqr - no of requests received (which have to be, or have been, processed)
*/
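/*
   Concrete instance of the above (added for illustration): if is[1] holds
   the block rows {30,31,32} and is[5] holds {88,89}, all owned by processor
   B, then processor A sends B the 10 integers

       { 2,  1, 3,  5, 2,  30, 31, 32,  88, 89 }

   i.e. two index sets, is[1] with 3 entries and is[5] with 2, followed by
   the entries themselves; the length 1 + 2*(no of IS) + data is exactly
   what w1[] accounts for below.
*/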
#undef __FUNC__
#define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Once"
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS *is)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
  int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2;
  BT          *table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  PetscFunctionBegin;
  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  len    = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int);
  idx    = (int **) PetscMalloc(len); CHKPTRQ(idx);
  n      = (int *) (idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1 = (int *) PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;   /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for (i=0; i<imax; i++) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  pa = (int *) PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1,*rw2;
    rw1  = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2  = rw1 + size;
    ierr = MPI_Allreduce(w1,rw1,size,MPI_INT,MPI_MAX,comm); CHKERRQ(ierr);
    bsz  = rw1[rank];
    ierr = MPI_Allreduce(w2,rw2,size,MPI_INT,MPI_SUM,comm); CHKERRQ(ierr);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
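  /*
     How the reduction works (added note): every processor contributes its
     row w1[] of message lengths and its 0/1 row w2[] of message flags.  The
     MPI_MAX reduction of w1 leaves, in slot `rank', the largest message this
     processor can receive (bsz), and the MPI_SUM reduction of w2 leaves the
     number of processors that will send to it (nrqr).  E.g. with 3 procs,
     if procs 0 and 2 both flag messages to proc 1, then rw2[1] = 2 on proc 1.
  */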
  /* Allocate memory for recv buffers (possibly none needed if nrqr == 0) */
  len     = (nrqr+1)*sizeof(int *) + nrqr*bsz*sizeof(int);
  rbuf    = (int **) PetscMalloc(len); CHKPTRQ(rbuf);
  rbuf[0] = (int *) (rbuf + nrqr);
  for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits1);
  for (i=0; i<nrqr; ++i) {
    ierr = MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  len    = 2*size*sizeof(int *) + (size+msz)*sizeof(int);
  outdat = (int **) PetscMalloc(len); CHKPTRQ(outdat);
  ptr    = outdat + size; /* Pointers to the data in outgoing buffers */
  PetscMemzero(outdat,2*size*sizeof(int *));
  tmp    = (int *) (outdat + 2*size);
  ctr    = tmp + msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    int  *d_p;
    char *t_p;

    len   = (imax)*(sizeof(BT) + sizeof(int *) + sizeof(int)) +
            (Mbs)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char) + 1;
    table = (BT *) PetscMalloc(len); CHKPTRQ(table);
    PetscMemzero(table,len);
    data  = (int **) (table + imax);
    isz   = (int *) (data + imax);
    d_p   = (int *) (isz + imax);
    t_p   = (char *) (d_p + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/BITSPERBYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }

  /* Parse the IS and update the local tables and the outgoing buf with the data */
  {
    int n_i,*data_i,isz_i,*outdat_j,ctr_j;
    BT  table_i;

    for (i=0; i<imax; i++) {
      PetscMemzero(ctr,size*sizeof(int));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!BTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits1);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag,comm,s_waits1+i); CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data); CHKERRQ(ierr);
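  /*
     Note on the bit arrays (added for clarity): BTLookupSet(table,row)
     returns the previous state of bit `row' and then sets it, so the idiom

         if (!BTLookupSet(table_i,row)) { data_i[isz_i++] = row; }

     used above and below appends each block row to data_i exactly once,
     keeping the list duplicate free without any searching.
  */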
  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); CHKPTRQ(recv_status);
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&index,recv_status+i); CHKERRQ(ierr);
    }

    s_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(s_status);
    ierr     = MPI_Waitall(nrqs,s_waits1,s_status); CHKERRQ(ierr);
  }

  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);

  xdata = (int **) PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *) PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1);
  ierr  = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1); CHKERRQ(ierr);
  PetscFree(rbuf);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    int *rw1,*rw2;

    rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero(rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    ierr = MPI_Allreduce(rw1,rw2,size,MPI_INT,MPI_MAX,comm); CHKERRQ(ierr);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers (possibly none needed if nrqs == 0) */
  len      = (nrqs+1)*sizeof(int *) + nrqs*bsz1*sizeof(int);
  rbuf2    = (int **) PetscMalloc(len); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) (rbuf2 + nrqs);
  for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Post the receives */
  r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits2);
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Irecv(rbuf2[i],bsz1,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits2+i); CHKERRQ(ierr);
  }

  /* Now post the sends */
  s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits2);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag,comm,s_waits2+i); CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    int        index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    BT         table_i;
    MPI_Status *status2;

    status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(status2);

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&index,status2+i); CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[index];
      ct1     = 2*rbuf2_i[0] + 1;
      jmax    = rbuf2[index][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!BTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits2,status2); CHKERRQ(ierr);
    PetscFree(status2);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i); CHKERRQ(ierr);
  }

  PetscFree(pa);
  PetscFree(rbuf2);
  PetscFree(s_waits1);
  PetscFree(r_waits1);
  PetscFree(s_waits2);
  PetscFree(r_waits2);
  PetscFree(table);
  PetscFree(s_status);
  PetscFree(recv_status);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  PetscFree(isz1);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C     - MAT_MPIBAIJ;
    imax  - total no of index sets processed at a time;
    table - an array of char - size = Mbs bits.

   Output:
    isz  - array containing the count of the solution elements corresponding
           to each index set;
    data - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,BT *table,int *isz,int **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *) B->data;
  int         start,end,val,max,rstart,cstart,*ai,*aj;
  int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  BT          table_i;

  PetscFunctionBegin;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!BTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!BTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
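/*
   Worked example (added for illustration): suppose this processor owns
   block rows [rstart,rend) and block row r (local index r-rstart) has
   block columns {0,3} in the diagonal part A and off-processor global
   block columns {7,9} recorded through garray for the B part.  Processing
   r above adds cstart+0, cstart+3, 7 and 9 to the index set, each at most
   once via the bit array, which is exactly one level of overlap for r.
*/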
#undef __FUNC__
#define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Receive"
/*
   MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows, as it is now where a single large chunk of
   memory is used.
*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int *isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *) B->data;
  int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int         val,max1,max2,rank,Mbs,no_malloc = 0,*tmp,new_estimate,ctr;
  int         *rbuf_i,kmax,rbuf_0,ierr;
  BT          xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  max1         = ct*(a->nz + b->nz)/c->Mbs;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1) + 1);
  xdata[0]     = (int *) PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  ierr         = BTCreate(Mbs,xtable); CHKERRQ(ierr);
  PetscMemzero(isz1,nrqr*sizeof(int));
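  /*
     Growth strategy (added note): mem_estimate is only a guess -- three
     times the larger of the total size requested and ct*(a->nz+b->nz)/Mbs,
     an average-row-density estimate.  Whenever ct3 reaches mem_estimate
     below, the single chunk xdata[0] is reallocated at 1.5 times its size,
     the data copied over, and the xdata[i] pointers rebuilt from isz1[];
     no_malloc counts how often this happens and is reported at the end.
  */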
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg received from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0 + 1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      BTMemzero(Mbs,xtable);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!BTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate) + 1;
            tmp          = (int *) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
            PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat */
          val = aj[l] + cstart;
          if (!BTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int *) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat */
          val = garray[bj[l]];
          if (!BTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int *) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  BTDestroy(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
  PetscFunctionReturn(0);
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat *);
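/*
   Usage sketch (added for illustration; the names follow the public PETSc
   interface, not this file): the routines below are what domain-
   decomposition preconditioners typically reach through, e.g.

       ierr = MatIncreaseOverlap(C,n_local,is,overlap); CHKERRQ(ierr);
       ierr = MatGetSubMatrices(C,n_local,is,is,MAT_INITIAL_MATRIX,&submat); CHKERRQ(ierr);

   which grows each index set by `overlap' levels and then extracts the
   corresponding sequential submatrices.
*/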
#undef __FUNC__
#define __FUNC__ "MatGetSubMatrices_MPIBAIJ"
int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatGetSubMatrixCall scall,Mat **submat)
{
  IS          *isrow_new,*iscol_new;
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  int         nmax,nstages_local,nstages,i,pos,max_no,ierr;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided: it does not flag
     errors and might change the indices, hence potentially buggy */

  isrow_new = (IS *) PetscMalloc(2*ismax*sizeof(IS)); CHKPTRQ(isrow_new);
  iscol_new = isrow_new + ismax;
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new); CHKERRQ(ierr);
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new); CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    *submat = (Mat *) PetscMalloc((ismax+1)*sizeof(Mat)); CHKPTRQ(*submat);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (c->Nbs * sizeof(int));
  if (nmax == 0) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm); CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax - pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos); CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]); CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]); CHKERRQ(ierr);
  }
  PetscFree(isrow_new);
  PetscFunctionReturn(0);
}
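/*
   Staging arithmetic (added example): nmax caps how many index sets are
   processed per stage so that the per-set workspace of c->Nbs integers
   (such as the cmap arrays in the local routine) stays around 20 MB.
   For instance, with Nbs = 1,000,000 block columns and 4-byte ints,
   nmax = 20*1000000/4000000 = 5, so ismax = 12 gives nstages_local = 3;
   the MPI_MAX reduction then makes every processor execute the same
   number of stages, since each stage contains collective calls.
*/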
/* -------------------------------------------------------------------------*/
#undef __FUNC__
#define __FUNC__ "MatGetSubMatrices_MPIBAIJ_local"
static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatGetSubMatrixCall scall,Mat *submats)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data;
  Mat         A = c->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *) c->B->data,*mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int         **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
  int         nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int         *rmap_i,bs = c->bs,bs2 = c->bs2,*a_j = a->j,*b_j = b->j,*cworkA,*cworkB;
  int         cstart = c->cstart,nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
  int         *bmap = c->garray,ctmp,rstart = c->rstart,tag0,tag1,tag2,tag3;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
  Scalar      *a_a = a->a,*b_a = b->a;

  PetscFunctionBegin;
  comm = C->comm;
  tag0 = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],(PetscTruth*)&j); CHKERRQ(ierr);
    if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
  }

  len    = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int);
  irow   = (int **) PetscMalloc(len); CHKPTRQ(irow);
  icol   = irow + ismax;
  nrow   = (int *) (icol + ismax);
  ncol   = nrow + ismax;
  rtable = ncol + ismax;

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr);
    ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr);
    ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = c->rowners[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1 = (int *) PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;   /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for (i=0; i<ismax; i++) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length for all procs */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  pa = (int *) PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1,*rw2;
    rw1  = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2  = rw1 + size;
    ierr = MPI_Allreduce(w1,rw1,size,MPI_INT,MPI_MAX,comm); CHKERRQ(ierr);
    bsz  = rw1[rank];
    ierr = MPI_Allreduce(w2,rw2,size,MPI_INT,MPI_SUM,comm); CHKERRQ(ierr);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
  /* Allocate memory for recv buffers (possibly none needed if nrqr == 0) */
  len      = (nrqr+1)*sizeof(int *) + nrqr*bsz*sizeof(int);
  rbuf1    = (int **) PetscMalloc(len); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) (rbuf1 + nrqr);
  for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits1);
  for (i=0; i<nrqr; ++i) {
    ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i); CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  len   = 2*size*sizeof(int *) + 2*msz*sizeof(int) + size*sizeof(int);
  sbuf1 = (int **) PetscMalloc(len); CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size; /* Pointers to the data in outgoing buffers */
  PetscMemzero(sbuf1,2*size*sizeof(int *));
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp   = (int *) (ptr + size);
  ctr   = tmp + 2*msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits1);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i); CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits2);
  rbuf2    = (int **) PetscMalloc((nrqs+1)*sizeof(int *)); CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i); CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  s_waits2   = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits2);
  r_status1  = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); CHKPTRQ(r_status1);
  len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int *);
  sbuf2      = (int **) PetscMalloc(len); CHKPTRQ(sbuf2);
  req_size   = (int *) (sbuf2 + nrqr);
  req_source = req_size + nrqr;
  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ *) c->A->data,*sB = (Mat_SeqBAIJ *) c->B->data;
    int         *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr            = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i); CHKERRQ(ierr);
      req_size[index] = 0;
      rbuf1_i         = rbuf1[index];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end); CHKERRQ(ierr);
      sbuf2[index]    = (int *) PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]);
      sbuf2_i         = sbuf2[index];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[index] += ncols;
      }
      req_source[index] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[index];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); CHKERRQ(ierr);
    }
  }
  PetscFree(r_status1); PetscFree(r_waits1);

  /* recv buffer sizes */
  /* Receive messages */
  rbuf3     = (int **) PetscMalloc((nrqs+1)*sizeof(int *)); CHKPTRQ(rbuf3);
  rbuf4     = (Scalar **) PetscMalloc((nrqs+1)*sizeof(Scalar *)); CHKPTRQ(rbuf4);
  r_waits3  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits3);
  r_waits4  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(r_waits4);
  r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(r_status2);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&index,r_status2+i); CHKERRQ(ierr);
    rbuf3[index] = (int *) PetscMalloc(rbuf2[index][0]*sizeof(int)); CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *) PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar)); CHKPTRQ(rbuf4[index]);
    ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,
                     r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index); CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_SCALAR,
                     r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index); CHKERRQ(ierr);
  }
  PetscFree(r_status2); PetscFree(r_waits2);

  /* Wait on sends1 and sends2 */
  s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(s_status1);
  s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); CHKPTRQ(s_status2);

  ierr = MPI_Waitall(nrqs,s_waits1,s_status1); CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits2,s_status2); CHKERRQ(ierr);
  PetscFree(s_status1); PetscFree(s_status2);
  PetscFree(s_waits1); PetscFree(s_waits2);

  /* Now allocate buffers for a->j, and send them off */
  sbuf_aj = (int **) PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aj[0] = (int *) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits3);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)     cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l]   = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i); CHKERRQ(ierr);
    }
  }
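  /*
     Column-merge example (added for illustration): the three loops above
     emit the global block columns of a row in ascending order.  Say
     cstart = 10, the diagonal part A of the row has local block columns
     {0,3}, and the off-diagonal part B has global block columns (through
     bmap) {2,7,15}.  The first loop copies {2,7} (those below cstart),
     the second appends {10,13} (cstart plus A's columns), and the third
     appends the remaining {15}, giving {2,7,10,13,15}.
  */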
  r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(r_status3);
  s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); CHKPTRQ(s_status3);

  /* Allocate buffers for a->a, and send them off */
  sbuf_aa = (Scalar **) PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aa[0] = (Scalar *) PetscMalloc((j+1)*bs2*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); CHKPTRQ(s_waits4);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));
            }
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar));
          }
          for (l=imark; l<nzB; l++) {
            PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i); CHKERRQ(ierr);
    }
  }
  r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); CHKPTRQ(r_status4);
  s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); CHKPTRQ(s_status4);
  PetscFree(rbuf1);

  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;

    len     = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int);
    cmap    = (int **) PetscMalloc(len); CHKPTRQ(cmap);
    cmap[0] = (int *) (cmap + ismax);
    PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }
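  /*
     cmap semantics (added note): cmap[i] maps a global block column to
     1 + its local column position in submatrix i, with 0 meaning "not in
     iscol[i]".  E.g. if iscol[i] selects global block columns {3,7}, then
     cmap[i][3] = 1 and cmap[i][7] = 2, so the assembly loops below can
     test membership and translate in one lookup, decoding with tcol-1.
  */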
  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int *) + j*sizeof(int);
  lens    = (int **) PetscMalloc(len); CHKPTRQ(lens);
  lens[0] = (int *) (lens + ismax);
  PetscMemzero(lens[0],j*sizeof(int));
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row = row - rstart;
        nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
      }
    }
  }

  /* Create row map */
  len     = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int);
  rmap    = (int **) PetscMalloc(len); CHKPTRQ(rmap);
  rmap[0] = (int *) (rmap + ismax);
  PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs; }
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2); CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  PetscFree(r_status3); PetscFree(r_waits3);
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3); CHKERRQ(ierr);
  PetscFree(s_status3); PetscFree(s_waits3);
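  /*
     rmap and lens (added note): rmap[i] is the inverse of isrow[i] -- it
     maps a global block row back to its position in the submatrix -- and
     lens[i] now holds, for every submatrix row, the exact number of block
     columns that survive the iscol[i] filter.  That lets the
     MatCreateSeqBAIJ call below preallocate each row exactly, so the
     assembly loops never trigger additional mallocs.
  */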
  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *) (submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i]) || (mat->bs != bs)) {
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size");
      }
      if (PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(int))) {
        SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialize matrix as if empty */
      PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i); CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int    ilen_row,*imat_ilen,*imat_j,*imat_i;
    Scalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ *) submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          row    = row - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          cworkA = a_j + a_i[row];
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2;
          vworkB = b_a + b_i[row]*bs2;

          row      = rmap_i[row + rstart];
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
              if ((tcol = cmap_i[ctmp])) {
                *mat_j++ = tcol - 1;
                PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
              *mat_j++ = tcol - 1;
              PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
              *mat_j++ = tcol - 1;
              PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar)); mat_a += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows */
  {
    int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    int    *imat_j,*imat_i;
    Scalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2); CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqBAIJ *) submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar)); mat_a += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  PetscFree(r_status4); PetscFree(r_waits4);
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4); CHKERRQ(ierr);
  PetscFree(s_waits4); PetscFree(s_status4);
  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i); CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i); CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  PetscFree(irow);
  PetscFree(w1);
  PetscFree(pa);

  PetscFree(sbuf1);
  PetscFree(rbuf2);
  for (i=0; i<nrqr; ++i) {
    PetscFree(sbuf2[i]);
  }
  for (i=0; i<nrqs; ++i) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree(sbuf2);
  PetscFree(rbuf3);
  PetscFree(rbuf4);
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);

  PetscFree(cmap);
  PetscFree(rmap);
  PetscFree(lens);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }

  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);

  PetscFunctionReturn(0);
}