1 #ifdef PETSC_RCS_HEADER 2 static char vcid[] = "$Id: baijov.c,v 1.36 1999/09/27 21:30:07 bsmith Exp bsmith $"; 3 #endif 4 5 /* 6 Routines to compute overlapping regions of a parallel MPI matrix 7 and to find submatrices that were shared across processors. 8 */ 9 #include "src/mat/impls/baij/mpi/mpibaij.h" 10 #include "bitarray.h" 11 12 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat, int, IS *); 13 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat , int , char **,int*, int**); 14 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat , int, int **, int**, int* ); 15 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 16 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**); 17 18 #undef __FUNC__ 19 #define __FUNC__ "MatCompressIndicesGeneral_MPIBAIJ" 20 static int MatCompressIndicesGeneral_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 21 { 22 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 23 int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival; 24 BT table; 25 26 PetscFunctionBegin; 27 Nbs = baij->Nbs; 28 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx); 29 ierr = BTCreate(Nbs,table);CHKERRQ(ierr); 30 31 for (i=0; i<imax; i++) { 32 isz = 0; 33 ierr = BTMemzero(Nbs,table);CHKERRQ(ierr); 34 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 36 for (j=0; j<n ; j++) { 37 ival = idx[j]/bs; /* convert the indices into block indices */ 38 if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim"); 39 if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;} 40 } 41 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 42 ierr = ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, (is_out+i));CHKERRQ(ierr); 43 } 44 ierr = BTDestroy(table);CHKERRQ(ierr); 45 ierr = PetscFree(nidx);CHKERRQ(ierr); 46 PetscFunctionReturn(0); 47 } 48 49 #undef __FUNC__ 50 #define __FUNC__ "MatCompressIndicesSorted_MPIBAIJ" 51 static int MatCompressIndicesSorted_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 52 { 53 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 54 int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local; 55 PetscTruth flg; 56 57 PetscFunctionBegin; 58 for (i=0; i<imax; i++) { 59 ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr); 60 if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Indices are not sorted"); 61 } 62 nidx = (int *) PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx); 63 /* Now check if the indices are in block order */ 64 for (i=0; i<imax; i++) { 65 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 66 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 67 if ( n%bs !=0 ) SETERRA(1,0,"Indices are not block ordered"); 68 69 n = n/bs; /* The reduced index size */ 70 idx_local = idx; 71 for (j=0; j<n ; j++) { 72 val = idx_local[0]; 73 if (val%bs != 0) SETERRA(1,0,"Indices are not block ordered"); 74 for (k=0; k<bs; k++) { 75 if ( val+k != idx_local[k]) SETERRA(1,0,"Indices are not block ordered"); 76 } 77 nidx[j] = val/bs; 78 idx_local +=bs; 79 } 80 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 81 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr); 82 } 83 ierr = PetscFree(nidx);CHKERRQ(ierr); 84 PetscFunctionReturn(0); 85 } 86 87 #undef __FUNC__ 88 #define __FUNC__ "MatExpandIndices_MPIBAIJ" 89 static int MatExpandIndices_MPIBAIJ(Mat C, int imax, IS *is_in, IS *is_out) 90 { 91 Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) C->data; 92 int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx; 93 94 PetscFunctionBegin; 95 Nbs = baij->Nbs; 96 97 nidx = (int *) PetscMalloc((Nbs*bs+1)*sizeof(int));CHKPTRQ(nidx); 98 99 for ( i=0; i<imax; i++ ) { 100 ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 101 ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr); 102 for (j=0; j<n ; ++j){ 103 for (k=0; k<bs; k++) 104 nidx[j*bs+k] = idx[j]*bs+k; 105 } 106 ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 107 ierr = ISCreateGeneral(PETSC_COMM_SELF, n*bs, nidx, (is_out+i));CHKERRQ(ierr); 108 } 109 ierr = PetscFree(nidx);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 113 114 #undef __FUNC__ 115 #define __FUNC__ "MatIncreaseOverlap_MPIBAIJ" 116 int MatIncreaseOverlap_MPIBAIJ(Mat C, int imax, IS *is, int ov) 117 { 118 int i, ierr; 119 IS *is_new; 120 121 PetscFunctionBegin; 122 is_new = (IS *)PetscMalloc(imax*sizeof(IS));CHKPTRQ(is_new); 123 /* Convert the indices into block format */ 124 ierr = MatCompressIndicesGeneral_MPIBAIJ(C, imax, is,is_new);CHKERRQ(ierr); 125 if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified\n");} 126 for (i=0; i<ov; ++i) { 127 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C, imax, is_new);CHKERRQ(ierr); 128 } 129 for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);} 130 ierr = MatExpandIndices_MPIBAIJ(C, imax, is_new,is);CHKERRQ(ierr); 131 for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);} 132 ierr = PetscFree(is_new);CHKERRQ(ierr); 133 PetscFunctionReturn(0); 134 } 135 136 /* 137 Sample message format: 138 If a processor A wants processor B to process some elements corresponding 139 to index sets 1s[1], is[5] 140 mesg [0] = 2 ( no of index sets in the mesg) 141 ----------- 142 mesg [1] = 1 => is[1] 143 mesg [2] = sizeof(is[1]); 144 ----------- 145 mesg [5] = 5 => is[5] 146 mesg [6] = sizeof(is[5]); 147 ----------- 148 mesg [7] 149 mesg [n] datas[1] 150 ----------- 151 mesg[n+1] 152 mesg[m] data(is[5]) 153 ----------- 154 155 Notes: 156 nrqs - no of requests sent (or to be sent out) 157 nrqr - no of requests recieved (which have to be or which have been processed 158 */ 159 #undef __FUNC__ 160 #define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Once" 161 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C, int imax, IS *is) 162 { 163 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 164 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 165 int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 166 int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 167 BT *table; 168 MPI_Comm comm; 169 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 170 MPI_Status *s_status,*recv_status; 171 172 PetscFunctionBegin; 173 comm = C->comm; 174 tag = C->tag; 175 size = c->size; 176 rank = c->rank; 177 Mbs = c->Mbs; 178 179 len = (imax+1)*sizeof(int *) + (imax + Mbs)*sizeof(int); 180 idx = (int **) PetscMalloc(len);CHKPTRQ(idx); 181 n = (int *) (idx + imax); 182 rtable = n + imax; 183 184 for (i=0; i<imax; i++) { 185 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 186 ierr = ISGetSize(is[i],&n[i]);CHKERRQ(ierr); 187 } 188 189 /* Create hash table for the mapping :row -> proc*/ 190 for (i=0,j=0; i<size; i++) { 191 len = c->rowners[i+1]; 192 for (; j<len; j++) { 193 rtable[j] = i; 194 } 195 } 196 197 /* evaluate communication - mesg to who, length of mesg, and buffer space 198 required. Based on this, buffers are allocated, and data copied into them*/ 199 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 200 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 201 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 202 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 203 ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 204 for (i=0; i<imax; i++) { 205 ierr = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 206 idx_i = idx[i]; 207 len = n[i]; 208 for (j=0; j<len; j++) { 209 row = idx_i[j]; 210 if (row < 0) { 211 SETERRQ(1,1,"Index set cannot have negative entries"); 212 } 213 proc = rtable[row]; 214 w4[proc]++; 215 } 216 for (j=0; j<size; j++){ 217 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 218 } 219 } 220 221 nrqs = 0; /* no of outgoing messages */ 222 msz = 0; /* total mesg length (for all proc */ 223 w1[rank] = 0; /* no mesg sent to intself */ 224 w3[rank] = 0; 225 for ( i=0; i<size; i++) { 226 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 227 } 228 /* pa - is list of processors to communicate with */ 229 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 230 for (i=0,j=0; i<size; i++) { 231 if (w1[i]) {pa[j] = i; j++;} 232 } 233 234 /* Each message would have a header = 1 + 2*(no of IS) + data */ 235 for (i=0; i<nrqs; i++) { 236 j = pa[i]; 237 w1[j] += w2[j] + 2*w3[j]; 238 msz += w1[j]; 239 } 240 241 242 /* Do a global reduction to determine how many messages to expect*/ 243 { 244 int *rw1, *rw2; 245 rw1 = (int *) PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1); 246 rw2 = rw1+size; 247 ierr = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr); 248 bsz = rw1[rank]; 249 ierr = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr); 250 nrqr = rw2[rank]; 251 ierr = PetscFree(rw1);CHKERRQ(ierr); 252 } 253 254 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 255 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 256 rbuf = (int**) PetscMalloc(len);CHKPTRQ(rbuf); 257 rbuf[0] = (int *) (rbuf + nrqr); 258 for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz; 259 260 /* Post the receives */ 261 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1); 262 for (i=0; i<nrqr; ++i) { 263 ierr = MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);CHKERRQ(ierr); 264 } 265 266 /* Allocate Memory for outgoing messages */ 267 len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 268 outdat = (int **)PetscMalloc(len);CHKPTRQ(outdat); 269 ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 270 ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr); 271 tmp = (int *) (outdat + 2*size); 272 ctr = tmp + msz; 273 274 { 275 int *iptr = tmp,ict = 0; 276 for (i=0; i<nrqs; i++) { 277 j = pa[i]; 278 iptr += ict; 279 outdat[j] = iptr; 280 ict = w1[j]; 281 } 282 } 283 284 /* Form the outgoing messages */ 285 /*plug in the headers*/ 286 for (i=0; i<nrqs; i++) { 287 j = pa[i]; 288 outdat[j][0] = 0; 289 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr); 290 ptr[j] = outdat[j] + 2*w3[j] + 1; 291 } 292 293 /* Memory for doing local proc's work*/ 294 { 295 int *d_p; 296 char *t_p; 297 298 len = (imax)*(sizeof(BT) + sizeof(int *) + sizeof(int)) + 299 (Mbs)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char) + 1; 300 table = (BT *)PetscMalloc(len);CHKPTRQ(table); 301 ierr = PetscMemzero(table,len);CHKERRQ(ierr); 302 data = (int **)(table + imax); 303 isz = (int *)(data + imax); 304 d_p = (int *)(isz + imax); 305 t_p = (char *)(d_p + Mbs*imax); 306 for (i=0; i<imax; i++) { 307 table[i] = t_p + (Mbs/BITSPERBYTE+1)*i; 308 data[i] = d_p + (Mbs)*i; 309 } 310 } 311 312 /* Parse the IS and update local tables and the outgoing buf with the data*/ 313 { 314 int n_i,*data_i,isz_i,*outdat_j,ctr_j; 315 BT table_i; 316 317 for (i=0; i<imax; i++) { 318 ierr = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr); 319 n_i = n[i]; 320 table_i = table[i]; 321 idx_i = idx[i]; 322 data_i = data[i]; 323 isz_i = isz[i]; 324 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 325 row = idx_i[j]; 326 proc = rtable[row]; 327 if (proc != rank) { /* copy to the outgoing buffer */ 328 ctr[proc]++; 329 *ptr[proc] = row; 330 ptr[proc]++; 331 } 332 else { /* Update the local table */ 333 if (!BTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 334 } 335 } 336 /* Update the headers for the current IS */ 337 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 338 if ((ctr_j = ctr[j])) { 339 outdat_j = outdat[j]; 340 k = ++outdat_j[0]; 341 outdat_j[2*k] = ctr_j; 342 outdat_j[2*k-1] = i; 343 } 344 } 345 isz[i] = isz_i; 346 } 347 } 348 349 350 351 /* Now post the sends */ 352 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1); 353 for (i=0; i<nrqs; ++i) { 354 j = pa[i]; 355 ierr = MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);CHKERRQ(ierr); 356 } 357 358 /* No longer need the original indices*/ 359 for (i=0; i<imax; ++i) { 360 ierr = ISRestoreIndices(is[i], idx+i);CHKERRQ(ierr); 361 } 362 ierr = PetscFree(idx);CHKERRQ(ierr); 363 364 for (i=0; i<imax; ++i) { 365 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 366 } 367 368 /* Do Local work*/ 369 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 370 371 /* Receive messages*/ 372 { 373 int index; 374 375 recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(recv_status); 376 for (i=0; i<nrqr; ++i) { 377 ierr = MPI_Waitany(nrqr, r_waits1, &index, recv_status+i);CHKERRQ(ierr); 378 } 379 380 s_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status); 381 ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr); 382 } 383 384 /* Phase 1 sends are complete - deallocate buffers */ 385 ierr = PetscFree(outdat);CHKERRQ(ierr); 386 ierr = PetscFree(w1);CHKERRQ(ierr); 387 388 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(xdata); 389 isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int));CHKPTRQ(isz1); 390 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 391 ierr = PetscFree(rbuf);CHKERRQ(ierr); 392 393 /* Send the data back*/ 394 /* Do a global reduction to know the buffer space req for incoming messages*/ 395 { 396 int *rw1, *rw2; 397 398 rw1 = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1); 399 ierr = PetscMemzero(rw1,2*size*sizeof(int));CHKERRQ(ierr); 400 rw2 = rw1+size; 401 for (i=0; i<nrqr; ++i) { 402 proc = recv_status[i].MPI_SOURCE; 403 rw1[proc] = isz1[i]; 404 } 405 406 ierr = MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr); 407 bsz1 = rw2[rank]; 408 ierr = PetscFree(rw1);CHKERRQ(ierr); 409 } 410 411 /* Allocate buffers*/ 412 413 /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 414 len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 415 rbuf2 = (int**) PetscMalloc(len);CHKPTRQ(rbuf2); 416 rbuf2[0] = (int *) (rbuf2 + nrqs); 417 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1; 418 419 /* Post the receives */ 420 r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2); 421 for (i=0; i<nrqs; ++i) { 422 ierr = MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i);CHKERRQ(ierr); 423 } 424 425 /* Now post the sends */ 426 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2); 427 for (i=0; i<nrqr; ++i) { 428 j = recv_status[i].MPI_SOURCE; 429 ierr = MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i);CHKERRQ(ierr); 430 } 431 432 /* receive work done on other processors*/ 433 { 434 int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 435 BT table_i; 436 MPI_Status *status2; 437 438 status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 439 440 for (i=0; i<nrqs; ++i) { 441 ierr = MPI_Waitany(nrqs, r_waits2, &index, status2+i);CHKERRQ(ierr); 442 /* Process the message*/ 443 rbuf2_i = rbuf2[index]; 444 ct1 = 2*rbuf2_i[0]+1; 445 jmax = rbuf2[index][0]; 446 for (j=1; j<=jmax; j++) { 447 max = rbuf2_i[2*j]; 448 is_no = rbuf2_i[2*j-1]; 449 isz_i = isz[is_no]; 450 data_i = data[is_no]; 451 table_i = table[is_no]; 452 for (k=0; k<max; k++,ct1++) { 453 row = rbuf2_i[ct1]; 454 if (!BTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 455 } 456 isz[is_no] = isz_i; 457 } 458 } 459 ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr); 460 ierr = PetscFree(status2); CHKERRQ(ierr); 461 } 462 463 for (i=0; i<imax; ++i) { 464 ierr = ISCreateGeneral(PETSC_COMM_SELF, isz[i], data[i], is+i);CHKERRQ(ierr); 465 } 466 467 ierr = PetscFree(pa);CHKERRQ(ierr); 468 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 469 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 470 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 471 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 472 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 473 ierr = PetscFree(table);CHKERRQ(ierr); 474 ierr = PetscFree(s_status);CHKERRQ(ierr); 475 ierr = PetscFree(recv_status);CHKERRQ(ierr); 476 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 477 ierr = PetscFree(xdata);CHKERRQ(ierr); 478 ierr = PetscFree(isz1);CHKERRQ(ierr); 479 PetscFunctionReturn(0); 480 } 481 482 #undef __FUNC__ 483 #define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Local" 484 /* 485 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 486 the work on the local processor. 487 488 Inputs: 489 C - MAT_MPIBAIJ; 490 imax - total no of index sets processed at a time; 491 table - an array of char - size = Mbs bits. 492 493 Output: 494 isz - array containing the count of the solution elements correspondign 495 to each index set; 496 data - pointer to the solutions 497 */ 498 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,BT *table,int *isz, 499 int **data) 500 { 501 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 502 Mat A = c->A, B = c->B; 503 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 504 int start, end, val, max, rstart,cstart,*ai, *aj; 505 int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 506 BT table_i; 507 508 PetscFunctionBegin; 509 rstart = c->rstart; 510 cstart = c->cstart; 511 ai = a->i; 512 aj = a->j; 513 bi = b->i; 514 bj = b->j; 515 garray = c->garray; 516 517 518 for (i=0; i<imax; i++) { 519 data_i = data[i]; 520 table_i = table[i]; 521 isz_i = isz[i]; 522 for (j=0, max=isz[i]; j<max; j++) { 523 row = data_i[j] - rstart; 524 start = ai[row]; 525 end = ai[row+1]; 526 for (k=start; k<end; k++) { /* Amat */ 527 val = aj[k] + cstart; 528 if (!BTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 529 } 530 start = bi[row]; 531 end = bi[row+1]; 532 for (k=start; k<end; k++) { /* Bmat */ 533 val = garray[bj[k]]; 534 if (!BTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 535 } 536 } 537 isz[i] = isz_i; 538 } 539 PetscFunctionReturn(0); 540 } 541 #undef __FUNC__ 542 #define __FUNC__ "MatIncreaseOverlap_MPIBAIJ_Receive" 543 /* 544 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 545 and return the output 546 547 Input: 548 C - the matrix 549 nrqr - no of messages being processed. 550 rbuf - an array of pointers to the recieved requests 551 552 Output: 553 xdata - array of messages to be sent back 554 isz1 - size of each message 555 556 For better efficiency perhaps we should malloc seperately each xdata[i], 557 then if a remalloc is required we need only copy the data for that one row 558 rather then all previous rows as it is now where a single large chunck of 559 memory is used. 560 561 */ 562 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata, int * isz1) 563 { 564 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 565 Mat A = c->A, B = c->B; 566 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 567 int rstart,cstart,*ai, *aj, *bi, *bj, *garray, i, j, k; 568 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 569 int val, max1, max2, rank, Mbs, no_malloc =0, *tmp, new_estimate, ctr; 570 int *rbuf_i,kmax,rbuf_0,ierr; 571 BT xtable; 572 573 PetscFunctionBegin; 574 rank = c->rank; 575 Mbs = c->Mbs; 576 rstart = c->rstart; 577 cstart = c->cstart; 578 ai = a->i; 579 aj = a->j; 580 bi = b->i; 581 bj = b->j; 582 garray = c->garray; 583 584 585 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 586 rbuf_i = rbuf[i]; 587 rbuf_0 = rbuf_i[0]; 588 ct += rbuf_0; 589 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 590 } 591 592 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 593 else max1 = 1; 594 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 595 xdata[0] = (int *) PetscMalloc(mem_estimate*sizeof(int));CHKPTRQ(xdata[0]); 596 ++no_malloc; 597 ierr = BTCreate(Mbs,xtable);CHKERRQ(ierr); 598 ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr); 599 600 ct3 = 0; 601 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 602 rbuf_i = rbuf[i]; 603 rbuf_0 = rbuf_i[0]; 604 ct1 = 2*rbuf_0+1; 605 ct2 = ct1; 606 ct3 += ct1; 607 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 608 ierr = BTMemzero(Mbs,xtable);CHKERRQ(ierr); 609 oct2 = ct2; 610 kmax = rbuf_i[2*j]; 611 for (k=0; k<kmax; k++, ct1++) { 612 row = rbuf_i[ct1]; 613 if (!BTLookupSet(xtable,row)) { 614 if (!(ct3 < mem_estimate)) { 615 new_estimate = (int)(1.5*mem_estimate)+1; 616 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 617 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 618 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 619 xdata[0] = tmp; 620 mem_estimate = new_estimate; ++no_malloc; 621 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 622 } 623 xdata[i][ct2++] = row; 624 ct3++; 625 } 626 } 627 for (k=oct2,max2=ct2; k<max2; k++) { 628 row = xdata[i][k] - rstart; 629 start = ai[row]; 630 end = ai[row+1]; 631 for (l=start; l<end; l++) { 632 val = aj[l] + cstart; 633 if (!BTLookupSet(xtable,val)) { 634 if (!(ct3 < mem_estimate)) { 635 new_estimate = (int)(1.5*mem_estimate)+1; 636 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 637 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 638 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 639 xdata[0] = tmp; 640 mem_estimate = new_estimate; ++no_malloc; 641 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 642 } 643 xdata[i][ct2++] = val; 644 ct3++; 645 } 646 } 647 start = bi[row]; 648 end = bi[row+1]; 649 for (l=start; l<end; l++) { 650 val = garray[bj[l]]; 651 if (!BTLookupSet(xtable,val)) { 652 if (!(ct3 < mem_estimate)) { 653 new_estimate = (int)(1.5*mem_estimate)+1; 654 tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 655 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 656 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 657 xdata[0] = tmp; 658 mem_estimate = new_estimate; ++no_malloc; 659 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 660 } 661 xdata[i][ct2++] = val; 662 ct3++; 663 } 664 } 665 } 666 /* Update the header*/ 667 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 668 xdata[i][2*j-1] = rbuf_i[2*j-1]; 669 } 670 xdata[i][0] = rbuf_0; 671 xdata[i+1] = xdata[i] + ct2; 672 isz1[i] = ct2; /* size of each message */ 673 } 674 ierr = BTDestroy(xtable);CHKERRQ(ierr); 675 PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc); 676 PetscFunctionReturn(0); 677 } 678 679 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *); 680 681 #undef __FUNC__ 682 #define __FUNC__ "MatGetSubMatrices_MPIBAIJ" 683 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 684 MatReuse scall,Mat **submat) 685 { 686 IS *isrow_new,*iscol_new; 687 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 688 int nmax,nstages_local,nstages,i,pos,max_no,ierr; 689 690 PetscFunctionBegin; 691 /* The compression and expansion should be avoided. Does'nt point 692 out errors might change the indices hence buggey */ 693 694 isrow_new = (IS *)PetscMalloc(2*(ismax+1)*sizeof(IS));CHKPTRQ(isrow_new); 695 iscol_new = isrow_new + ismax; 696 ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, isrow,isrow_new);CHKERRQ(ierr); 697 ierr = MatCompressIndicesSorted_MPIBAIJ(C, ismax, iscol,iscol_new);CHKERRQ(ierr); 698 699 /* Allocate memory to hold all the submatrices */ 700 if (scall != MAT_REUSE_MATRIX) { 701 *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat); 702 } 703 /* Determine the number of stages through which submatrices are done */ 704 nmax = 20*1000000 / (c->Nbs * sizeof(int)); 705 if (nmax == 0) nmax = 1; 706 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 707 708 /* Make sure every porcessor loops through the nstages */ 709 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr); 710 711 for ( i=0,pos=0; i<nstages; i++ ) { 712 if (pos+nmax <= ismax) max_no = nmax; 713 else if (pos == ismax) max_no = 0; 714 else max_no = ismax-pos; 715 ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr); 716 pos += max_no; 717 } 718 719 for (i=0; i<ismax; i++) { 720 ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr); 721 ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr); 722 } 723 ierr = PetscFree(isrow_new);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 /* -------------------------------------------------------------------------*/ 728 #undef __FUNC__ 729 #define __FUNC__ "MatGetSubMatrices_MPIBAIJ_local" 730 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol, 731 MatReuse scall,Mat *submats) 732 { 733 Mat_MPIBAIJ *c = (Mat_MPIBAIJ *) C->data; 734 Mat A = c->A; 735 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data, *b = (Mat_SeqBAIJ*)c->B->data, *mat; 736 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 737 int **sbuf1,**sbuf2, rank, Mbs,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 738 int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr; 739 int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 740 int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 741 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 742 int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA, *cworkB; 743 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 744 int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3; 745 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 746 MPI_Request *r_waits4,*s_waits3,*s_waits4; 747 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 748 MPI_Status *r_status3,*r_status4,*s_status4; 749 MPI_Comm comm; 750 Scalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 751 Scalar *a_a=a->a,*b_a=b->a; 752 753 PetscFunctionBegin; 754 comm = C->comm; 755 tag0 = C->tag; 756 size = c->size; 757 rank = c->rank; 758 Mbs = c->Mbs; 759 760 /* Get some new tags to keep the communication clean */ 761 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 762 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 763 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 764 765 /* Check if the col indices are sorted */ 766 for (i=0; i<ismax; i++) { 767 ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr); 768 if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted"); 769 } 770 771 len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (Mbs+1)*sizeof(int); 772 irow = (int **)PetscMalloc(len);CHKPTRQ(irow); 773 icol = irow + ismax; 774 nrow = (int *) (icol + ismax); 775 ncol = nrow + ismax; 776 rtable = ncol + ismax; 777 778 for (i=0; i<ismax; i++) { 779 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 780 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 781 ierr = ISGetSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 782 ierr = ISGetSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 783 } 784 785 /* Create hash table for the mapping :row -> proc*/ 786 for (i=0,j=0; i<size; i++) { 787 jmax = c->rowners[i+1]; 788 for (; j<jmax; j++) { 789 rtable[j] = i; 790 } 791 } 792 793 /* evaluate communication - mesg to who, length of mesg, and buffer space 794 required. Based on this, buffers are allocated, and data copied into them*/ 795 w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 796 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 797 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 798 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 799 ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 800 for (i=0; i<ismax; i++) { 801 ierr = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/ 802 jmax = nrow[i]; 803 irow_i = irow[i]; 804 for (j=0; j<jmax; j++) { 805 row = irow_i[j]; 806 proc = rtable[row]; 807 w4[proc]++; 808 } 809 for (j=0; j<size; j++) { 810 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 811 } 812 } 813 814 nrqs = 0; /* no of outgoing messages */ 815 msz = 0; /* total mesg length for all proc */ 816 w1[rank] = 0; /* no mesg sent to intself */ 817 w3[rank] = 0; 818 for (i=0; i<size; i++) { 819 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 820 } 821 pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 822 for (i=0, j=0; i<size; i++) { 823 if (w1[i]) { pa[j] = i; j++; } 824 } 825 826 /* Each message would have a header = 1 + 2*(no of IS) + data */ 827 for (i=0; i<nrqs; i++) { 828 j = pa[i]; 829 w1[j] += w2[j] + 2* w3[j]; 830 msz += w1[j]; 831 } 832 /* Do a global reduction to determine how many messages to expect*/ 833 { 834 int *rw1, *rw2; 835 rw1 = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1); 836 rw2 = rw1+size; 837 ierr = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr); 838 bsz = rw1[rank]; 839 ierr = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr); 840 nrqr = rw2[rank]; 841 ierr = PetscFree(rw1);CHKERRQ(ierr); 842 } 843 844 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 845 len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 846 rbuf1 = (int**) PetscMalloc(len);CHKPTRQ(rbuf1); 847 rbuf1[0] = (int *) (rbuf1 + nrqr); 848 for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 849 850 /* Post the receives */ 851 r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1); 852 for (i=0; i<nrqr; ++i) { 853 ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr); 854 } 855 856 /* Allocate Memory for outgoing messages */ 857 len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 858 sbuf1 = (int **)PetscMalloc(len);CHKPTRQ(sbuf1); 859 ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 860 ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr); 861 /* allocate memory for outgoing data + buf to receive the first reply */ 862 tmp = (int *) (ptr + size); 863 ctr = tmp + 2*msz; 864 865 { 866 int *iptr = tmp,ict = 0; 867 for (i=0; i<nrqs; i++) { 868 j = pa[i]; 869 iptr += ict; 870 sbuf1[j] = iptr; 871 ict = w1[j]; 872 } 873 } 874 875 /* Form the outgoing messages */ 876 /* Initialise the header space */ 877 for (i=0; i<nrqs; i++) { 878 j = pa[i]; 879 sbuf1[j][0] = 0; 880 ierr = PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));CHKERRQ(ierr); 881 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 882 } 883 884 /* Parse the isrow and copy data into outbuf */ 885 for (i=0; i<ismax; i++) { 886 ierr = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr); 887 irow_i = irow[i]; 888 jmax = nrow[i]; 889 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 890 row = irow_i[j]; 891 proc = rtable[row]; 892 if (proc != rank) { /* copy to the outgoing buf*/ 893 ctr[proc]++; 894 *ptr[proc] = row; 895 ptr[proc]++; 896 } 897 } 898 /* Update the headers for the current IS */ 899 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 900 if ((ctr_j = ctr[j])) { 901 sbuf1_j = sbuf1[j]; 902 k = ++sbuf1_j[0]; 903 sbuf1_j[2*k] = ctr_j; 904 sbuf1_j[2*k-1] = i; 905 } 906 } 907 } 908 909 /* Now post the sends */ 910 s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1); 911 for (i=0; i<nrqs; ++i) { 912 j = pa[i]; 913 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 914 ierr = MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i);CHKERRQ(ierr); 915 } 916 917 /* Post Recieves to capture the buffer size */ 918 r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2); 919 rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 920 rbuf2[0] = tmp + msz; 921 for (i=1; i<nrqs; ++i) { 922 j = pa[i]; 923 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 924 } 925 for (i=0; i<nrqs; ++i) { 926 j = pa[i]; 927 ierr = MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i);CHKERRQ(ierr); 928 } 929 930 /* Send to other procs the buf size they should allocate */ 931 932 933 /* Receive messages*/ 934 s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2); 935 r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1); 936 len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 937 sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 938 req_size = (int *) (sbuf2 + nrqr); 939 req_source = req_size + nrqr; 940 941 { 942 Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*) c->A->data, *sB = (Mat_SeqBAIJ*) c->B->data; 943 int *sAi = sA->i, *sBi = sB->i, id, *sbuf2_i; 944 945 for (i=0; i<nrqr; ++i) { 946 ierr = MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);CHKERRQ(ierr); 947 req_size[index] = 0; 948 rbuf1_i = rbuf1[index]; 949 start = 2*rbuf1_i[0] + 1; 950 ierr = MPI_Get_count(r_status1+i,MPI_INT, &end);CHKERRQ(ierr); 951 sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 952 sbuf2_i = sbuf2[index]; 953 for (j=start; j<end; j++) { 954 id = rbuf1_i[j] - rstart; 955 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 956 sbuf2_i[j] = ncols; 957 req_size[index] += ncols; 958 } 959 req_source[index] = r_status1[i].MPI_SOURCE; 960 /* form the header */ 961 sbuf2_i[0] = req_size[index]; 962 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 963 ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);CHKERRQ(ierr); 964 } 965 } 966 ierr = PetscFree(r_status1);CHKERRQ(ierr); 967 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 968 969 /* recv buffer sizes */ 970 /* Receive messages*/ 971 972 rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*));CHKPTRQ(rbuf3); 973 rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 974 r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3); 975 r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4); 976 r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2); 977 978 for (i=0; i<nrqs; ++i) { 979 ierr = MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);CHKERRQ(ierr); 980 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));CHKPTRQ(rbuf3[index]); 981 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(Scalar));CHKPTRQ(rbuf4[index]); 982 ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 983 r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index);CHKERRQ(ierr); 984 ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2, MPIU_SCALAR, 985 r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index);CHKERRQ(ierr); 986 } 987 ierr = PetscFree(r_status2);CHKERRQ(ierr); 988 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 989 990 /* Wait on sends1 and sends2 */ 991 s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1); 992 s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2); 993 994 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 995 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 996 ierr = PetscFree(s_status1);CHKERRQ(ierr); 997 ierr = PetscFree(s_status2);CHKERRQ(ierr); 998 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 999 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1000 1001 /* Now allocate buffers for a->j, and send them off */ 1002 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 1003 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1004 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int));CHKPTRQ(sbuf_aj[0]); 1005 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1006 1007 s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3); 1008 { 1009 for (i=0; i<nrqr; i++) { 1010 rbuf1_i = rbuf1[i]; 1011 sbuf_aj_i = sbuf_aj[i]; 1012 ct1 = 2*rbuf1_i[0] + 1; 1013 ct2 = 0; 1014 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1015 kmax = rbuf1[i][2*j]; 1016 for (k=0; k<kmax; k++,ct1++) { 1017 row = rbuf1_i[ct1] - rstart; 1018 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1019 ncols = nzA + nzB; 1020 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1021 1022 /* load the column indices for this row into cols*/ 1023 cols = sbuf_aj_i + ct2; 1024 for (l=0; l<nzB; l++) { 1025 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 1026 else break; 1027 } 1028 imark = l; 1029 for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 1030 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 1031 ct2 += ncols; 1032 } 1033 } 1034 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1035 } 1036 } 1037 r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3); 1038 s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3); 1039 1040 /* Allocate buffers for a->a, and send them off */ 1041 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 1042 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1043 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*bs2*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 1044 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1045 1046 s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4); 1047 { 1048 for (i=0; i<nrqr; i++) { 1049 rbuf1_i = rbuf1[i]; 1050 sbuf_aa_i = sbuf_aa[i]; 1051 ct1 = 2*rbuf1_i[0]+1; 1052 ct2 = 0; 1053 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1054 kmax = rbuf1_i[2*j]; 1055 for (k=0; k<kmax; k++,ct1++) { 1056 row = rbuf1_i[ct1] - rstart; 1057 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1058 ncols = nzA + nzB; 1059 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1060 vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 1061 1062 /* load the column values for this row into vals*/ 1063 vals = sbuf_aa_i+ct2*bs2; 1064 for (l=0; l<nzB; l++) { 1065 if ((bmap[cworkB[l]]) < cstart) { 1066 ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1067 } 1068 else break; 1069 } 1070 imark = l; 1071 for (l=0; l<nzA; l++) { 1072 ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1073 } 1074 for (l=imark; l<nzB; l++) { 1075 ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1076 } 1077 ct2 += ncols; 1078 } 1079 } 1080 ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1081 } 1082 } 1083 r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4); 1084 s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4); 1085 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1086 1087 /* Form the matrix */ 1088 /* create col map */ 1089 { 1090 int *icol_i; 1091 1092 len = (1+ismax)*sizeof(int *) + ismax*c->Nbs*sizeof(int); 1093 cmap = (int **)PetscMalloc(len);CHKPTRQ(cmap); 1094 cmap[0] = (int *)(cmap + ismax); 1095 ierr = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr); 1096 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 1097 for (i=0; i<ismax; i++) { 1098 jmax = ncol[i]; 1099 icol_i = icol[i]; 1100 cmap_i = cmap[i]; 1101 for (j=0; j<jmax; j++) { 1102 cmap_i[icol_i[j]] = j+1; 1103 } 1104 } 1105 } 1106 1107 1108 /* Create lens which is required for MatCreate... */ 1109 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1110 len = (1+ismax)*sizeof(int *) + j*sizeof(int); 1111 lens = (int **)PetscMalloc(len);CHKPTRQ(lens); 1112 lens[0] = (int *)(lens + ismax); 1113 ierr = PetscMemzero(lens[0], j*sizeof(int));CHKERRQ(ierr); 1114 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1115 1116 /* Update lens from local data */ 1117 for (i=0; i<ismax; i++) { 1118 jmax = nrow[i]; 1119 cmap_i = cmap[i]; 1120 irow_i = irow[i]; 1121 lens_i = lens[i]; 1122 for (j=0; j<jmax; j++) { 1123 row = irow_i[j]; 1124 proc = rtable[row]; 1125 if (proc == rank) { 1126 /* Get indices from matA and then from matB */ 1127 row = row - rstart; 1128 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1129 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1130 for (k=0; k<nzA; k++) { 1131 if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;} 1132 } 1133 for (k=0; k<nzB; k++) { 1134 if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;} 1135 } 1136 } 1137 } 1138 } 1139 1140 /* Create row map*/ 1141 len = (1+ismax)*sizeof(int *) + ismax*c->Mbs*sizeof(int); 1142 rmap = (int **)PetscMalloc(len);CHKPTRQ(rmap); 1143 rmap[0] = (int *)(rmap + ismax); 1144 ierr = PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));CHKERRQ(ierr); 1145 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;} 1146 for (i=0; i<ismax; i++) { 1147 rmap_i = rmap[i]; 1148 irow_i = irow[i]; 1149 jmax = nrow[i]; 1150 for (j=0; j<jmax; j++) { 1151 rmap_i[irow_i[j]] = j; 1152 } 1153 } 1154 1155 /* Update lens from offproc data */ 1156 { 1157 int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1158 1159 for (tmp2=0; tmp2<nrqs; tmp2++) { 1160 ierr = MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);CHKERRQ(ierr); 1161 index = pa[i]; 1162 sbuf1_i = sbuf1[index]; 1163 jmax = sbuf1_i[0]; 1164 ct1 = 2*jmax+1; 1165 ct2 = 0; 1166 rbuf2_i = rbuf2[i]; 1167 rbuf3_i = rbuf3[i]; 1168 for (j=1; j<=jmax; j++) { 1169 is_no = sbuf1_i[2*j-1]; 1170 max1 = sbuf1_i[2*j]; 1171 lens_i = lens[is_no]; 1172 cmap_i = cmap[is_no]; 1173 rmap_i = rmap[is_no]; 1174 for (k=0; k<max1; k++,ct1++) { 1175 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1176 max2 = rbuf2_i[ct1]; 1177 for (l=0; l<max2; l++,ct2++) { 1178 if (cmap_i[rbuf3_i[ct2]]) { 1179 lens_i[row]++; 1180 } 1181 } 1182 } 1183 } 1184 } 1185 } 1186 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1187 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1188 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1189 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1190 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1191 1192 /* Create the submatrices */ 1193 if (scall == MAT_REUSE_MATRIX) { 1194 /* 1195 Assumes new rows are same length as the old rows, hence bug! 1196 */ 1197 for (i=0; i<ismax; i++) { 1198 mat = (Mat_SeqBAIJ *)(submats[i]->data); 1199 if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) { 1200 SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size"); 1201 } 1202 if (PetscMemcmp(mat->ilen,lens[i], mat->mbs *sizeof(int))) { 1203 SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Cannot reuse matrix. wrong no of nonzeros"); 1204 } 1205 /* Initial matrix as if empty */ 1206 ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr); 1207 submats[i]->factor = C->factor; 1208 } 1209 } else { 1210 for (i=0; i<ismax; i++) { 1211 ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr); 1212 } 1213 } 1214 1215 /* Assemble the matrices */ 1216 /* First assemble the local rows */ 1217 { 1218 int ilen_row,*imat_ilen, *imat_j, *imat_i; 1219 Scalar *imat_a; 1220 1221 for (i=0; i<ismax; i++) { 1222 mat = (Mat_SeqBAIJ *) submats[i]->data; 1223 imat_ilen = mat->ilen; 1224 imat_j = mat->j; 1225 imat_i = mat->i; 1226 imat_a = mat->a; 1227 cmap_i = cmap[i]; 1228 rmap_i = rmap[i]; 1229 irow_i = irow[i]; 1230 jmax = nrow[i]; 1231 for (j=0; j<jmax; j++) { 1232 row = irow_i[j]; 1233 proc = rtable[row]; 1234 if (proc == rank) { 1235 row = row - rstart; 1236 nzA = a_i[row+1] - a_i[row]; 1237 nzB = b_i[row+1] - b_i[row]; 1238 cworkA = a_j + a_i[row]; 1239 cworkB = b_j + b_i[row]; 1240 vworkA = a_a + a_i[row]*bs2; 1241 vworkB = b_a + b_i[row]*bs2; 1242 1243 row = rmap_i[row + rstart]; 1244 mat_i = imat_i[row]; 1245 mat_a = imat_a + mat_i*bs2; 1246 mat_j = imat_j + mat_i; 1247 ilen_row = imat_ilen[row]; 1248 1249 /* load the column indices for this row into cols*/ 1250 for (l=0; l<nzB; l++) { 1251 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1252 if ((tcol = cmap_i[ctmp])) { 1253 *mat_j++ = tcol - 1; 1254 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1255 mat_a += bs2; 1256 ilen_row++; 1257 } 1258 } else break; 1259 } 1260 imark = l; 1261 for (l=0; l<nzA; l++) { 1262 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1263 *mat_j++ = tcol - 1; 1264 ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1265 mat_a += bs2; 1266 ilen_row++; 1267 } 1268 } 1269 for (l=imark; l<nzB; l++) { 1270 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1271 *mat_j++ = tcol - 1; 1272 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1273 mat_a += bs2; 1274 ilen_row++; 1275 } 1276 } 1277 imat_ilen[row] = ilen_row; 1278 } 1279 } 1280 1281 } 1282 } 1283 1284 /* Now assemble the off proc rows*/ 1285 { 1286 int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1287 int *imat_j,*imat_i; 1288 Scalar *imat_a,*rbuf4_i; 1289 1290 for (tmp2=0; tmp2<nrqs; tmp2++) { 1291 ierr = MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);CHKERRQ(ierr); 1292 index = pa[i]; 1293 sbuf1_i = sbuf1[index]; 1294 jmax = sbuf1_i[0]; 1295 ct1 = 2*jmax + 1; 1296 ct2 = 0; 1297 rbuf2_i = rbuf2[i]; 1298 rbuf3_i = rbuf3[i]; 1299 rbuf4_i = rbuf4[i]; 1300 for (j=1; j<=jmax; j++) { 1301 is_no = sbuf1_i[2*j-1]; 1302 rmap_i = rmap[is_no]; 1303 cmap_i = cmap[is_no]; 1304 mat = (Mat_SeqBAIJ *) submats[is_no]->data; 1305 imat_ilen = mat->ilen; 1306 imat_j = mat->j; 1307 imat_i = mat->i; 1308 imat_a = mat->a; 1309 max1 = sbuf1_i[2*j]; 1310 for (k=0; k<max1; k++, ct1++) { 1311 row = sbuf1_i[ct1]; 1312 row = rmap_i[row]; 1313 ilen = imat_ilen[row]; 1314 mat_i = imat_i[row]; 1315 mat_a = imat_a + mat_i*bs2; 1316 mat_j = imat_j + mat_i; 1317 max2 = rbuf2_i[ct1]; 1318 for (l=0; l<max2; l++,ct2++) { 1319 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1320 *mat_j++ = tcol - 1; 1321 /* *mat_a++= rbuf4_i[ct2]; */ 1322 ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(Scalar));CHKERRQ(ierr); 1323 mat_a += bs2; 1324 ilen++; 1325 } 1326 } 1327 imat_ilen[row] = ilen; 1328 } 1329 } 1330 } 1331 } 1332 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1333 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1334 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1335 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1336 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1337 1338 /* Restore the indices */ 1339 for (i=0; i<ismax; i++) { 1340 ierr = ISRestoreIndices(isrow[i], irow+i);CHKERRQ(ierr); 1341 ierr = ISRestoreIndices(iscol[i], icol+i);CHKERRQ(ierr); 1342 } 1343 1344 /* Destroy allocated memory */ 1345 ierr = PetscFree(irow);CHKERRQ(ierr); 1346 ierr = PetscFree(w1);CHKERRQ(ierr); 1347 ierr = PetscFree(pa);CHKERRQ(ierr); 1348 1349 ierr = PetscFree(sbuf1);CHKERRQ(ierr); 1350 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1351 for (i=0; i<nrqr; ++i) { 1352 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1353 } 1354 for (i=0; i<nrqs; ++i) { 1355 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1356 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1357 } 1358 1359 ierr = PetscFree(sbuf2);CHKERRQ(ierr); 1360 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1361 ierr = PetscFree(rbuf4 );CHKERRQ(ierr); 1362 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1363 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1364 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1365 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1366 1367 ierr = PetscFree(cmap);CHKERRQ(ierr); 1368 ierr = PetscFree(rmap);CHKERRQ(ierr); 1369 ierr = PetscFree(lens);CHKERRQ(ierr); 1370 1371 for (i=0; i<ismax; i++) { 1372 ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1373 ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1374 } 1375 1376 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1377 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1378 ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1379 1380 PetscFunctionReturn(0); 1381 } 1382 1383 1384