/*$Id: baijov.c,v 1.44 2000/04/09 04:36:25 bsmith Exp bsmith $*/

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "bitarray.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);
extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);

#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatCompressIndicesGeneral_MPIBAIJ"
static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
  int         ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
  PetscBT     table;

  PetscFunctionBegin;
  Nbs  = baij->Nbs;
  nidx = (int*)PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx);
  ierr = PetscBTCreate(Nbs,table);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    isz  = 0;
    ierr = PetscBTMemzero(Nbs,table);CHKERRQ(ierr);
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
    for (j=0; j<n; j++) {
      ival = idx[j]/bs; /* convert the indices into block indices */
      if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
      if (!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival; }
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr);
  }
  ierr = PetscBTDestroy(table);CHKERRQ(ierr);
  ierr = PetscFree(nidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatCompressIndicesSorted_MPIBAIJ"
static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
  int         ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local;
  PetscTruth  flg;

  PetscFunctionBegin;
  for (i=0; i<imax; i++) {
    ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr);
    if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Indices are not sorted");
  }
  nidx = (int*)PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx);
  /* Now check if the indices are in block order */
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
    if (n%bs != 0) SETERRA(1,0,"Indices are not block ordered");

    n         = n/bs; /* The reduced index size */
    idx_local = idx;
    for (j=0; j<n; j++) {
      val = idx_local[0];
      if (val%bs != 0) SETERRA(1,0,"Indices are not block ordered");
      for (k=0; k<bs; k++) {
        if (val+k != idx_local[k]) SETERRA(1,0,"Indices are not block ordered");
      }
      nidx[j]    = val/bs;
      idx_local += bs;
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr);
  }
  ierr = PetscFree(nidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
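/*
   For illustration (hypothetical numbers): with block size bs = 2 the
   point-index set {4,5,10,11} is compressed by the routines above into the
   block-index set {2,5} (idx[j]/bs), and MatExpandIndices_MPIBAIJ below
   performs the inverse expansion, {2,5} -> {4,5,10,11}
   (idx[j]*bs+k for k = 0,...,bs-1).
*/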
#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatExpandIndices_MPIBAIJ"
static int MatExpandIndices_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
  int         ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;

  PetscFunctionBegin;
  Nbs = baij->Nbs;

  nidx = (int*)PetscMalloc((Nbs*bs+1)*sizeof(int));CHKPTRQ(nidx);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
    for (j=0; j<n; ++j) {
      for (k=0; k<bs; k++) nidx[j*bs+k] = idx[j]*bs+k;
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);CHKERRQ(ierr);
  }
  ierr = PetscFree(nidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ"
int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS *is,int ov)
{
  int i,ierr;
  IS  *is_new;

  PetscFunctionBegin;
  is_new = (IS *)PetscMalloc(imax*sizeof(IS));CHKPTRQ(is_new);
  /* Convert the indices into block format */
  ierr = MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new);CHKERRQ(ierr);
  if (ov < 0) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified\n"); }
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) { ierr = ISDestroy(is[i]);CHKERRQ(ierr); }
  ierr = MatExpandIndices_MPIBAIJ(C,imax,is_new,is);CHKERRQ(ierr);
  for (i=0; i<imax; i++) { ierr = ISDestroy(is_new[i]);CHKERRQ(ierr); }
  ierr = PetscFree(is_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5]
  mesg [0] = 2    (no of index sets in the mesg)
  -----------
  mesg [1] = 1    => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5    => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  ...
  mesg [n]   data(is[1])
  -----------
  mesg [n+1]
  ...
  mesg [m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Once"
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS *is)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
  int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2;
  PetscBT     *table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  PetscFunctionBegin;
  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  len    = (imax+1)*sizeof(int*) + (imax + Mbs)*sizeof(int);
  idx    = (int**)PetscMalloc(len);CHKPTRQ(idx);
  n      = (int*)(idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }
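  /*
     For illustration (hypothetical sizes): with two processors and
     c->rowners = {0,4,9}, block rows 0..3 are owned by process 0 and block
     rows 4..8 by process 1, so rtable = {0,0,0,0,1,1,1,1,1}.
  */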
  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1   = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */
  w2   = w1 + size;      /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;      /* no of IS that needs to be sent to proc i */
  w4   = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) {
        SETERRQ(1,1,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1;
    rw1  = (int*)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
    ierr = MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
    bsz  = rw1[rank];
    nrqr = rw1[size+rank];
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
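  /* Note: the PetscMaxSum_Op reduction combines a max over the first size
     entries (the message lengths in w1) with a sum over the second size
     entries (the 0/1 flags in w2); after the reduction bsz is therefore the
     length of the longest message this process will receive and nrqr is the
     number of processes that will send to it. */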
  /* Allocate memory for recv buffers (none may be needed if nrqr = 0) */
  len     = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf    = (int**)PetscMalloc(len);CHKPTRQ(rbuf);
  rbuf[0] = (int*)(rbuf + nrqr);
  for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
  for (i=0; i<nrqr; ++i) {
    ierr = MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  outdat = (int **)PetscMalloc(len);CHKPTRQ(outdat);
  ptr    = outdat + size;     /* Pointers to the data in outgoing buffers */
  ierr   = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
  tmp    = (int*)(outdat + 2*size);
  ctr    = tmp + msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    int  *d_p;
    char *t_p;

    len   = (imax)*(sizeof(PetscBT) + sizeof(int*) + sizeof(int)) +
            (Mbs)*imax*sizeof(int) + (Mbs/BITSPERBYTE+1)*imax*sizeof(char) + 1;
    table = (PetscBT *)PetscMalloc(len);CHKPTRQ(table);
    ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
    data  = (int **)(table + imax);
    isz   = (int *)(data + imax);
    d_p   = (int *)(isz + imax);
    t_p   = (char *)(d_p + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/BITSPERBYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree(idx);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(recv_status);
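    /* drain the phase-1 receives in completion order; the statuses are kept
       so that MPI_SOURCE can later be used to address the replies */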
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&index,recv_status+i);CHKERRQ(ierr);
    }

    s_status = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status);
    ierr     = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
  }

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree(outdat);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);

  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int));CHKPTRQ(isz1);
  ierr  = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr  = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    int *rw1,*rw2;

    rw1  = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
    ierr = PetscMemzero(rw1,2*size*sizeof(int));CHKERRQ(ierr);
    rw2  = rw1 + size;
    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    ierr = MPI_Allreduce(rw1,rw2,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    bsz1 = rw2[rank];
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers (none may be needed if nrqs = 0) */
  len      = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int);
  rbuf2    = (int**)PetscMalloc(len);CHKPTRQ(rbuf2);
  rbuf2[0] = (int*)(rbuf2 + nrqs);
  for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Post the receives */
  r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Irecv(rbuf2[i],bsz1,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Now post the sends */
  s_waits2 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    int        index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT    table_i;
    MPI_Status *status2;

    status2 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2);

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&index,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[index];
      ct1     = 2*rbuf2_i[0] + 1;
      jmax    = rbuf2[index][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIBAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = Mbs bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         start,end,val,max,rstart,cstart,*ai,*aj;
  int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT     table_i;

  PetscFunctionBegin;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Receive"
/*
    MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
    and return the output

     Input:
       C    - the matrix
       nrqr - no of messages being processed.
       rbuf - an array of pointers to the received requests

     Output:
       xdata - array of messages to be sent back
       isz1  - size of each message

    For better efficiency perhaps we should malloc separately each xdata[i],
    then if a realloc is required we need only copy the data for that one row
    rather than all previous rows, as it is now where a single large chunk of
    memory is used.

*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int *isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int         val,max1,max2,rank,Mbs,no_malloc = 0,*tmp,new_estimate,ctr;
  int         *rbuf_i,kmax,rbuf_0,ierr;
  PetscBT     xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (c->Mbs) max1 = ct*(a->nz + b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
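  /* mem_estimate is a heuristic size for the single reply buffer xdata[0]:
     roughly the number of requested block rows (ct) times the average number
     of nonzero blocks per block row, compared against the total request size,
     then tripled; if it is exceeded the buffer is grown below and no_malloc
     counts how often that happens */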
  xdata[0] = (int*)PetscMalloc(mem_estimate*sizeof(int));CHKPTRQ(xdata[0]);
  ++no_malloc;
  ierr = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate)+1;
            tmp          = (int*)PetscMalloc(new_estimate*sizeof(int));CHKPTRQ(tmp);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*)PetscMalloc(new_estimate*sizeof(int));CHKPTRQ(tmp);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*)PetscMalloc(new_estimate*sizeof(int));CHKPTRQ(tmp);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
  PetscFunctionReturn(0);
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *);
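/*
   For illustration only (a hedged usage sketch, not part of this file's
   interface): MatGetSubMatrices_MPIBAIJ below is normally reached through the
   public MatGetSubMatrices() interface, e.g.

       Mat *submat;
       ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);

   where isrow/iscol are sorted, block-aligned index sets.
*/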
#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatGetSubMatrices_MPIBAIJ"
int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
{
  IS          *isrow_new,*iscol_new;
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         nmax,nstages_local,nstages,i,pos,max_no,ierr;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided. It does not flag
     errors, and it may change the indices, hence it is buggy. */

  isrow_new = (IS *)PetscMalloc(2*(ismax+1)*sizeof(IS));CHKPTRQ(isrow_new);
  iscol_new = isrow_new + ismax;
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new);CHKERRQ(ierr);
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new);CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (c->Nbs * sizeof(int));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isrow_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNC__
#define __FUNC__ /*<a name=""></a>*/"MatGetSubMatrices_MPIBAIJ_local"
static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int         **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
  int         nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int         *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
  int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
  int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  MatScalar   **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
  MatScalar   *a_a=a->a,*b_a=b->a;
  PetscTruth  flag;

  PetscFunctionBegin;
  comm = C->comm;
  tag0 = C->tag;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr);
    if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
  }

  len = (2*ismax+1)*(sizeof(int*) + sizeof(int)) + (Mbs+1)*sizeof(int);
  irow   = (int **)PetscMalloc(len);CHKPTRQ(irow);
  icol   = irow + ismax;
  nrow   = (int*)(icol + ismax);
  ncol   = nrow + ismax;
  rtable = ncol + ismax;

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = c->rowners[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1   = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */
  w2   = w1 + size;      /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;      /* no of IS that needs to be sent to proc i */
  w4   = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1;
    rw1  = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
    ierr = MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
    bsz  = rw1[rank];
    nrqr = rw1[size+rank];
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Allocate memory for recv buffers (none may be needed if nrqr = 0) */
  len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf1    = (int**)PetscMalloc(len);CHKPTRQ(rbuf1);
  rbuf1[0] = (int*)(rbuf1 + nrqr);
  for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
  for (i=0; i<nrqr; ++i) {
    ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  len   = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  sbuf1 = (int **)PetscMalloc(len);CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
  ierr  = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp = (int*)(ptr + size);
  ctr = tmp + 2*msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the buffer size */
  r_waits2 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
  rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  s_waits2   = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
  r_status1  = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1);
  len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  sbuf2      = (int**)PetscMalloc(len);CHKPTRQ(sbuf2);
  req_size   = (int*)(sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
    int         *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);CHKERRQ(ierr);
      req_size[index] = 0;
      rbuf1_i         = rbuf1[index];
      start           = 2*rbuf1_i[0] + 1;
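      /* the receive was posted with the maximum length bsz; MPI_Get_count
         gives the number of integers actually received, i.e. the true length
         of this request */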
      ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
      sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]);
      sbuf2_i      = sbuf2[index];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[index] += ncols;
      }
      req_source[index] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[index];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  rbuf3     = (int**)PetscMalloc((nrqs+1)*sizeof(int*));CHKPTRQ(rbuf3);
  rbuf4     = (MatScalar**)PetscMalloc((nrqs+1)*sizeof(MatScalar*));CHKPTRQ(rbuf4);
  r_waits3  = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3);
  r_waits4  = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4);
  r_status2 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);CHKERRQ(ierr);
    rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (MatScalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(MatScalar));CHKPTRQ(rbuf4[index]);
    ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,
                     r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_MATSCALAR,
                     r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  s_status1 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1);
  s_status2 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2);

  ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aj[0] = (int*)PetscMalloc((j+1)*sizeof(int));CHKPTRQ(sbuf_aj[0]);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  s_waits3 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)     cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l]   = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
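      /* sbuf_aj_i now holds, for every requested block row, its global block
         column indices in increasing order (off-diagonal columns below cstart,
         then the diagonal block, then the remaining off-diagonal columns);
         ship them back to the requesting process */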
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  r_status3 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3);
  s_status3 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3);

  /* Allocate buffers for a->a, and send them off */
  sbuf_aa = (MatScalar **)PetscMalloc((nrqr+1)*sizeof(MatScalar *));CHKPTRQ(sbuf_aa);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aa[0] = (MatScalar*)PetscMalloc((j+1)*bs2*sizeof(MatScalar));CHKPTRQ(sbuf_aa[0]);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  s_waits4 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          for (l=imark; l<nzB; l++) {
            ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  r_status4 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4);
  s_status4 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;

    len     = (1+ismax)*sizeof(int*) + ismax*c->Nbs*sizeof(int);
    cmap    = (int **)PetscMalloc(len);CHKPTRQ(cmap);
    cmap[0] = (int *)(cmap + ismax);
    ierr    = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }
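  /* cmap[i] maps a global block column to (local column + 1) in submatrix i;
     an entry of 0 means the column is not part of that submatrix. For
     illustration (hypothetical indices): if iscol[i] = {3,7} then
     cmap[i][3] = 1, cmap[i][7] = 2 and every other entry is 0. */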
  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int*) + j*sizeof(int);
  lens    = (int **)PetscMalloc(len);CHKPTRQ(lens);
  lens[0] = (int *)(lens + ismax);
  ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row    = row - rstart;
        nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
      }
    }
  }

  /* Create row map */
  len     = (1+ismax)*sizeof(int*) + ismax*c->Mbs*sizeof(int);
  rmap    = (int **)PetscMalloc(len);CHKPTRQ(rmap);
  rmap[0] = (int *)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs; }
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(int),&flag);CHKERRQ(ierr);
      if (flag == PETSC_FALSE) {
        SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialize the matrix as if it were empty */
      ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int       ilen_row,*imat_ilen,*imat_j,*imat_i;
    MatScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          row    = row - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          cworkA = a_j + a_i[row];
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2;
          vworkB = b_a + b_i[row]*bs2;

          row      = rmap_i[row + rstart];
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
              if ((tcol = cmap_i[ctmp])) {
                *mat_j++ = tcol - 1;
                ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
                mat_a   += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows */
  {
    int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    int       *imat_j,*imat_i;
    MatScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              /* *mat_a++ = rbuf4_i[ct2]; */
              ierr   = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(irow);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree(sbuf1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(sbuf2);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}