/*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "petscbt.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatCompressIndicesGeneral_MPIBAIJ"
static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
{
  Mat_MPIBAIJ        *baij = (Mat_MPIBAIJ*)C->data;
  int                ierr,isz,bs = baij->bs,n,i,j,*idx,ival;
#if defined (PETSC_USE_CTABLE)
  PetscTable         gid1_lid1;
  int                tt,gid1,*nidx;
  PetscTablePosition tpos;
#else
  int                Nbs,*nidx;
  PetscBT            table;
#endif

  PetscFunctionBegin;
#if defined (PETSC_USE_CTABLE)
  ierr = PetscTableCreate(baij->mbs,&gid1_lid1);CHKERRQ(ierr);
#else
  Nbs  = baij->Nbs;
  ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
  ierr = PetscBTCreate(Nbs,table);CHKERRQ(ierr);
#endif
  for (i=0; i<imax; i++) {
    isz = 0;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscTableRemoveAll(gid1_lid1);CHKERRQ(ierr);
#else
    ierr = PetscBTMemzero(Nbs,table);CHKERRQ(ierr);
#endif
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr);
    for (j=0; j<n; j++) {
      ival = idx[j]/bs; /* convert the indices into block indices */
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableFind(gid1_lid1,ival+1,&tt);CHKERRQ(ierr);
      if (!tt) {
        ierr = PetscTableAdd(gid1_lid1,ival+1,isz+1);CHKERRQ(ierr);
        isz++;
      }
#else
      if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
      if (!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival; }
#endif
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((isz+1)*sizeof(int),&nidx);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(gid1_lid1,&tpos);CHKERRQ(ierr);
    j = 0;
    while (tpos) {
      ierr = PetscTableGetNext(gid1_lid1,&tpos,&gid1,&tt);CHKERRQ(ierr);
      if (tt-- > isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than array-dim"); }
      nidx[tt] = gid1 - 1;
      j++;
    }
    if (j != isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"table error: j != isz"); }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr);
    ierr = PetscFree(nidx);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr);
#endif
  }
#if defined (PETSC_USE_CTABLE)
  ierr = PetscTableDelete(gid1_lid1);CHKERRQ(ierr);
#else
  ierr = PetscBTDestroy(table);CHKERRQ(ierr);
  ierr = PetscFree(nidx);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCompressIndicesSorted_MPIBAIJ"
static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
  int         ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,*idx_local;
  PetscTruth  flg;
#if defined (PETSC_USE_CTABLE)
  int         maxsz;
#else
  int         Nbs=baij->Nbs;
#endif
  PetscFunctionBegin;
  for (i=0; i<imax; i++) {
    ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr);
    if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted");
  }
#if defined (PETSC_USE_CTABLE)
  /* Now check max size */
  for (i=0,maxsz=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
    n = n/bs; /* the reduced index size */
    if (n > maxsz) maxsz = n;
  }
  ierr = PetscMalloc((maxsz+1)*sizeof(int),&nidx);CHKERRQ(ierr);
#else
  ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
#endif
  /* Now check if the indices are in block order */
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr);
    if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");

    n         = n/bs; /* the reduced index size */
    idx_local = idx;
    for (j=0; j<n; j++) {
      val = idx_local[0];
      if (val%bs != 0) SETERRQ(1,"Indices are not block ordered");
      for (k=0; k<bs; k++) {
        if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered");
      }
      nidx[j]    = val/bs;
      idx_local += bs;
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr);
  }
  ierr = PetscFree(nidx);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatExpandIndices_MPIBAIJ"
static int MatExpandIndices_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
  int         ierr,bs = baij->bs,n,i,j,k,*idx,*nidx;
#if defined (PETSC_USE_CTABLE)
  int         maxsz;
#else
  int         Nbs = baij->Nbs;
#endif

  PetscFunctionBegin;

#if defined (PETSC_USE_CTABLE)
  /* Now check max size */
  for (i=0,maxsz=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    if (n*bs > maxsz) maxsz = n*bs;
  }
  ierr = PetscMalloc((maxsz+1)*sizeof(int),&nidx);CHKERRQ(ierr);
#else
  ierr = PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr);
    for (j=0; j<n; ++j) {
      for (k=0; k<bs; k++) nidx[j*bs+k] = idx[j]*bs+k;
    }
    ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);CHKERRQ(ierr);
  }
  ierr = PetscFree(nidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
{
  int i,ierr;
  IS  *is_new;

  PetscFunctionBegin;
  ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
  /* Convert the indices into block format */
  ierr = MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new);CHKERRQ(ierr);
  if (ov < 0) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n"); }
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) { ierr = ISDestroy(is[i]);CHKERRQ(ierr); }
  ierr = MatExpandIndices_MPIBAIJ(C,imax,is_new,is);CHKERRQ(ierr);
  for (i=0; i<imax; i++) { ierr = ISDestroy(is_new[i]);CHKERRQ(ierr); }
  ierr = PetscFree(is_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
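
/*
   Example usage (a minimal sketch, not part of this file's build): the names
   A, nloc and idx below are assumed to be provided by the caller.  The index
   set holds point (unblocked) row indices; MatCompressIndicesGeneral_MPIBAIJ
   above converts them to block indices before the overlap is grown, and
   MatExpandIndices_MPIBAIJ expands them back.

      IS  is[1];
      int ierr;
      ierr = ISCreateGeneral(PETSC_COMM_SELF,nloc,idx,&is[0]);CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);   (two levels of overlap)
      ierr = ISDestroy(is[0]);CHKERRQ(ierr);
*/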

/*
   Sample message format:
   If a processor A wants processor B to process some elements corresponding
   to index sets is[1] and is[5]
   mesg [0] = 2   (no of index sets in the mesg)
   -----------
   mesg [1] = 1   => is[1]
   mesg [2] = sizeof(is[1]);
   -----------
   mesg [3] = 5   => is[5]
   mesg [4] = sizeof(is[5]);
   -----------
   mesg [5]
   ...
   mesg [n]       data(is[1])
   -----------
   mesg[n+1]
   ...
   mesg[m]        data(is[5])
   -----------

   Notes:
   nrqs - no of requests sent (or to be sent out)
   nrqr - no of requests received (which have to be or which have been processed)
*/
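
/*
   Illustrative example (values assumed, not taken from a real run): if this
   process asks process B to handle rows {3,7} for is[1] and rows {12,13,14}
   for is[5], the outgoing buffer reads

      mesg[] = { 2,              header: two index sets in this message
                 1, 2,           is[1], 2 rows
                 5, 3,           is[5], 3 rows
                 3, 7,           data for is[1]
                 12, 13, 14 }    data for is[5]

   so its total length is 1 + 2*(no of IS) + data = 10, which is exactly the
   per-process length accumulated in w1[] below.
*/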
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
  int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
  int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
  PetscBT     *table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  PetscFunctionBegin;
  comm = C->comm;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  len  = (imax+1)*sizeof(int*) + (imax + Mbs)*sizeof(int);
  ierr = PetscMalloc(len,&idx);CHKERRQ(ierr);
  n      = (int*)(idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
  w2   = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;   /* no of IS that needs to be sent to proc i */
  w4   = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) {
        SETERRQ(1,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
  ptr  = outdat + size;   /* Pointers to the data in outgoing buffers */
  ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
  tmp  = (int*)(outdat + 2*size);
  ctr  = tmp + msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    int  *d_p;
    char *t_p;

    len  = (imax)*(sizeof(PetscBT) + sizeof(int*) + sizeof(int)) +
           (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
    ierr = PetscMalloc(len,&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,len);CHKERRQ(ierr);
    data = (int **)(table + imax);
    isz  = (int *)(data + imax);
    d_p  = (int *)(isz + imax);
    t_p  = (char *)(d_p + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree(idx);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree(outdat);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    int *rw1;

    ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    int        idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT    table_i;
    MPI_Status *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
  }


  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C     - MAT_MPIBAIJ;
    imax  - total no of index sets processed at a time;
    table - an array of bit tables (PetscBT), each of size Mbs bits.

   Output:
    isz   - array containing the count of the solution elements corresponding
            to each index set;
    data  - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         start,end,val,max,rstart,cstart,*ai,*aj;
  int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT     table_i;

  PetscFunctionBegin;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
/*
   MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows, as it is now where a single large chunk of
   memory is used.

*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int *isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int         val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
  int         *rbuf_i,kmax,rbuf_0,ierr;
  PetscBT     xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (c->Mbs) max1 = ct*(a->nz + b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
  PetscFunctionReturn(0);
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  IS          *isrow_new,*iscol_new;
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         nmax,nstages_local,nstages,i,pos,max_no,ierr;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided. It does not detect
     errors and might change the indices, hence it is buggy. */

  ierr = PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);CHKERRQ(ierr);
  iscol_new = isrow_new + ismax;
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new);CHKERRQ(ierr);
  ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new);CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (c->Nbs * sizeof(int));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isrow_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined (PETSC_USE_CTABLE)
#undef __FUNCT__
#define __FUNCT__ "PetscGetProc"
int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
{
  int nGlobalNd = proc_gnode[numprocs];
  int fproc     = (int)((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);

  PetscFunctionBegin;
  /* if (fproc < 0) SETERRQ(1,"fproc < 0"); */
  if (fproc > numprocs) fproc = numprocs;
  while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
    if (gid < proc_gnode[fproc]) fproc--;
    else                         fproc++;
  }
  /* if (fproc < 0 || fproc >= numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); } */
  *proc = fproc;
  PetscFunctionReturn(0);
}
#endif
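
/*
   Worked example for PetscGetProc (values assumed): with numprocs = 3 and
   proc_gnode[] = {0,4,9,12}, the global block row gid = 10 gives the initial
   guess fproc = (int)(10*3/12.0 + 0.5) = 3; the while loop then walks down
   until proc_gnode[fproc] <= gid < proc_gnode[fproc+1], ending at fproc = 2,
   the owning process.
*/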
"MatGetSubMatrices_MPIBAIJ_local" 791 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 792 { 793 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 794 Mat A = c->A; 795 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat; 796 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size; 797 int **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc; 798 int nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr; 799 int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 800 int **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 801 int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 802 int bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB; 803 int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 804 int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3; 805 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 806 MPI_Request *r_waits4,*s_waits3,*s_waits4; 807 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 808 MPI_Status *r_status3,*r_status4,*s_status4; 809 MPI_Comm comm; 810 MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 811 MatScalar *a_a=a->a,*b_a=b->a; 812 PetscTruth flag; 813 int *onodes1,*olengths1; 814 815 #if defined (PETSC_USE_CTABLE) 816 int tt; 817 PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1; 818 #else 819 int **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs; 820 #endif 821 822 PetscFunctionBegin; 823 comm = C->comm; 824 tag0 = C->tag; 825 size = c->size; 826 rank = c->rank; 827 828 /* Get some new tags to keep the communication clean */ 829 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 830 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 831 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 832 833 /* Check if the col indices are sorted */ 834 for (i=0; i<ismax; i++) { 835 ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr); 836 if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 837 } 838 839 len = (2*ismax+1)*(sizeof(int*)+ sizeof(int)); 840 #if !defined (PETSC_USE_CTABLE) 841 len += (Mbs+1)*sizeof(int); 842 #endif 843 ierr = PetscMalloc(len,&irow);CHKERRQ(ierr); 844 icol = irow + ismax; 845 nrow = (int*)(icol + ismax); 846 ncol = nrow + ismax; 847 #if !defined (PETSC_USE_CTABLE) 848 rtable = ncol + ismax; 849 /* Create hash table for the mapping :row -> proc*/ 850 for (i=0,j=0; i<size; i++) { 851 jmax = c->rowners[i+1]; 852 for (; j<jmax; j++) { 853 rtable[j] = i; 854 } 855 } 856 #endif 857 858 for (i=0; i<ismax; i++) { 859 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 860 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 861 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 862 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 863 } 864 865 /* evaluate communication - mesg to who,length of mesg,and buffer space 866 required. 
  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
  w2   = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;   /* no of IS that needs to be sent to proc i */
  w4   = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  len  = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  ierr = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
  ptr  = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
  ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp  = (int*)(ptr + size);
  ctr  = tmp + 2*msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the buffer size */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  len  = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  ierr = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
  req_size   = (int*)(sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
    int         *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc(end*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
                     r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
                     r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)     cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l]   = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          for (l=imark; l<nzB; l++) {
            ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;
#if defined (PETSC_USE_CTABLE)
    /* Create col map */
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
    }
#else
    len     = (1+ismax)*sizeof(int*) + ismax*c->Nbs*sizeof(int);
    ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
    cmap[0] = (int *)(cmap + ismax);
    ierr    = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
#endif
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
#else
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
#endif
    }
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int*) + j*sizeof(int);
  ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
  lens[0] = (int *)(lens + ismax);
  ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lcol1_gcol1 = colmaps[i];
#else
    cmap_i = cmap[i];
#endif
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row    = row - rstart;
        nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
#if defined (PETSC_USE_CTABLE)
        for (k=0; k<nzA; k++) {
          ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
#else
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
#endif
      }
    }
  }
#if defined (PETSC_USE_CTABLE)
  /* Create row map */
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
  }
#else
  /* Create row map */
  len     = (1+ismax)*sizeof(int*) + ismax*Mbs*sizeof(int);
  ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
  rmap[0] = (int *)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs; }
#endif
  for (i=0; i<ismax; i++) {
    irow_i = irow[i];
    jmax   = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lrow1_grow1 = rowmaps[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
    }
#else
    rmap_i = rmap[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
#endif
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[i];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
#if defined (PETSC_USE_CTABLE)
        lcol1_gcol1 = colmaps[is_no];
        lrow1_grow1 = rowmaps[is_no];
#else
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
#endif
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
            if (tt) {
              lens_i[row]++;
            }
#else
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
#endif
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
       Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i]) || (mat->bs != bs)) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(int),&flag);CHKERRQ(ierr);
      if (flag == PETSC_FALSE) {
        SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialize the matrix as if empty */
      ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
      ierr = MatSeqBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
      ierr = MatSeqSBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int       ilen_row,*imat_ilen,*imat_j,*imat_i;
    MatScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      lrow1_grow1 = rowmaps[i];
#else
      cmap_i = cmap[i];
      rmap_i = rmap[i];
#endif
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
        ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
        proc = rtable[row];
#endif
        if (proc == rank) {
          row    = row - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          cworkA = a_j + a_i[row];
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2;
          vworkB = b_a + b_i[row]*bs2;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[row + rstart];
#endif
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
              if (tcol) {
#else
              if ((tcol = cmap_i[ctmp])) {
#endif
                *mat_j++ = tcol - 1;
                ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
                mat_a += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }

    }
  }

  /* Now assemble the off proc rows */
  {
    int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    int       *imat_j,*imat_i;
    MatScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[i];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for (j=1; j<=jmax; j++) {
        is_no = sbuf1_i[2*j-1];
#if defined (PETSC_USE_CTABLE)
        lrow1_grow1 = rowmaps[is_no];
        lcol1_gcol1 = colmaps[is_no];
#else
        rmap_i = rmap[is_no];
        cmap_i = cmap[is_no];
#endif
        mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
#endif
              *mat_j++ = tcol - 1;
              /* *mat_a++ = rbuf4_i[ct2]; */
              ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(irow);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree(sbuf1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(sbuf2);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDelete(rowmaps[i]);CHKERRQ(ierr);
    ierr = PetscTableDelete(colmaps[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(colmaps);CHKERRQ(ierr);
  ierr = PetscFree(rowmaps);CHKERRQ(ierr);
#else
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}
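
/*
   Example usage (a minimal sketch, not part of this file's build): extracting
   one sequential submatrix per process from an MPIBAIJ matrix A.  The names A,
   nloc, rows and cols are assumed to come from the caller; both index sets
   must be sorted and must address complete blocks, as checked by
   MatCompressIndicesSorted_MPIBAIJ above.

      IS  isrow,iscol;
      Mat *submat;
      int ierr;
      ierr = ISCreateGeneral(PETSC_COMM_SELF,nloc,rows,&isrow);CHKERRQ(ierr);
      ierr = ISCreateGeneral(PETSC_COMM_SELF,nloc,cols,&iscol);CHKERRQ(ierr);
      ierr = ISSort(isrow);CHKERRQ(ierr);
      ierr = ISSort(iscol);CHKERRQ(ierr);
      ierr = MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
      ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
      ierr = ISDestroy(isrow);CHKERRQ(ierr);
      ierr = ISDestroy(iscol);CHKERRQ(ierr);
*/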