1632d0f97SHong Zhang 2632d0f97SHong Zhang /* 3632d0f97SHong Zhang Routines to compute overlapping regions of a parallel MPI matrix. 4632d0f97SHong Zhang Used for finding submatrices that were shared across processors. 5632d0f97SHong Zhang */ 6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h> 7c6db04a5SJed Brown #include <petscbt.h> 8632d0f97SHong Zhang 913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat,PetscInt,IS*); 1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat,PetscInt*,PetscInt,PetscInt*,PetscBT*); 11632d0f97SHong Zhang 12*db41cccfSHong Zhang /* -------------------------------------------------------------------------*/ 13*db41cccfSHong Zhang /* This code is modified from MatGetSubMatrices_MPIBAIJ_local() */ 14*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 15*db41cccfSHong Zhang #undef __FUNCT__ 16*db41cccfSHong Zhang #define __FUNCT__ "PetscGetProc_ijonly" 17*db41cccfSHong Zhang PetscErrorCode PetscGetProc_ijonly(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank) 18*db41cccfSHong Zhang { 19*db41cccfSHong Zhang PetscInt nGlobalNd = proc_gnode[size]; 20*db41cccfSHong Zhang PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5))); 21*db41cccfSHong Zhang 22*db41cccfSHong Zhang PetscFunctionBegin; 23*db41cccfSHong Zhang if (fproc > size) fproc = size; 24*db41cccfSHong Zhang while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) { 25*db41cccfSHong Zhang if (row < proc_gnode[fproc]) fproc--; 26*db41cccfSHong Zhang else fproc++; 27*db41cccfSHong Zhang } 28*db41cccfSHong Zhang *rank = fproc; 29*db41cccfSHong Zhang PetscFunctionReturn(0); 30*db41cccfSHong Zhang } 31*db41cccfSHong Zhang #endif 32*db41cccfSHong Zhang 33*db41cccfSHong Zhang #undef __FUNCT__ 34*db41cccfSHong Zhang #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local_ijonly" 35*db41cccfSHong Zhang PetscErrorCode MatGetSubMatrices_MPIBAIJ_local_ijonly(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 36*db41cccfSHong Zhang { 37*db41cccfSHong Zhang Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 38*db41cccfSHong Zhang Mat A = c->A; 39*db41cccfSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat; 40*db41cccfSHong Zhang const PetscInt **irow,**icol,*irow_i; 41*db41cccfSHong Zhang PetscInt *nrow,*ncol,*w3,*w4,start; 42*db41cccfSHong Zhang PetscErrorCode ierr; 43*db41cccfSHong Zhang PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc; 44*db41cccfSHong Zhang PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row; 45*db41cccfSHong Zhang PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 46*db41cccfSHong Zhang PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 47*db41cccfSHong Zhang PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 48*db41cccfSHong Zhang PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 49*db41cccfSHong Zhang PetscInt *a_j=a->j,*b_j=b->j,*cworkA,*cworkB; //bs=C->rmap->bs,bs2=c->bs2 50*db41cccfSHong Zhang PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 51*db41cccfSHong Zhang PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs; 52*db41cccfSHong Zhang MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3; 53*db41cccfSHong Zhang //MPI_Request *r_waits4,*s_waits4; 54*db41cccfSHong Zhang MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3; 55*db41cccfSHong Zhang //MPI_Status *r_status4,*s_status4; 56*db41cccfSHong Zhang MPI_Comm comm; 57*db41cccfSHong Zhang //MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 58*db41cccfSHong Zhang //MatScalar *a_a=a->a,*b_a=b->a; 59*db41cccfSHong Zhang PetscBool flag; 60*db41cccfSHong Zhang PetscMPIInt *onodes1,*olengths1; 61*db41cccfSHong Zhang 62*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 63*db41cccfSHong Zhang PetscInt tt; 64*db41cccfSHong Zhang PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1; 65*db41cccfSHong Zhang #else 66*db41cccfSHong Zhang PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs; 67*db41cccfSHong Zhang #endif 68*db41cccfSHong Zhang 69*db41cccfSHong Zhang PetscFunctionBegin; 70*db41cccfSHong Zhang comm = ((PetscObject)C)->comm; 71*db41cccfSHong Zhang tag0 = ((PetscObject)C)->tag; 72*db41cccfSHong Zhang size = c->size; 73*db41cccfSHong Zhang rank = c->rank; 74*db41cccfSHong Zhang 75*db41cccfSHong Zhang /* Get some new tags to keep the communication clean */ 76*db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 77*db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 78*db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 79*db41cccfSHong Zhang 80*db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE) 81*db41cccfSHong Zhang ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 82*db41cccfSHong Zhang #else 83*db41cccfSHong Zhang ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr); 84*db41cccfSHong Zhang /* Create hash table for the mapping :row -> proc*/ 85*db41cccfSHong Zhang for (i=0,j=0; i<size; i++) { 86*db41cccfSHong Zhang jmax = c->rowners[i+1]; 87*db41cccfSHong Zhang for (; j<jmax; j++) { 88*db41cccfSHong Zhang rtable[j] = i; 89*db41cccfSHong Zhang } 90*db41cccfSHong Zhang } 91*db41cccfSHong Zhang #endif 92*db41cccfSHong Zhang 93*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 94*db41cccfSHong Zhang ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 95*db41cccfSHong Zhang ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 96*db41cccfSHong Zhang ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 97*db41cccfSHong Zhang ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 98*db41cccfSHong Zhang } 99*db41cccfSHong Zhang 100*db41cccfSHong Zhang /* evaluate communication - mesg to who,length of mesg,and buffer space 101*db41cccfSHong Zhang required. Based on this, buffers are allocated, and data copied into them*/ 102*db41cccfSHong Zhang ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr); 103*db41cccfSHong Zhang ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 104*db41cccfSHong Zhang ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 105*db41cccfSHong Zhang ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr); 106*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 107*db41cccfSHong Zhang ierr = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/ 108*db41cccfSHong Zhang jmax = nrow[i]; 109*db41cccfSHong Zhang irow_i = irow[i]; 110*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 111*db41cccfSHong Zhang row = irow_i[j]; 112*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 113*db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 114*db41cccfSHong Zhang #else 115*db41cccfSHong Zhang proc = rtable[row]; 116*db41cccfSHong Zhang #endif 117*db41cccfSHong Zhang w4[proc]++; 118*db41cccfSHong Zhang } 119*db41cccfSHong Zhang for (j=0; j<size; j++) { 120*db41cccfSHong Zhang if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 121*db41cccfSHong Zhang } 122*db41cccfSHong Zhang } 123*db41cccfSHong Zhang 124*db41cccfSHong Zhang nrqs = 0; /* no of outgoing messages */ 125*db41cccfSHong Zhang msz = 0; /* total mesg length for all proc */ 126*db41cccfSHong Zhang w1[rank] = 0; /* no mesg sent to intself */ 127*db41cccfSHong Zhang w3[rank] = 0; 128*db41cccfSHong Zhang for (i=0; i<size; i++) { 129*db41cccfSHong Zhang if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 130*db41cccfSHong Zhang } 131*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 132*db41cccfSHong Zhang for (i=0,j=0; i<size; i++) { 133*db41cccfSHong Zhang if (w1[i]) { pa[j] = i; j++; } 134*db41cccfSHong Zhang } 135*db41cccfSHong Zhang 136*db41cccfSHong Zhang /* Each message would have a header = 1 + 2*(no of IS) + data */ 137*db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 138*db41cccfSHong Zhang j = pa[i]; 139*db41cccfSHong Zhang w1[j] += w2[j] + 2* w3[j]; 140*db41cccfSHong Zhang msz += w1[j]; 141*db41cccfSHong Zhang } 142*db41cccfSHong Zhang 143*db41cccfSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */ 144*db41cccfSHong Zhang ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 145*db41cccfSHong Zhang ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 146*db41cccfSHong Zhang 147*db41cccfSHong Zhang /* Now post the Irecvs corresponding to these messages */ 148*db41cccfSHong Zhang ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 149*db41cccfSHong Zhang 150*db41cccfSHong Zhang ierr = PetscFree(onodes1);CHKERRQ(ierr); 151*db41cccfSHong Zhang ierr = PetscFree(olengths1);CHKERRQ(ierr); 152*db41cccfSHong Zhang 153*db41cccfSHong Zhang /* Allocate Memory for outgoing messages */ 154*db41cccfSHong Zhang ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 155*db41cccfSHong Zhang ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 156*db41cccfSHong Zhang ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 157*db41cccfSHong Zhang { 158*db41cccfSHong Zhang PetscInt *iptr = tmp,ict = 0; 159*db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 160*db41cccfSHong Zhang j = pa[i]; 161*db41cccfSHong Zhang iptr += ict; 162*db41cccfSHong Zhang sbuf1[j] = iptr; 163*db41cccfSHong Zhang ict = w1[j]; 164*db41cccfSHong Zhang } 165*db41cccfSHong Zhang } 166*db41cccfSHong Zhang 167*db41cccfSHong Zhang /* Form the outgoing messages */ 168*db41cccfSHong Zhang /* Initialise the header space */ 169*db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 170*db41cccfSHong Zhang j = pa[i]; 171*db41cccfSHong Zhang sbuf1[j][0] = 0; 172*db41cccfSHong Zhang ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 173*db41cccfSHong Zhang ptr[j] = sbuf1[j] + 2*w3[j] + 1; 174*db41cccfSHong Zhang } 175*db41cccfSHong Zhang 176*db41cccfSHong Zhang /* Parse the isrow and copy data into outbuf */ 177*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 178*db41cccfSHong Zhang ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 179*db41cccfSHong Zhang irow_i = irow[i]; 180*db41cccfSHong Zhang jmax = nrow[i]; 181*db41cccfSHong Zhang for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 182*db41cccfSHong Zhang row = irow_i[j]; 183*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 184*db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 185*db41cccfSHong Zhang #else 186*db41cccfSHong Zhang proc = rtable[row]; 187*db41cccfSHong Zhang #endif 188*db41cccfSHong Zhang if (proc != rank) { /* copy to the outgoing buf*/ 189*db41cccfSHong Zhang ctr[proc]++; 190*db41cccfSHong Zhang *ptr[proc] = row; 191*db41cccfSHong Zhang ptr[proc]++; 192*db41cccfSHong Zhang } 193*db41cccfSHong Zhang } 194*db41cccfSHong Zhang /* Update the headers for the current IS */ 195*db41cccfSHong Zhang for (j=0; j<size; j++) { /* Can Optimise this loop too */ 196*db41cccfSHong Zhang if ((ctr_j = ctr[j])) { 197*db41cccfSHong Zhang sbuf1_j = sbuf1[j]; 198*db41cccfSHong Zhang k = ++sbuf1_j[0]; 199*db41cccfSHong Zhang sbuf1_j[2*k] = ctr_j; 200*db41cccfSHong Zhang sbuf1_j[2*k-1] = i; 201*db41cccfSHong Zhang } 202*db41cccfSHong Zhang } 203*db41cccfSHong Zhang } 204*db41cccfSHong Zhang 205*db41cccfSHong Zhang /* Now post the sends */ 206*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 207*db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 208*db41cccfSHong Zhang j = pa[i]; 209*db41cccfSHong Zhang ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 210*db41cccfSHong Zhang } 211*db41cccfSHong Zhang 212*db41cccfSHong Zhang /* Post Recieves to capture the buffer size */ 213*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 214*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 215*db41cccfSHong Zhang rbuf2[0] = tmp + msz; 216*db41cccfSHong Zhang for (i=1; i<nrqs; ++i) { 217*db41cccfSHong Zhang j = pa[i]; 218*db41cccfSHong Zhang rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 219*db41cccfSHong Zhang } 220*db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 221*db41cccfSHong Zhang j = pa[i]; 222*db41cccfSHong Zhang ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 223*db41cccfSHong Zhang } 224*db41cccfSHong Zhang 225*db41cccfSHong Zhang /* Send to other procs the buf size they should allocate */ 226*db41cccfSHong Zhang 227*db41cccfSHong Zhang /* Receive messages*/ 228*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 229*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 230*db41cccfSHong Zhang ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 231*db41cccfSHong Zhang { 232*db41cccfSHong Zhang Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data; 233*db41cccfSHong Zhang PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i; 234*db41cccfSHong Zhang 235*db41cccfSHong Zhang for (i=0; i<nrqr; ++i) { 236*db41cccfSHong Zhang ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 237*db41cccfSHong Zhang req_size[idex] = 0; 238*db41cccfSHong Zhang rbuf1_i = rbuf1[idex]; 239*db41cccfSHong Zhang start = 2*rbuf1_i[0] + 1; 240*db41cccfSHong Zhang ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 241*db41cccfSHong Zhang ierr = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 242*db41cccfSHong Zhang sbuf2_i = sbuf2[idex]; 243*db41cccfSHong Zhang for (j=start; j<end; j++) { 244*db41cccfSHong Zhang id = rbuf1_i[j] - rstart; 245*db41cccfSHong Zhang ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 246*db41cccfSHong Zhang sbuf2_i[j] = ncols; 247*db41cccfSHong Zhang req_size[idex] += ncols; 248*db41cccfSHong Zhang } 249*db41cccfSHong Zhang req_source[idex] = r_status1[i].MPI_SOURCE; 250*db41cccfSHong Zhang /* form the header */ 251*db41cccfSHong Zhang sbuf2_i[0] = req_size[idex]; 252*db41cccfSHong Zhang for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 253*db41cccfSHong Zhang ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 254*db41cccfSHong Zhang } 255*db41cccfSHong Zhang } 256*db41cccfSHong Zhang ierr = PetscFree(r_status1);CHKERRQ(ierr); 257*db41cccfSHong Zhang ierr = PetscFree(r_waits1);CHKERRQ(ierr); 258*db41cccfSHong Zhang 259*db41cccfSHong Zhang /* recv buffer sizes */ 260*db41cccfSHong Zhang /* Receive messages*/ 261*db41cccfSHong Zhang 262*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 263*db41cccfSHong Zhang //ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr); 264*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 265*db41cccfSHong Zhang //ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 266*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 267*db41cccfSHong Zhang 268*db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 269*db41cccfSHong Zhang ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 270*db41cccfSHong Zhang ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 271*db41cccfSHong Zhang //ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr); 272*db41cccfSHong Zhang ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 273*db41cccfSHong Zhang //ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 274*db41cccfSHong Zhang } 275*db41cccfSHong Zhang ierr = PetscFree(r_status2);CHKERRQ(ierr); 276*db41cccfSHong Zhang ierr = PetscFree(r_waits2);CHKERRQ(ierr); 277*db41cccfSHong Zhang 278*db41cccfSHong Zhang /* Wait on sends1 and sends2 */ 279*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 280*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 281*db41cccfSHong Zhang 282*db41cccfSHong Zhang if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 283*db41cccfSHong Zhang if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 284*db41cccfSHong Zhang ierr = PetscFree(s_status1);CHKERRQ(ierr); 285*db41cccfSHong Zhang ierr = PetscFree(s_status2);CHKERRQ(ierr); 286*db41cccfSHong Zhang ierr = PetscFree(s_waits1);CHKERRQ(ierr); 287*db41cccfSHong Zhang ierr = PetscFree(s_waits2);CHKERRQ(ierr); 288*db41cccfSHong Zhang 289*db41cccfSHong Zhang /* Now allocate buffers for a->j, and send them off */ 290*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 291*db41cccfSHong Zhang for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 292*db41cccfSHong Zhang ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 293*db41cccfSHong Zhang for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 294*db41cccfSHong Zhang 295*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 296*db41cccfSHong Zhang { 297*db41cccfSHong Zhang for (i=0; i<nrqr; i++) { 298*db41cccfSHong Zhang rbuf1_i = rbuf1[i]; 299*db41cccfSHong Zhang sbuf_aj_i = sbuf_aj[i]; 300*db41cccfSHong Zhang ct1 = 2*rbuf1_i[0] + 1; 301*db41cccfSHong Zhang ct2 = 0; 302*db41cccfSHong Zhang for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 303*db41cccfSHong Zhang kmax = rbuf1[i][2*j]; 304*db41cccfSHong Zhang for (k=0; k<kmax; k++,ct1++) { 305*db41cccfSHong Zhang row = rbuf1_i[ct1] - rstart; 306*db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 307*db41cccfSHong Zhang ncols = nzA + nzB; 308*db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 309*db41cccfSHong Zhang 310*db41cccfSHong Zhang /* load the column indices for this row into cols*/ 311*db41cccfSHong Zhang cols = sbuf_aj_i + ct2; 312*db41cccfSHong Zhang for (l=0; l<nzB; l++) { 313*db41cccfSHong Zhang if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 314*db41cccfSHong Zhang else break; 315*db41cccfSHong Zhang } 316*db41cccfSHong Zhang imark = l; 317*db41cccfSHong Zhang for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 318*db41cccfSHong Zhang for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 319*db41cccfSHong Zhang ct2 += ncols; 320*db41cccfSHong Zhang } 321*db41cccfSHong Zhang } 322*db41cccfSHong Zhang ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 323*db41cccfSHong Zhang } 324*db41cccfSHong Zhang } 325*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 326*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 327*db41cccfSHong Zhang 328*db41cccfSHong Zhang #if defined(MV) 329*db41cccfSHong Zhang /* Allocate buffers for a->a, and send them off */ 330*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr); 331*db41cccfSHong Zhang for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 332*db41cccfSHong Zhang ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr); 333*db41cccfSHong Zhang for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 334*db41cccfSHong Zhang 335*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 336*db41cccfSHong Zhang { 337*db41cccfSHong Zhang for (i=0; i<nrqr; i++) { 338*db41cccfSHong Zhang rbuf1_i = rbuf1[i]; 339*db41cccfSHong Zhang sbuf_aa_i = sbuf_aa[i]; 340*db41cccfSHong Zhang ct1 = 2*rbuf1_i[0]+1; 341*db41cccfSHong Zhang ct2 = 0; 342*db41cccfSHong Zhang for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 343*db41cccfSHong Zhang kmax = rbuf1_i[2*j]; 344*db41cccfSHong Zhang for (k=0; k<kmax; k++,ct1++) { 345*db41cccfSHong Zhang row = rbuf1_i[ct1] - rstart; 346*db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 347*db41cccfSHong Zhang ncols = nzA + nzB; 348*db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 349*db41cccfSHong Zhang vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 350*db41cccfSHong Zhang 351*db41cccfSHong Zhang /* load the column values for this row into vals*/ 352*db41cccfSHong Zhang vals = sbuf_aa_i+ct2*bs2; 353*db41cccfSHong Zhang for (l=0; l<nzB; l++) { 354*db41cccfSHong Zhang if ((bmap[cworkB[l]]) < cstart) { 355*db41cccfSHong Zhang ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 356*db41cccfSHong Zhang } 357*db41cccfSHong Zhang else break; 358*db41cccfSHong Zhang } 359*db41cccfSHong Zhang imark = l; 360*db41cccfSHong Zhang for (l=0; l<nzA; l++) { 361*db41cccfSHong Zhang ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 362*db41cccfSHong Zhang } 363*db41cccfSHong Zhang for (l=imark; l<nzB; l++) { 364*db41cccfSHong Zhang ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 365*db41cccfSHong Zhang } 366*db41cccfSHong Zhang ct2 += ncols; 367*db41cccfSHong Zhang } 368*db41cccfSHong Zhang } 369*db41cccfSHong Zhang ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 370*db41cccfSHong Zhang } 371*db41cccfSHong Zhang } 372*db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 373*db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 374*db41cccfSHong Zhang #endif 375*db41cccfSHong Zhang ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 376*db41cccfSHong Zhang ierr = PetscFree(rbuf1);CHKERRQ(ierr); 377*db41cccfSHong Zhang 378*db41cccfSHong Zhang /* Form the matrix */ 379*db41cccfSHong Zhang /* create col map */ 380*db41cccfSHong Zhang { 381*db41cccfSHong Zhang const PetscInt *icol_i; 382*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 383*db41cccfSHong Zhang /* Create row map*/ 384*db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr); 385*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 386*db41cccfSHong Zhang ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr); 387*db41cccfSHong Zhang } 388*db41cccfSHong Zhang #else 389*db41cccfSHong Zhang ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 390*db41cccfSHong Zhang ierr = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 391*db41cccfSHong Zhang for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 392*db41cccfSHong Zhang #endif 393*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 394*db41cccfSHong Zhang jmax = ncol[i]; 395*db41cccfSHong Zhang icol_i = icol[i]; 396*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 397*db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 398*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 399*db41cccfSHong Zhang ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr); 400*db41cccfSHong Zhang } 401*db41cccfSHong Zhang #else 402*db41cccfSHong Zhang cmap_i = cmap[i]; 403*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 404*db41cccfSHong Zhang cmap_i[icol_i[j]] = j+1; 405*db41cccfSHong Zhang } 406*db41cccfSHong Zhang #endif 407*db41cccfSHong Zhang } 408*db41cccfSHong Zhang } 409*db41cccfSHong Zhang 410*db41cccfSHong Zhang /* Create lens which is required for MatCreate... */ 411*db41cccfSHong Zhang for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 412*db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr); 413*db41cccfSHong Zhang lens[0] = (PetscInt*)(lens + ismax); 414*db41cccfSHong Zhang ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 415*db41cccfSHong Zhang for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 416*db41cccfSHong Zhang 417*db41cccfSHong Zhang /* Update lens from local data */ 418*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 419*db41cccfSHong Zhang jmax = nrow[i]; 420*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 421*db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 422*db41cccfSHong Zhang #else 423*db41cccfSHong Zhang cmap_i = cmap[i]; 424*db41cccfSHong Zhang #endif 425*db41cccfSHong Zhang irow_i = irow[i]; 426*db41cccfSHong Zhang lens_i = lens[i]; 427*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 428*db41cccfSHong Zhang row = irow_i[j]; 429*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 430*db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 431*db41cccfSHong Zhang #else 432*db41cccfSHong Zhang proc = rtable[row]; 433*db41cccfSHong Zhang #endif 434*db41cccfSHong Zhang if (proc == rank) { 435*db41cccfSHong Zhang /* Get indices from matA and then from matB */ 436*db41cccfSHong Zhang row = row - rstart; 437*db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 438*db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 439*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 440*db41cccfSHong Zhang for (k=0; k<nzA; k++) { 441*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr); 442*db41cccfSHong Zhang if (tt) { lens_i[j]++; } 443*db41cccfSHong Zhang } 444*db41cccfSHong Zhang for (k=0; k<nzB; k++) { 445*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr); 446*db41cccfSHong Zhang if (tt) { lens_i[j]++; } 447*db41cccfSHong Zhang } 448*db41cccfSHong Zhang #else 449*db41cccfSHong Zhang for (k=0; k<nzA; k++) { 450*db41cccfSHong Zhang if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; } 451*db41cccfSHong Zhang } 452*db41cccfSHong Zhang for (k=0; k<nzB; k++) { 453*db41cccfSHong Zhang if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; } 454*db41cccfSHong Zhang } 455*db41cccfSHong Zhang #endif 456*db41cccfSHong Zhang } 457*db41cccfSHong Zhang } 458*db41cccfSHong Zhang } 459*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 460*db41cccfSHong Zhang /* Create row map*/ 461*db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr); 462*db41cccfSHong Zhang for (i=0; i<ismax; i++){ 463*db41cccfSHong Zhang ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr); 464*db41cccfSHong Zhang } 465*db41cccfSHong Zhang #else 466*db41cccfSHong Zhang /* Create row map*/ 467*db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr); 468*db41cccfSHong Zhang rmap[0] = (PetscInt*)(rmap + ismax); 469*db41cccfSHong Zhang ierr = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr); 470*db41cccfSHong Zhang for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;} 471*db41cccfSHong Zhang #endif 472*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 473*db41cccfSHong Zhang irow_i = irow[i]; 474*db41cccfSHong Zhang jmax = nrow[i]; 475*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 476*db41cccfSHong Zhang lrow1_grow1 = rowmaps[i]; 477*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 478*db41cccfSHong Zhang ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr); 479*db41cccfSHong Zhang } 480*db41cccfSHong Zhang #else 481*db41cccfSHong Zhang rmap_i = rmap[i]; 482*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 483*db41cccfSHong Zhang rmap_i[irow_i[j]] = j; 484*db41cccfSHong Zhang } 485*db41cccfSHong Zhang #endif 486*db41cccfSHong Zhang } 487*db41cccfSHong Zhang 488*db41cccfSHong Zhang /* Update lens from offproc data */ 489*db41cccfSHong Zhang { 490*db41cccfSHong Zhang PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 491*db41cccfSHong Zhang PetscMPIInt ii; 492*db41cccfSHong Zhang 493*db41cccfSHong Zhang for (tmp2=0; tmp2<nrqs; tmp2++) { 494*db41cccfSHong Zhang ierr = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr); 495*db41cccfSHong Zhang idex = pa[ii]; 496*db41cccfSHong Zhang sbuf1_i = sbuf1[idex]; 497*db41cccfSHong Zhang jmax = sbuf1_i[0]; 498*db41cccfSHong Zhang ct1 = 2*jmax+1; 499*db41cccfSHong Zhang ct2 = 0; 500*db41cccfSHong Zhang rbuf2_i = rbuf2[ii]; 501*db41cccfSHong Zhang rbuf3_i = rbuf3[ii]; 502*db41cccfSHong Zhang for (j=1; j<=jmax; j++) { 503*db41cccfSHong Zhang is_no = sbuf1_i[2*j-1]; 504*db41cccfSHong Zhang max1 = sbuf1_i[2*j]; 505*db41cccfSHong Zhang lens_i = lens[is_no]; 506*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 507*db41cccfSHong Zhang lcol1_gcol1 = colmaps[is_no]; 508*db41cccfSHong Zhang lrow1_grow1 = rowmaps[is_no]; 509*db41cccfSHong Zhang #else 510*db41cccfSHong Zhang cmap_i = cmap[is_no]; 511*db41cccfSHong Zhang rmap_i = rmap[is_no]; 512*db41cccfSHong Zhang #endif 513*db41cccfSHong Zhang for (k=0; k<max1; k++,ct1++) { 514*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 515*db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 516*db41cccfSHong Zhang row--; 517*db41cccfSHong Zhang if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 518*db41cccfSHong Zhang #else 519*db41cccfSHong Zhang row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 520*db41cccfSHong Zhang #endif 521*db41cccfSHong Zhang max2 = rbuf2_i[ct1]; 522*db41cccfSHong Zhang for (l=0; l<max2; l++,ct2++) { 523*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 524*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr); 525*db41cccfSHong Zhang if (tt) { 526*db41cccfSHong Zhang lens_i[row]++; 527*db41cccfSHong Zhang } 528*db41cccfSHong Zhang #else 529*db41cccfSHong Zhang if (cmap_i[rbuf3_i[ct2]]) { 530*db41cccfSHong Zhang lens_i[row]++; 531*db41cccfSHong Zhang } 532*db41cccfSHong Zhang #endif 533*db41cccfSHong Zhang } 534*db41cccfSHong Zhang } 535*db41cccfSHong Zhang } 536*db41cccfSHong Zhang } 537*db41cccfSHong Zhang } 538*db41cccfSHong Zhang ierr = PetscFree(r_status3);CHKERRQ(ierr); 539*db41cccfSHong Zhang ierr = PetscFree(r_waits3);CHKERRQ(ierr); 540*db41cccfSHong Zhang if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 541*db41cccfSHong Zhang ierr = PetscFree(s_status3);CHKERRQ(ierr); 542*db41cccfSHong Zhang ierr = PetscFree(s_waits3);CHKERRQ(ierr); 543*db41cccfSHong Zhang 544*db41cccfSHong Zhang /* Create the submatrices */ 545*db41cccfSHong Zhang if (scall == MAT_REUSE_MATRIX) { 546*db41cccfSHong Zhang /* 547*db41cccfSHong Zhang Assumes new rows are same length as the old rows, hence bug! 548*db41cccfSHong Zhang */ 549*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 550*db41cccfSHong Zhang mat = (Mat_SeqBAIJ *)(submats[i]->data); 551*db41cccfSHong Zhang //if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 552*db41cccfSHong Zhang if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 553*db41cccfSHong Zhang ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 554*db41cccfSHong Zhang if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros"); 555*db41cccfSHong Zhang /* Initial matrix as if empty */ 556*db41cccfSHong Zhang ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr); 557*db41cccfSHong Zhang submats[i]->factortype = C->factortype; 558*db41cccfSHong Zhang } 559*db41cccfSHong Zhang } else { 560*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 561*db41cccfSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 562*db41cccfSHong Zhang //ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr); 563*db41cccfSHong Zhang ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr); 564*db41cccfSHong Zhang ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 565*db41cccfSHong Zhang ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); 566*db41cccfSHong Zhang //ierr = MatSeqBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr); 567*db41cccfSHong Zhang ierr = MatSeqSBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr); 568*db41cccfSHong Zhang //ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by MatGetSubMatrices_MPISBAIJ()*/ 569*db41cccfSHong Zhang } 570*db41cccfSHong Zhang } 571*db41cccfSHong Zhang 572*db41cccfSHong Zhang /* Assemble the matrices */ 573*db41cccfSHong Zhang /* First assemble the local rows */ 574*db41cccfSHong Zhang { 575*db41cccfSHong Zhang PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i; 576*db41cccfSHong Zhang //MatScalar *imat_a; 577*db41cccfSHong Zhang 578*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 579*db41cccfSHong Zhang mat = (Mat_SeqBAIJ*)submats[i]->data; 580*db41cccfSHong Zhang imat_ilen = mat->ilen; 581*db41cccfSHong Zhang imat_j = mat->j; 582*db41cccfSHong Zhang imat_i = mat->i; 583*db41cccfSHong Zhang //imat_a = mat->a; 584*db41cccfSHong Zhang 585*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 586*db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 587*db41cccfSHong Zhang lrow1_grow1 = rowmaps[i]; 588*db41cccfSHong Zhang #else 589*db41cccfSHong Zhang cmap_i = cmap[i]; 590*db41cccfSHong Zhang rmap_i = rmap[i]; 591*db41cccfSHong Zhang #endif 592*db41cccfSHong Zhang irow_i = irow[i]; 593*db41cccfSHong Zhang jmax = nrow[i]; 594*db41cccfSHong Zhang for (j=0; j<jmax; j++) { 595*db41cccfSHong Zhang row = irow_i[j]; 596*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 597*db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 598*db41cccfSHong Zhang #else 599*db41cccfSHong Zhang proc = rtable[row]; 600*db41cccfSHong Zhang #endif 601*db41cccfSHong Zhang if (proc == rank) { 602*db41cccfSHong Zhang row = row - rstart; 603*db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; 604*db41cccfSHong Zhang nzB = b_i[row+1] - b_i[row]; 605*db41cccfSHong Zhang cworkA = a_j + a_i[row]; 606*db41cccfSHong Zhang cworkB = b_j + b_i[row]; 607*db41cccfSHong Zhang //vworkA = a_a + a_i[row]*bs2; 608*db41cccfSHong Zhang //vworkB = b_a + b_i[row]*bs2; 609*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 610*db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr); 611*db41cccfSHong Zhang row--; 612*db41cccfSHong Zhang if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 613*db41cccfSHong Zhang #else 614*db41cccfSHong Zhang row = rmap_i[row + rstart]; 615*db41cccfSHong Zhang #endif 616*db41cccfSHong Zhang mat_i = imat_i[row]; 617*db41cccfSHong Zhang //mat_a = imat_a + mat_i*bs2; 618*db41cccfSHong Zhang mat_j = imat_j + mat_i; 619*db41cccfSHong Zhang ilen_row = imat_ilen[row]; 620*db41cccfSHong Zhang 621*db41cccfSHong Zhang /* load the column indices for this row into cols*/ 622*db41cccfSHong Zhang for (l=0; l<nzB; l++) { 623*db41cccfSHong Zhang if ((ctmp = bmap[cworkB[l]]) < cstart) { 624*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 625*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr); 626*db41cccfSHong Zhang if (tcol) { 627*db41cccfSHong Zhang #else 628*db41cccfSHong Zhang if ((tcol = cmap_i[ctmp])) { 629*db41cccfSHong Zhang #endif 630*db41cccfSHong Zhang *mat_j++ = tcol - 1; 631*db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 632*db41cccfSHong Zhang //mat_a += bs2; 633*db41cccfSHong Zhang ilen_row++; 634*db41cccfSHong Zhang } 635*db41cccfSHong Zhang } else break; 636*db41cccfSHong Zhang } 637*db41cccfSHong Zhang imark = l; 638*db41cccfSHong Zhang for (l=0; l<nzA; l++) { 639*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 640*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr); 641*db41cccfSHong Zhang if (tcol) { 642*db41cccfSHong Zhang #else 643*db41cccfSHong Zhang if ((tcol = cmap_i[cstart + cworkA[l]])) { 644*db41cccfSHong Zhang #endif 645*db41cccfSHong Zhang *mat_j++ = tcol - 1; 646*db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 647*db41cccfSHong Zhang //mat_a += bs2; 648*db41cccfSHong Zhang ilen_row++; 649*db41cccfSHong Zhang } 650*db41cccfSHong Zhang } 651*db41cccfSHong Zhang for (l=imark; l<nzB; l++) { 652*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 653*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr); 654*db41cccfSHong Zhang if (tcol) { 655*db41cccfSHong Zhang #else 656*db41cccfSHong Zhang if ((tcol = cmap_i[bmap[cworkB[l]]])) { 657*db41cccfSHong Zhang #endif 658*db41cccfSHong Zhang *mat_j++ = tcol - 1; 659*db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 660*db41cccfSHong Zhang //mat_a += bs2; 661*db41cccfSHong Zhang ilen_row++; 662*db41cccfSHong Zhang } 663*db41cccfSHong Zhang } 664*db41cccfSHong Zhang imat_ilen[row] = ilen_row; 665*db41cccfSHong Zhang } 666*db41cccfSHong Zhang } 667*db41cccfSHong Zhang } 668*db41cccfSHong Zhang } 669*db41cccfSHong Zhang 670*db41cccfSHong Zhang /* Now assemble the off proc rows*/ 671*db41cccfSHong Zhang { 672*db41cccfSHong Zhang PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 673*db41cccfSHong Zhang PetscInt *imat_j,*imat_i; 674*db41cccfSHong Zhang //MatScalar *imat_a,*rbuf4_i; 675*db41cccfSHong Zhang PetscMPIInt ii; 676*db41cccfSHong Zhang 677*db41cccfSHong Zhang for (tmp2=0; tmp2<nrqs; tmp2++) { 678*db41cccfSHong Zhang //ierr = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr); 679*db41cccfSHong Zhang ii = tmp2; //new 680*db41cccfSHong Zhang idex = pa[ii]; 681*db41cccfSHong Zhang sbuf1_i = sbuf1[idex]; 682*db41cccfSHong Zhang jmax = sbuf1_i[0]; 683*db41cccfSHong Zhang ct1 = 2*jmax + 1; 684*db41cccfSHong Zhang ct2 = 0; 685*db41cccfSHong Zhang rbuf2_i = rbuf2[ii]; 686*db41cccfSHong Zhang rbuf3_i = rbuf3[ii]; 687*db41cccfSHong Zhang //rbuf4_i = rbuf4[ii]; 688*db41cccfSHong Zhang for (j=1; j<=jmax; j++) { 689*db41cccfSHong Zhang is_no = sbuf1_i[2*j-1]; 690*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 691*db41cccfSHong Zhang lrow1_grow1 = rowmaps[is_no]; 692*db41cccfSHong Zhang lcol1_gcol1 = colmaps[is_no]; 693*db41cccfSHong Zhang #else 694*db41cccfSHong Zhang rmap_i = rmap[is_no]; 695*db41cccfSHong Zhang cmap_i = cmap[is_no]; 696*db41cccfSHong Zhang #endif 697*db41cccfSHong Zhang mat = (Mat_SeqBAIJ*)submats[is_no]->data; 698*db41cccfSHong Zhang imat_ilen = mat->ilen; 699*db41cccfSHong Zhang imat_j = mat->j; 700*db41cccfSHong Zhang imat_i = mat->i; 701*db41cccfSHong Zhang //imat_a = mat->a; 702*db41cccfSHong Zhang max1 = sbuf1_i[2*j]; 703*db41cccfSHong Zhang for (k=0; k<max1; k++,ct1++) { 704*db41cccfSHong Zhang row = sbuf1_i[ct1]; 705*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 706*db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr); 707*db41cccfSHong Zhang row--; 708*db41cccfSHong Zhang if(row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 709*db41cccfSHong Zhang #else 710*db41cccfSHong Zhang row = rmap_i[row]; 711*db41cccfSHong Zhang #endif 712*db41cccfSHong Zhang ilen = imat_ilen[row]; 713*db41cccfSHong Zhang mat_i = imat_i[row]; 714*db41cccfSHong Zhang //mat_a = imat_a + mat_i*bs2; 715*db41cccfSHong Zhang mat_j = imat_j + mat_i; 716*db41cccfSHong Zhang max2 = rbuf2_i[ct1]; 717*db41cccfSHong Zhang for (l=0; l<max2; l++,ct2++) { 718*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 719*db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 720*db41cccfSHong Zhang if (tcol) { 721*db41cccfSHong Zhang #else 722*db41cccfSHong Zhang if ((tcol = cmap_i[rbuf3_i[ct2]])) { 723*db41cccfSHong Zhang #endif 724*db41cccfSHong Zhang *mat_j++ = tcol - 1; 725*db41cccfSHong Zhang /* *mat_a++= rbuf4_i[ct2]; */ 726*db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 727*db41cccfSHong Zhang //mat_a += bs2; 728*db41cccfSHong Zhang ilen++; 729*db41cccfSHong Zhang } 730*db41cccfSHong Zhang } 731*db41cccfSHong Zhang imat_ilen[row] = ilen; 732*db41cccfSHong Zhang } 733*db41cccfSHong Zhang } 734*db41cccfSHong Zhang } 735*db41cccfSHong Zhang } 736*db41cccfSHong Zhang //ierr = PetscFree(r_status4);CHKERRQ(ierr); 737*db41cccfSHong Zhang //ierr = PetscFree(r_waits4);CHKERRQ(ierr); 738*db41cccfSHong Zhang //if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 739*db41cccfSHong Zhang //ierr = PetscFree(s_waits4);CHKERRQ(ierr); 740*db41cccfSHong Zhang //ierr = PetscFree(s_status4);CHKERRQ(ierr); 741*db41cccfSHong Zhang 742*db41cccfSHong Zhang /* Restore the indices */ 743*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 744*db41cccfSHong Zhang ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 745*db41cccfSHong Zhang ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 746*db41cccfSHong Zhang } 747*db41cccfSHong Zhang 748*db41cccfSHong Zhang /* Destroy allocated memory */ 749*db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE) 750*db41cccfSHong Zhang ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 751*db41cccfSHong Zhang #else 752*db41cccfSHong Zhang ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr); 753*db41cccfSHong Zhang #endif 754*db41cccfSHong Zhang ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 755*db41cccfSHong Zhang ierr = PetscFree(pa);CHKERRQ(ierr); 756*db41cccfSHong Zhang 757*db41cccfSHong Zhang ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 758*db41cccfSHong Zhang ierr = PetscFree(sbuf1);CHKERRQ(ierr); 759*db41cccfSHong Zhang ierr = PetscFree(rbuf2);CHKERRQ(ierr); 760*db41cccfSHong Zhang for (i=0; i<nrqr; ++i) { 761*db41cccfSHong Zhang ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 762*db41cccfSHong Zhang } 763*db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 764*db41cccfSHong Zhang ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 765*db41cccfSHong Zhang //ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 766*db41cccfSHong Zhang } 767*db41cccfSHong Zhang ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 768*db41cccfSHong Zhang ierr = PetscFree(rbuf3);CHKERRQ(ierr); 769*db41cccfSHong Zhang //ierr = PetscFree(rbuf4);CHKERRQ(ierr); 770*db41cccfSHong Zhang ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 771*db41cccfSHong Zhang ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 772*db41cccfSHong Zhang //ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 773*db41cccfSHong Zhang //ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 774*db41cccfSHong Zhang 775*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 776*db41cccfSHong Zhang for (i=0; i<ismax; i++){ 777*db41cccfSHong Zhang ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr); 778*db41cccfSHong Zhang ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr); 779*db41cccfSHong Zhang } 780*db41cccfSHong Zhang ierr = PetscFree(colmaps);CHKERRQ(ierr); 781*db41cccfSHong Zhang ierr = PetscFree(rowmaps);CHKERRQ(ierr); 782*db41cccfSHong Zhang #else 783*db41cccfSHong Zhang ierr = PetscFree(rmap);CHKERRQ(ierr); 784*db41cccfSHong Zhang ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 785*db41cccfSHong Zhang ierr = PetscFree(cmap);CHKERRQ(ierr); 786*db41cccfSHong Zhang #endif 787*db41cccfSHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 788*db41cccfSHong Zhang 789*db41cccfSHong Zhang for (i=0; i<ismax; i++) { 790*db41cccfSHong Zhang ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 791*db41cccfSHong Zhang ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 792*db41cccfSHong Zhang } 793*db41cccfSHong Zhang PetscFunctionReturn(0); 794*db41cccfSHong Zhang } 795*db41cccfSHong Zhang 796*db41cccfSHong Zhang /* ------------------------------------------------------- */ 797*db41cccfSHong Zhang 798632d0f97SHong Zhang #undef __FUNCT__ 799632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ" 80013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov) 801632d0f97SHong Zhang { 8026849ba73SBarry Smith PetscErrorCode ierr; 803d0f46423SBarry Smith PetscInt i,N=C->cmap->N, bs=C->rmap->bs; 804632d0f97SHong Zhang IS *is_new; 805632d0f97SHong Zhang 806632d0f97SHong Zhang PetscFunctionBegin; 807c910923dSHong Zhang ierr = PetscMalloc(is_max*sizeof(IS),&is_new);CHKERRQ(ierr); 808632d0f97SHong Zhang /* Convert the indices into block format */ 80927f478b1SHong Zhang ierr = ISCompressIndicesGeneral(N,bs,is_max,is,is_new);CHKERRQ(ierr); 810e32f2f54SBarry Smith if (ov < 0){ SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");} 811*db41cccfSHong Zhang 812*db41cccfSHong Zhang //------- new ---------- 813*db41cccfSHong Zhang PetscBool flg=PETSC_FALSE; 814*db41cccfSHong Zhang ierr = PetscOptionsHasName(PETSC_NULL, "-IncreaseOverlap_new", &flg);CHKERRQ(ierr); 815*db41cccfSHong Zhang if (!flg){ /* previous non-scalable implementation */ 816632d0f97SHong Zhang for (i=0; i<ov; ++i) { 817c910923dSHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr); 818632d0f97SHong Zhang } 819*db41cccfSHong Zhang } else { /* scalable implementation using modified BAIJ routines */ 820*db41cccfSHong Zhang 821*db41cccfSHong Zhang Mat *submats; 822*db41cccfSHong Zhang IS *is_row; 823*db41cccfSHong Zhang PetscInt M=C->rmap->N,Mbs=M/bs,*nidx,isz,iov; 824*db41cccfSHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 825*db41cccfSHong Zhang PetscMPIInt rank=c->rank; 826*db41cccfSHong Zhang 827*db41cccfSHong Zhang 828*db41cccfSHong Zhang ierr = PetscMalloc((Mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 829*db41cccfSHong Zhang 830*db41cccfSHong Zhang /* Create is_row */ 831*db41cccfSHong Zhang ierr = PetscMalloc(is_max*sizeof(IS **),&is_row);CHKERRQ(ierr); 832*db41cccfSHong Zhang for (i=0; i<is_max; i++){ 833*db41cccfSHong Zhang ierr = ISCreateStride(PETSC_COMM_SELF,Mbs,0,1,is_row+i);CHKERRQ(ierr); 834*db41cccfSHong Zhang } 835*db41cccfSHong Zhang 836*db41cccfSHong Zhang for (iov=0; iov<ov; ++iov) { 837*db41cccfSHong Zhang /* Get submats for column search - replace with MatGetSubMatrices_MPIBAIJ_IJ() */ 838*db41cccfSHong Zhang // copied from MatGetSubMatrices_MPIBAIJ() ---------------- 839*db41cccfSHong Zhang /* Allocate memory to hold all the submatrices */ 840*db41cccfSHong Zhang //if (scall != MAT_REUSE_MATRIX) { 841*db41cccfSHong Zhang ierr = PetscMalloc((is_max+1)*sizeof(Mat),&submats);CHKERRQ(ierr); 842*db41cccfSHong Zhang //} 843*db41cccfSHong Zhang /* Determine the number of stages through which submatrices are done */ 844*db41cccfSHong Zhang PetscInt nmax,nstages_local,nstages,max_no,pos; 845*db41cccfSHong Zhang nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt)); 846*db41cccfSHong Zhang if (!nmax) nmax = 1; 847*db41cccfSHong Zhang nstages_local = is_max/nmax + ((is_max % nmax)?1:0); 848*db41cccfSHong Zhang 849*db41cccfSHong Zhang /* Make sure every processor loops through the nstages */ 850*db41cccfSHong Zhang ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 851*db41cccfSHong Zhang for (i=0,pos=0; i<nstages; i++) { 852*db41cccfSHong Zhang if (pos+nmax <= is_max) max_no = nmax; 853*db41cccfSHong Zhang else if (pos == is_max) max_no = 0; 854*db41cccfSHong Zhang else max_no = is_max-pos; 855*db41cccfSHong Zhang 856*db41cccfSHong Zhang //ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr); 857*db41cccfSHong Zhang ierr = MatGetSubMatrices_MPIBAIJ_local_ijonly(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr); 858*db41cccfSHong Zhang if (rank == 10 && !i){ 859*db41cccfSHong Zhang printf("submats[0]:\n"); 860*db41cccfSHong Zhang ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr); 861*db41cccfSHong Zhang } 862*db41cccfSHong Zhang 863*db41cccfSHong Zhang pos += max_no; 864*db41cccfSHong Zhang } 865*db41cccfSHong Zhang // end of copy ----------------------------------- 866*db41cccfSHong Zhang 867*db41cccfSHong Zhang /* Row search */ 868*db41cccfSHong Zhang ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr); 869*db41cccfSHong Zhang 870*db41cccfSHong Zhang /* Column search */ 871*db41cccfSHong Zhang PetscBT table; 872*db41cccfSHong Zhang Mat_SeqSBAIJ *asub_i; 873*db41cccfSHong Zhang PetscInt *ai,brow,nz,nis,l; 874*db41cccfSHong Zhang const PetscInt *idx; 875*db41cccfSHong Zhang 876*db41cccfSHong Zhang ierr = PetscBTCreate(Mbs,table);CHKERRQ(ierr); 877*db41cccfSHong Zhang for (i=0; i<is_max; i++){ 878*db41cccfSHong Zhang asub_i = (Mat_SeqSBAIJ*)submats[i]->data; 879*db41cccfSHong Zhang ai=asub_i->i;; 880*db41cccfSHong Zhang 881*db41cccfSHong Zhang /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */ 882*db41cccfSHong Zhang ierr = PetscBTMemzero(Mbs,table);CHKERRQ(ierr); 883*db41cccfSHong Zhang 884*db41cccfSHong Zhang ierr = ISGetIndices(is_new[i],&idx);CHKERRQ(ierr); 885*db41cccfSHong Zhang ierr = ISGetLocalSize(is_new[i],&nis);CHKERRQ(ierr); 886*db41cccfSHong Zhang for (l=0; l<nis; l++) { 887*db41cccfSHong Zhang ierr = PetscBTSet(table,idx[l]);CHKERRQ(ierr); 888*db41cccfSHong Zhang nidx[l] = idx[l]; 889*db41cccfSHong Zhang } 890*db41cccfSHong Zhang isz = nis; 891*db41cccfSHong Zhang 892*db41cccfSHong Zhang for (brow=0; brow<Mbs; brow++){ 893*db41cccfSHong Zhang nz = ai[brow+1] - ai[brow]; 894*db41cccfSHong Zhang if (nz) { 895*db41cccfSHong Zhang if (!PetscBTLookupSet(table,brow)) nidx[isz++] = brow; 896*db41cccfSHong Zhang } 897*db41cccfSHong Zhang } 898*db41cccfSHong Zhang ierr = ISRestoreIndices(is_new[i],&idx);CHKERRQ(ierr); 899*db41cccfSHong Zhang ierr = ISDestroy(is_new[i]);CHKERRQ(ierr); 900*db41cccfSHong Zhang 901*db41cccfSHong Zhang /* create updated is_new */ 902*db41cccfSHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,PETSC_COPY_VALUES,is_new+i);CHKERRQ(ierr); 903*db41cccfSHong Zhang } 904*db41cccfSHong Zhang 905*db41cccfSHong Zhang /* Free tmp spaces */ 906*db41cccfSHong Zhang ierr = PetscBTDestroy(table);CHKERRQ(ierr); 907*db41cccfSHong Zhang for (i=0; i<is_max; i++){ 908*db41cccfSHong Zhang if (rank == 10 && !i){ 909*db41cccfSHong Zhang printf("submats[0]:\n"); 910*db41cccfSHong Zhang ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr); 911*db41cccfSHong Zhang } 912*db41cccfSHong Zhang ierr = MatDestroy(submats[i]);CHKERRQ(ierr); 913*db41cccfSHong Zhang } 914*db41cccfSHong Zhang ierr = PetscFree(submats);CHKERRQ(ierr); 915*db41cccfSHong Zhang } 916*db41cccfSHong Zhang 917*db41cccfSHong Zhang for (i=0; i<is_max; i++){ 918*db41cccfSHong Zhang ierr = ISDestroy(is_row[i]);CHKERRQ(ierr); 919*db41cccfSHong Zhang } 920*db41cccfSHong Zhang ierr = PetscFree(is_row);CHKERRQ(ierr); 921*db41cccfSHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 922*db41cccfSHong Zhang } 923*db41cccfSHong Zhang //--------end of new---------- 924*db41cccfSHong Zhang 925c910923dSHong Zhang for (i=0; i<is_max; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);} 92627f478b1SHong Zhang ierr = ISExpandIndicesGeneral(N,bs,is_max,is_new,is);CHKERRQ(ierr); 927*db41cccfSHong Zhang 928c910923dSHong Zhang for (i=0; i<is_max; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);} 929632d0f97SHong Zhang ierr = PetscFree(is_new);CHKERRQ(ierr); 930632d0f97SHong Zhang PetscFunctionReturn(0); 931632d0f97SHong Zhang } 932632d0f97SHong Zhang 9334a69c536SBarry Smith typedef enum {MINE,OTHER} WhoseOwner; 9340472cc68SHong Zhang /* data1, odata1 and odata2 are packed in the format (for communication): 935a2a9f22aSHong Zhang data[0] = is_max, no of is 936a2a9f22aSHong Zhang data[1] = size of is[0] 937a2a9f22aSHong Zhang ... 938a2a9f22aSHong Zhang data[is_max] = size of is[is_max-1] 939a2a9f22aSHong Zhang data[is_max + 1] = data(is[0]) 940a2a9f22aSHong Zhang ... 941a2a9f22aSHong Zhang data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i]) 942a2a9f22aSHong Zhang ... 9430472cc68SHong Zhang data2 is packed in the format (for creating output is[]): 9440472cc68SHong Zhang data[0] = is_max, no of is 9450472cc68SHong Zhang data[1] = size of is[0] 9460472cc68SHong Zhang ... 9470472cc68SHong Zhang data[is_max] = size of is[is_max-1] 9480472cc68SHong Zhang data[is_max + 1] = data(is[0]) 9490472cc68SHong Zhang ... 9500472cc68SHong Zhang data[is_max + 1 + Mbs*i) = data(is[i]) 9510472cc68SHong Zhang ... 952a2a9f22aSHong Zhang */ 953632d0f97SHong Zhang #undef __FUNCT__ 954632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once" 95513f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[]) 956632d0f97SHong Zhang { 957632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 9586849ba73SBarry Smith PetscErrorCode ierr; 95913f74950SBarry Smith PetscMPIInt size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len; 9605d0c19d7SBarry Smith const PetscInt *idx_i; 9615d0c19d7SBarry Smith PetscInt idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i, 96213f74950SBarry Smith Mbs,i,j,k,*odata1,*odata2, 96313f74950SBarry Smith proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est; 96413f74950SBarry Smith PetscInt proc_end=0,*iwork,len_unused,nodata2; 96513f74950SBarry Smith PetscInt ois_max; /* max no of is[] in each of processor */ 966bfc6803cSHong Zhang char *t_p; 967632d0f97SHong Zhang MPI_Comm comm; 968e8527bf2SHong Zhang MPI_Request *s_waits1,*s_waits2,r_req; 969632d0f97SHong Zhang MPI_Status *s_status,r_status; 97045f43ab7SHong Zhang PetscBT *table; /* mark indices of this processor's is[] */ 971fc70d14eSHong Zhang PetscBT table_i; 972bfc6803cSHong Zhang PetscBT otable; /* mark indices of other processors' is[] */ 973d0f46423SBarry Smith PetscInt bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners; 97410eea884SHong Zhang IS garray_local,garray_gl; 9755483b11dSHong Zhang 976632d0f97SHong Zhang PetscFunctionBegin; 9777adad957SLisandro Dalcin comm = ((PetscObject)C)->comm; 978632d0f97SHong Zhang size = c->size; 979632d0f97SHong Zhang rank = c->rank; 980632d0f97SHong Zhang Mbs = c->Mbs; 981632d0f97SHong Zhang 982c910923dSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 983c910923dSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 984c910923dSHong Zhang 985430a0127SHong Zhang /* create tables used in 986430a0127SHong Zhang step 1: table[i] - mark c->garray of proc [i] 98745f43ab7SHong Zhang step 3: table[i] - mark indices of is[i] when whose=MINE 988430a0127SHong Zhang table[0] - mark incideces of is[] when whose=OTHER */ 989430a0127SHong Zhang len = PetscMax(is_max, size);CHKERRQ(ierr); 99074ed9c26SBarry Smith ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr); 991430a0127SHong Zhang for (i=0; i<len; i++) { 992430a0127SHong Zhang table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; 993430a0127SHong Zhang } 994430a0127SHong Zhang 99513f74950SBarry Smith ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); 99610eea884SHong Zhang 9975483b11dSHong Zhang /* 1. Send this processor's is[] to other processors */ 9985483b11dSHong Zhang /*---------------------------------------------------*/ 999e8527bf2SHong Zhang /* allocate spaces */ 100013f74950SBarry Smith ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr); 10015483b11dSHong Zhang len = 0; 1002c910923dSHong Zhang for (i=0; i<is_max; i++) { 1003632d0f97SHong Zhang ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 1004632d0f97SHong Zhang len += n[i]; 1005632d0f97SHong Zhang } 1006958c9bccSBarry Smith if (!len) { 10075483b11dSHong Zhang is_max = 0; 10085483b11dSHong Zhang } else { 10095483b11dSHong Zhang len += 1 + is_max; /* max length of data1 for one processor */ 10105483b11dSHong Zhang } 10115483b11dSHong Zhang 1012430a0127SHong Zhang 101313f74950SBarry Smith ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr); 101413f74950SBarry Smith ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr); 10155483b11dSHong Zhang for (i=0; i<size; i++) data1_start[i] = data1 + i*len; 10165483b11dSHong Zhang 101740cb64c9SJed Brown ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr); 1018e8527bf2SHong Zhang 101976f244e2SHong Zhang /* gather c->garray from all processors */ 102070b3c8c7SBarry Smith ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);CHKERRQ(ierr); 102176f244e2SHong Zhang ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr); 102276f244e2SHong Zhang ierr = ISDestroy(garray_local);CHKERRQ(ierr); 1023a7cc72afSBarry Smith ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr); 102476f244e2SHong Zhang Bowners[0] = 0; 102576f244e2SHong Zhang for (i=0; i<size; i++) Bowners[i+1] += Bowners[i]; 102676f244e2SHong Zhang 10275483b11dSHong Zhang if (is_max){ 1028430a0127SHong Zhang /* hash table ctable which maps c->row to proc_id) */ 102913f74950SBarry Smith ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr); 10305483b11dSHong Zhang for (proc_id=0,j=0; proc_id<size; proc_id++) { 1031288a2dc7SJed Brown for (; j<C->rmap->range[proc_id+1]/bs; j++) { 10325483b11dSHong Zhang ctable[j] = proc_id; 10335483b11dSHong Zhang } 10345483b11dSHong Zhang } 10355483b11dSHong Zhang 1036430a0127SHong Zhang /* hash tables marking c->garray */ 103710eea884SHong Zhang ierr = ISGetIndices(garray_gl,&idx_i); 10385483b11dSHong Zhang for (i=0; i<size; i++){ 1039430a0127SHong Zhang table_i = table[i]; 1040430a0127SHong Zhang ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr); 1041430a0127SHong Zhang for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/ 1042430a0127SHong Zhang ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr); 10435483b11dSHong Zhang } 10445483b11dSHong Zhang } 104510eea884SHong Zhang ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr); 10465483b11dSHong Zhang } /* if (is_max) */ 104776f244e2SHong Zhang ierr = ISDestroy(garray_gl);CHKERRQ(ierr); 10485483b11dSHong Zhang 10495483b11dSHong Zhang /* evaluate communication - mesg to who, length, and buffer space */ 1050e8527bf2SHong Zhang for (i=0; i<size; i++) len_s[i] = 0; 10515483b11dSHong Zhang 10525483b11dSHong Zhang /* header of data1 */ 10535483b11dSHong Zhang for (proc_id=0; proc_id<size; proc_id++){ 10545483b11dSHong Zhang iwork[proc_id] = 0; 10555483b11dSHong Zhang *data1_start[proc_id] = is_max; 10565483b11dSHong Zhang data1_start[proc_id]++; 10575483b11dSHong Zhang for (j=0; j<is_max; j++) { 10585483b11dSHong Zhang if (proc_id == rank){ 10595483b11dSHong Zhang *data1_start[proc_id] = n[j]; 10605483b11dSHong Zhang } else { 10615483b11dSHong Zhang *data1_start[proc_id] = 0; 10625483b11dSHong Zhang } 10635483b11dSHong Zhang data1_start[proc_id]++; 10645483b11dSHong Zhang } 10655483b11dSHong Zhang } 10665483b11dSHong Zhang 10675483b11dSHong Zhang for (i=0; i<is_max; i++) { 10685483b11dSHong Zhang ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr); 10695483b11dSHong Zhang for (j=0; j<n[i]; j++){ 10705483b11dSHong Zhang idx = idx_i[j]; 10715483b11dSHong Zhang *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */ 10725483b11dSHong Zhang proc_end = ctable[idx]; 10735483b11dSHong Zhang for (proc_id=0; proc_id<=proc_end; proc_id++){ /* for others to process */ 1074e8527bf2SHong Zhang if (proc_id == rank ) continue; /* done before this loop */ 1075430a0127SHong Zhang if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) 1076430a0127SHong Zhang continue; /* no need for sending idx to [proc_id] */ 10775483b11dSHong Zhang *data1_start[proc_id] = idx; data1_start[proc_id]++; 10785483b11dSHong Zhang len_s[proc_id]++; 10795483b11dSHong Zhang } 10805483b11dSHong Zhang } 10815483b11dSHong Zhang /* update header data */ 10822cfbe0a4SHong Zhang for (proc_id=0; proc_id<size; proc_id++){ 10835483b11dSHong Zhang if (proc_id== rank) continue; 10845483b11dSHong Zhang *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id]; 10855483b11dSHong Zhang iwork[proc_id] = len_s[proc_id] ; 10865483b11dSHong Zhang } 10875483b11dSHong Zhang ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr); 1088e8527bf2SHong Zhang } 10895483b11dSHong Zhang 10905483b11dSHong Zhang nrqs = 0; nrqr = 0; 10915483b11dSHong Zhang for (i=0; i<size; i++){ 10925483b11dSHong Zhang data1_start[i] = data1 + i*len; 10935483b11dSHong Zhang if (len_s[i]){ 10945483b11dSHong Zhang nrqs++; 10955483b11dSHong Zhang len_s[i] += 1 + is_max; /* add no. of header msg */ 10965483b11dSHong Zhang } 10975483b11dSHong Zhang } 10985483b11dSHong Zhang 10995483b11dSHong Zhang for (i=0; i<is_max; i++) { 1100a2a9f22aSHong Zhang ierr = ISDestroy(is[i]);CHKERRQ(ierr); 1101632d0f97SHong Zhang } 1102bfc6803cSHong Zhang ierr = PetscFree(n);CHKERRQ(ierr); 110305b42c5fSBarry Smith ierr = PetscFree(ctable);CHKERRQ(ierr); 11045483b11dSHong Zhang 11055483b11dSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */ 11065483b11dSHong Zhang ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr); 11075483b11dSHong Zhang ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr); 1108632d0f97SHong Zhang 1109632d0f97SHong Zhang /* Now post the sends */ 111074ed9c26SBarry Smith ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr); 1111632d0f97SHong Zhang k = 0; 11125483b11dSHong Zhang for (proc_id=0; proc_id<size; proc_id++){ /* send data1 to processor [proc_id] */ 11135483b11dSHong Zhang if (len_s[proc_id]){ 111413f74950SBarry Smith ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr); 1115632d0f97SHong Zhang k++; 1116632d0f97SHong Zhang } 1117632d0f97SHong Zhang } 1118632d0f97SHong Zhang 111945f43ab7SHong Zhang /* 2. Receive other's is[] and process. Then send back */ 1120bfc6803cSHong Zhang /*-----------------------------------------------------*/ 11215483b11dSHong Zhang len = 0; 11225483b11dSHong Zhang for (i=0; i<nrqr; i++){ 11235483b11dSHong Zhang if (len_r1[i] > len)len = len_r1[i]; 11245483b11dSHong Zhang } 112545f43ab7SHong Zhang ierr = PetscFree(len_r1);CHKERRQ(ierr); 112645f43ab7SHong Zhang ierr = PetscFree(id_r1);CHKERRQ(ierr); 112745f43ab7SHong Zhang 112845f43ab7SHong Zhang for (proc_id=0; proc_id<size; proc_id++) 112945f43ab7SHong Zhang len_s[proc_id] = iwork[proc_id] = 0; 113045f43ab7SHong Zhang 113113f74950SBarry Smith ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr); 113213f74950SBarry Smith ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr); 113376f244e2SHong Zhang ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr); 113410eea884SHong Zhang 113510eea884SHong Zhang len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */ 1136240e5896SHong Zhang len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */ 113713f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 113810eea884SHong Zhang nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */ 1139240e5896SHong Zhang odata2_ptr[nodata2] = odata2; 114010eea884SHong Zhang len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */ 114110eea884SHong Zhang 1142632d0f97SHong Zhang k = 0; 11435483b11dSHong Zhang while (k < nrqr){ 1144632d0f97SHong Zhang /* Receive messages */ 1145bfc6803cSHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr); 1146632d0f97SHong Zhang if (flag){ 114713f74950SBarry Smith ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); 1148632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 114913f74950SBarry Smith ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); 11505483b11dSHong Zhang ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); 1151632d0f97SHong Zhang 1152fc70d14eSHong Zhang /* Process messages */ 1153240e5896SHong Zhang /* make sure there is enough unused space in odata2 array */ 115410eea884SHong Zhang if (len_unused < len_max){ /* allocate more space for odata2 */ 115513f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 1156240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 115710eea884SHong Zhang len_unused = len_est; 115810eea884SHong Zhang } 115910eea884SHong Zhang 116010eea884SHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr); 1161a2a9f22aSHong Zhang len = 1 + odata2[0]; 1162a2a9f22aSHong Zhang for (i=0; i<odata2[0]; i++){ 1163a2a9f22aSHong Zhang len += odata2[1 + i]; 1164fc70d14eSHong Zhang } 1165632d0f97SHong Zhang 1166632d0f97SHong Zhang /* Send messages back */ 116713f74950SBarry Smith ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr); 1168fc70d14eSHong Zhang k++; 116910eea884SHong Zhang odata2 += len; 117010eea884SHong Zhang len_unused -= len; 117145f43ab7SHong Zhang len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */ 1172632d0f97SHong Zhang } 11735483b11dSHong Zhang } 11745483b11dSHong Zhang ierr = PetscFree(odata1);CHKERRQ(ierr); 117545f43ab7SHong Zhang ierr = PetscBTDestroy(otable);CHKERRQ(ierr); 1176632d0f97SHong Zhang 117745f43ab7SHong Zhang /* 3. Do local work on this processor's is[] */ 117845f43ab7SHong Zhang /*-------------------------------------------*/ 117945f43ab7SHong Zhang /* make sure there is enough unused space in odata2(=data) array */ 118045f43ab7SHong Zhang len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */ 118110eea884SHong Zhang if (len_unused < len_max){ /* allocate more space for odata2 */ 118213f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 1183240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 118410eea884SHong Zhang len_unused = len_est; 118510eea884SHong Zhang } 1186bfc6803cSHong Zhang 1187240e5896SHong Zhang data = odata2; 118845f43ab7SHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr); 118945f43ab7SHong Zhang ierr = PetscFree(data1_start);CHKERRQ(ierr); 119045f43ab7SHong Zhang 119145f43ab7SHong Zhang /* 4. Receive work done on other processors, then merge */ 119245f43ab7SHong Zhang /*------------------------------------------------------*/ 119345f43ab7SHong Zhang /* get max number of messages that this processor expects to recv */ 119413f74950SBarry Smith ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); 119513f74950SBarry Smith ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr); 119674ed9c26SBarry Smith ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr); 119745f43ab7SHong Zhang 1198632d0f97SHong Zhang k = 0; 11995483b11dSHong Zhang while (k < nrqs){ 1200632d0f97SHong Zhang /* Receive messages */ 1201c910923dSHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status); 1202632d0f97SHong Zhang if (flag){ 120313f74950SBarry Smith ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); 1204632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 120513f74950SBarry Smith ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); 12065483b11dSHong Zhang ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); 12075483b11dSHong Zhang if (len > 1+is_max){ /* Add data2 into data */ 12080472cc68SHong Zhang data2_i = data2 + 1 + is_max; 1209fc70d14eSHong Zhang for (i=0; i<is_max; i++){ 1210fc70d14eSHong Zhang table_i = table[i]; 1211bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs*i; 1212bfc6803cSHong Zhang isz = data[1+i]; 12130472cc68SHong Zhang for (j=0; j<data2[1+i]; j++){ 12140472cc68SHong Zhang col = data2_i[j]; 1215bfc6803cSHong Zhang if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;} 1216fc70d14eSHong Zhang } 1217bfc6803cSHong Zhang data[1+i] = isz; 12180472cc68SHong Zhang if (i < is_max - 1) data2_i += data2[1+i]; 1219fc70d14eSHong Zhang } 12205483b11dSHong Zhang } 1221632d0f97SHong Zhang k++; 1222632d0f97SHong Zhang } 12235483b11dSHong Zhang } 122445f43ab7SHong Zhang ierr = PetscFree(data2);CHKERRQ(ierr); 122574ed9c26SBarry Smith ierr = PetscFree2(table,t_p);CHKERRQ(ierr); 12265483b11dSHong Zhang 12275483b11dSHong Zhang /* phase 1 sends are complete */ 12285483b11dSHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 12290c468ba9SBarry Smith if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 12305483b11dSHong Zhang ierr = PetscFree(data1);CHKERRQ(ierr); 12315483b11dSHong Zhang 1232240e5896SHong Zhang /* phase 2 sends are complete */ 12330c468ba9SBarry Smith if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);} 123474ed9c26SBarry Smith ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr); 123545f43ab7SHong Zhang ierr = PetscFree(s_status);CHKERRQ(ierr); 1236632d0f97SHong Zhang 1237c910923dSHong Zhang /* 5. Create new is[] */ 1238c910923dSHong Zhang /*--------------------*/ 1239c910923dSHong Zhang for (i=0; i<is_max; i++) { 1240bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs*i; 124170b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 1242632d0f97SHong Zhang } 124345f43ab7SHong Zhang for (k=0; k<=nodata2; k++){ 124445f43ab7SHong Zhang ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr); 124545f43ab7SHong Zhang } 124645f43ab7SHong Zhang ierr = PetscFree(odata2_ptr);CHKERRQ(ierr); 12475483b11dSHong Zhang 1248632d0f97SHong Zhang PetscFunctionReturn(0); 1249632d0f97SHong Zhang } 1250632d0f97SHong Zhang 1251632d0f97SHong Zhang #undef __FUNCT__ 1252632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local" 1253632d0f97SHong Zhang /* 1254dc008846SHong Zhang MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do 1255632d0f97SHong Zhang the work on the local processor. 1256632d0f97SHong Zhang 1257632d0f97SHong Zhang Inputs: 1258632d0f97SHong Zhang C - MAT_MPISBAIJ; 1259bfc6803cSHong Zhang data - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format. 1260bfc6803cSHong Zhang whose - whose is[] to be processed, 1261bfc6803cSHong Zhang MINE: this processor's is[] 1262bfc6803cSHong Zhang OTHER: other processor's is[] 1263632d0f97SHong Zhang Output: 126410eea884SHong Zhang nidx - whose = MINE: 12650472cc68SHong Zhang holds input and newly found indices in the same format as data 12660472cc68SHong Zhang whose = OTHER: 12670472cc68SHong Zhang only holds the newly found indices 12680472cc68SHong Zhang table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'. 1269632d0f97SHong Zhang */ 127076f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */ 127113f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt *data,PetscInt whose,PetscInt *nidx,PetscBT *table) 1272632d0f97SHong Zhang { 1273632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 1274dc008846SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(c->A)->data; 1275dc008846SHong Zhang Mat_SeqBAIJ *b = (Mat_SeqBAIJ*)(c->B)->data; 1276dfbe8321SBarry Smith PetscErrorCode ierr; 127713f74950SBarry Smith PetscInt row,mbs,Mbs,*nidx_i,col,col_max,isz,isz0,*ai,*aj,*bi,*bj,*garray,rstart,l; 127813f74950SBarry Smith PetscInt a_start,a_end,b_start,b_end,i,j,k,is_max,*idx_i,n; 1279bfc6803cSHong Zhang PetscBT table0; /* mark the indices of input is[] for look up */ 1280bfc6803cSHong Zhang PetscBT table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */ 1281632d0f97SHong Zhang 1282632d0f97SHong Zhang PetscFunctionBegin; 128331f99336SHong Zhang Mbs = c->Mbs; mbs = a->mbs; 1284dc008846SHong Zhang ai = a->i; aj = a->j; 1285dc008846SHong Zhang bi = b->i; bj = b->j; 1286632d0f97SHong Zhang garray = c->garray; 1287899cda47SBarry Smith rstart = c->rstartbs; 1288dc008846SHong Zhang is_max = data[0]; 1289632d0f97SHong Zhang 1290fc70d14eSHong Zhang ierr = PetscBTCreate(Mbs,table0);CHKERRQ(ierr); 1291fc70d14eSHong Zhang 1292fc70d14eSHong Zhang nidx[0] = is_max; 1293fc70d14eSHong Zhang idx_i = data + is_max + 1; /* ptr to input is[0] array */ 1294bfc6803cSHong Zhang nidx_i = nidx + is_max + 1; /* ptr to output is[0] array */ 1295dc008846SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 1296dc008846SHong Zhang isz = 0; 1297fc70d14eSHong Zhang n = data[1+i]; /* size of input is[i] */ 1298dc008846SHong Zhang 129976f244e2SHong Zhang /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */ 1300bfc6803cSHong Zhang if (whose == MINE){ /* process this processor's is[] */ 1301bfc6803cSHong Zhang table_i = table[i]; 13020472cc68SHong Zhang nidx_i = nidx + 1+ is_max + Mbs*i; 1303bfc6803cSHong Zhang } else { /* process other processor's is[] - only use one temp table */ 1304430a0127SHong Zhang table_i = table[0]; 1305bfc6803cSHong Zhang } 1306bfc6803cSHong Zhang ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr); 1307bfc6803cSHong Zhang ierr = PetscBTMemzero(Mbs,table0);CHKERRQ(ierr); 130876f244e2SHong Zhang if (n==0) { 130976f244e2SHong Zhang nidx[1+i] = 0; /* size of new is[i] */ 131076f244e2SHong Zhang continue; 131176f244e2SHong Zhang } 131276f244e2SHong Zhang 131376f244e2SHong Zhang isz0 = 0; col_max = 0; 1314dc008846SHong Zhang for (j=0; j<n; j++){ 1315dc008846SHong Zhang col = idx_i[j]; 1316e32f2f54SBarry Smith if (col >= Mbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index col %D >= Mbs %D",col,Mbs); 1317bfc6803cSHong Zhang if(!PetscBTLookupSet(table_i,col)) { 1318bfc6803cSHong Zhang ierr = PetscBTSet(table0,col);CHKERRQ(ierr); 13190472cc68SHong Zhang if (whose == MINE) {nidx_i[isz0] = col;} 1320dbe03f88SHong Zhang if (col_max < col) col_max = col; 1321bfc6803cSHong Zhang isz0++; 1322bfc6803cSHong Zhang } 1323632d0f97SHong Zhang } 1324dc008846SHong Zhang 13250472cc68SHong Zhang if (whose == MINE) {isz = isz0;} 1326fc70d14eSHong Zhang k = 0; /* no. of indices from input is[i] that have been examined */ 1327dc008846SHong Zhang for (row=0; row<mbs; row++){ 1328dc008846SHong Zhang a_start = ai[row]; a_end = ai[row+1]; 1329dc008846SHong Zhang b_start = bi[row]; b_end = bi[row+1]; 13300472cc68SHong Zhang if (PetscBTLookup(table0,row+rstart)){ /* row is on input is[i]: 13310472cc68SHong Zhang do row search: collect all col in this row */ 1332dc008846SHong Zhang for (l = a_start; l<a_end ; l++){ /* Amat */ 1333dc008846SHong Zhang col = aj[l] + rstart; 1334fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;} 1335dc008846SHong Zhang } 1336dc008846SHong Zhang for (l = b_start; l<b_end ; l++){ /* Bmat */ 1337dc008846SHong Zhang col = garray[bj[l]]; 1338fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;} 1339dc008846SHong Zhang } 1340dc008846SHong Zhang k++; 1341dc008846SHong Zhang if (k >= isz0) break; /* for (row=0; row<mbs; row++) */ 13420472cc68SHong Zhang } else { /* row is not on input is[i]: 13430472cc68SHong Zhang do col serach: add row onto nidx_i if there is a col in nidx_i */ 1344dc008846SHong Zhang for (l = a_start; l<a_end ; l++){ /* Amat */ 1345dc008846SHong Zhang col = aj[l] + rstart; 134676f244e2SHong Zhang if (col > col_max) break; 1347dc008846SHong Zhang if (PetscBTLookup(table0,col)){ 1348fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;} 1349dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 1350632d0f97SHong Zhang } 1351632d0f97SHong Zhang } 1352dc008846SHong Zhang for (l = b_start; l<b_end ; l++){ /* Bmat */ 1353dc008846SHong Zhang col = garray[bj[l]]; 135476f244e2SHong Zhang if (col > col_max) break; 1355dc008846SHong Zhang if (PetscBTLookup(table0,col)){ 1356fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;} 1357dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 1358632d0f97SHong Zhang } 1359dc008846SHong Zhang } 1360dc008846SHong Zhang } 1361bfc6803cSHong Zhang } 1362dc008846SHong Zhang 1363dc008846SHong Zhang if (i < is_max - 1){ 1364fc70d14eSHong Zhang idx_i += n; /* ptr to input is[i+1] array */ 1365bfc6803cSHong Zhang nidx_i += isz; /* ptr to output is[i+1] array */ 1366dc008846SHong Zhang } 1367fc70d14eSHong Zhang nidx[1+i] = isz; /* size of new is[i] */ 13681ab97a2aSSatish Balay } /* for each is */ 1369dc008846SHong Zhang ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 1370632d0f97SHong Zhang 1371632d0f97SHong Zhang PetscFunctionReturn(0); 1372632d0f97SHong Zhang } 1373632d0f97SHong Zhang 1374632d0f97SHong Zhang 1375