xref: /petsc/src/mat/impls/sbaij/mpi/sbaijov.c (revision 487daaac4c62e068766c412e6a3e1f31f5193750)
1632d0f97SHong Zhang 
2632d0f97SHong Zhang /*
3632d0f97SHong Zhang    Routines to compute overlapping regions of a parallel MPI matrix.
4632d0f97SHong Zhang    Used for finding submatrices that were shared across processors.
5632d0f97SHong Zhang */
6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7c6db04a5SJed Brown #include <petscbt.h>
8632d0f97SHong Zhang 
913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat,PetscInt,IS*);
1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat,PetscInt*,PetscInt,PetscInt*,PetscBT*);
11632d0f97SHong Zhang 
12db41cccfSHong Zhang /* -------------------------------------------------------------------------*/
13db41cccfSHong Zhang /* This code is modified from MatGetSubMatrices_MPIBAIJ_local() */
14db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
15db41cccfSHong Zhang #undef __FUNCT__
16db41cccfSHong Zhang #define __FUNCT__ "PetscGetProc_ijonly"
17db41cccfSHong Zhang PetscErrorCode PetscGetProc_ijonly(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
18db41cccfSHong Zhang {
19db41cccfSHong Zhang   PetscInt    nGlobalNd = proc_gnode[size];
20db41cccfSHong Zhang   PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
21db41cccfSHong Zhang 
22db41cccfSHong Zhang   PetscFunctionBegin;
23db41cccfSHong Zhang   if (fproc > size) fproc = size;
24db41cccfSHong Zhang   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
25db41cccfSHong Zhang     if (row < proc_gnode[fproc]) fproc--;
26db41cccfSHong Zhang     else                         fproc++;
27db41cccfSHong Zhang   }
28db41cccfSHong Zhang   *rank = fproc;
29db41cccfSHong Zhang   PetscFunctionReturn(0);
30db41cccfSHong Zhang }
31db41cccfSHong Zhang #endif
32db41cccfSHong Zhang 
33db41cccfSHong Zhang #undef __FUNCT__
34db41cccfSHong Zhang #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local_ijonly"
35db41cccfSHong Zhang PetscErrorCode MatGetSubMatrices_MPIBAIJ_local_ijonly(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
36db41cccfSHong Zhang {
37db41cccfSHong Zhang   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
38db41cccfSHong Zhang   Mat            A = c->A;
39db41cccfSHong Zhang   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
40db41cccfSHong Zhang   const PetscInt **irow,**icol,*irow_i;
41db41cccfSHong Zhang   PetscInt       *nrow,*ncol,*w3,*w4,start;
42db41cccfSHong Zhang   PetscErrorCode ierr;
43db41cccfSHong Zhang   PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
44db41cccfSHong Zhang   PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
45db41cccfSHong Zhang   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
46db41cccfSHong Zhang   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
47db41cccfSHong Zhang   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
48db41cccfSHong Zhang   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
49db41cccfSHong Zhang   PetscInt       *a_j=a->j,*b_j=b->j,*cworkA,*cworkB; //bs=C->rmap->bs,bs2=c->bs2
50db41cccfSHong Zhang   PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
51db41cccfSHong Zhang   PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
52db41cccfSHong Zhang   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
53db41cccfSHong Zhang   //MPI_Request    *r_waits4,*s_waits4;
54db41cccfSHong Zhang   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
55db41cccfSHong Zhang   //MPI_Status     *r_status4,*s_status4;
56db41cccfSHong Zhang   MPI_Comm       comm;
57db41cccfSHong Zhang   //MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
58db41cccfSHong Zhang   //MatScalar      *a_a=a->a,*b_a=b->a;
59db41cccfSHong Zhang   PetscBool      flag;
60db41cccfSHong Zhang   PetscMPIInt    *onodes1,*olengths1;
61db41cccfSHong Zhang 
62db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
63db41cccfSHong Zhang   PetscInt       tt;
64db41cccfSHong Zhang   PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
65db41cccfSHong Zhang #else
66db41cccfSHong Zhang   PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
67db41cccfSHong Zhang #endif
68db41cccfSHong Zhang 
69db41cccfSHong Zhang   PetscFunctionBegin;
70db41cccfSHong Zhang   comm   = ((PetscObject)C)->comm;
71db41cccfSHong Zhang   tag0   = ((PetscObject)C)->tag;
72db41cccfSHong Zhang   size   = c->size;
73db41cccfSHong Zhang   rank   = c->rank;
74db41cccfSHong Zhang 
75db41cccfSHong Zhang   /* Get some new tags to keep the communication clean */
76db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
77db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
78db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
79db41cccfSHong Zhang 
80db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE)
81db41cccfSHong Zhang   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
82db41cccfSHong Zhang #else
83db41cccfSHong Zhang   ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
84db41cccfSHong Zhang   /* Create hash table for the mapping :row -> proc*/
85db41cccfSHong Zhang   for (i=0,j=0; i<size; i++) {
86db41cccfSHong Zhang     jmax = c->rowners[i+1];
87db41cccfSHong Zhang     for (; j<jmax; j++) {
88db41cccfSHong Zhang       rtable[j] = i;
89db41cccfSHong Zhang     }
90db41cccfSHong Zhang   }
91db41cccfSHong Zhang #endif
92db41cccfSHong Zhang 
93db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
94db41cccfSHong Zhang     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
95db41cccfSHong Zhang     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
96db41cccfSHong Zhang     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
97db41cccfSHong Zhang     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
98db41cccfSHong Zhang   }
99db41cccfSHong Zhang 
100db41cccfSHong Zhang   /* evaluate communication - mesg to who,length of mesg,and buffer space
101db41cccfSHong Zhang      required. Based on this, buffers are allocated, and data copied into them*/
102db41cccfSHong Zhang   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
103db41cccfSHong Zhang   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
104db41cccfSHong Zhang   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
105db41cccfSHong Zhang   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
106db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
107db41cccfSHong Zhang     ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
108db41cccfSHong Zhang     jmax   = nrow[i];
109db41cccfSHong Zhang     irow_i = irow[i];
110db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
111db41cccfSHong Zhang       row  = irow_i[j];
112db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
113db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
114db41cccfSHong Zhang #else
115db41cccfSHong Zhang       proc = rtable[row];
116db41cccfSHong Zhang #endif
117db41cccfSHong Zhang       w4[proc]++;
118db41cccfSHong Zhang     }
119db41cccfSHong Zhang     for (j=0; j<size; j++) {
120db41cccfSHong Zhang       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
121db41cccfSHong Zhang     }
122db41cccfSHong Zhang   }
123db41cccfSHong Zhang 
124db41cccfSHong Zhang   nrqs     = 0;              /* no of outgoing messages */
125db41cccfSHong Zhang   msz      = 0;              /* total mesg length for all proc */
126db41cccfSHong Zhang   w1[rank] = 0;              /* no mesg sent to intself */
127db41cccfSHong Zhang   w3[rank] = 0;
128db41cccfSHong Zhang   for (i=0; i<size; i++) {
129db41cccfSHong Zhang     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
130db41cccfSHong Zhang   }
131db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
132db41cccfSHong Zhang   for (i=0,j=0; i<size; i++) {
133db41cccfSHong Zhang     if (w1[i]) { pa[j] = i; j++; }
134db41cccfSHong Zhang   }
135db41cccfSHong Zhang 
136db41cccfSHong Zhang   /* Each message would have a header = 1 + 2*(no of IS) + data */
137db41cccfSHong Zhang   for (i=0; i<nrqs; i++) {
138db41cccfSHong Zhang     j     = pa[i];
139db41cccfSHong Zhang     w1[j] += w2[j] + 2* w3[j];
140db41cccfSHong Zhang     msz   += w1[j];
141db41cccfSHong Zhang   }
142db41cccfSHong Zhang 
143db41cccfSHong Zhang   /* Determine the number of messages to expect, their lengths, from from-ids */
144db41cccfSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
145db41cccfSHong Zhang   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
146db41cccfSHong Zhang 
147db41cccfSHong Zhang   /* Now post the Irecvs corresponding to these messages */
148db41cccfSHong Zhang   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
149db41cccfSHong Zhang 
150db41cccfSHong Zhang   ierr = PetscFree(onodes1);CHKERRQ(ierr);
151db41cccfSHong Zhang   ierr = PetscFree(olengths1);CHKERRQ(ierr);
152db41cccfSHong Zhang 
153db41cccfSHong Zhang   /* Allocate Memory for outgoing messages */
154db41cccfSHong Zhang   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
155db41cccfSHong Zhang   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
156db41cccfSHong Zhang   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
157db41cccfSHong Zhang   {
158db41cccfSHong Zhang     PetscInt *iptr = tmp,ict = 0;
159db41cccfSHong Zhang     for (i=0; i<nrqs; i++) {
160db41cccfSHong Zhang       j         = pa[i];
161db41cccfSHong Zhang       iptr     += ict;
162db41cccfSHong Zhang       sbuf1[j]  = iptr;
163db41cccfSHong Zhang       ict       = w1[j];
164db41cccfSHong Zhang     }
165db41cccfSHong Zhang   }
166db41cccfSHong Zhang 
167db41cccfSHong Zhang   /* Form the outgoing messages */
168db41cccfSHong Zhang   /* Initialise the header space */
169db41cccfSHong Zhang   for (i=0; i<nrqs; i++) {
170db41cccfSHong Zhang     j           = pa[i];
171db41cccfSHong Zhang     sbuf1[j][0] = 0;
172db41cccfSHong Zhang     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
173db41cccfSHong Zhang     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
174db41cccfSHong Zhang   }
175db41cccfSHong Zhang 
176db41cccfSHong Zhang   /* Parse the isrow and copy data into outbuf */
177db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
178db41cccfSHong Zhang     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
179db41cccfSHong Zhang     irow_i = irow[i];
180db41cccfSHong Zhang     jmax   = nrow[i];
181db41cccfSHong Zhang     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
182db41cccfSHong Zhang       row  = irow_i[j];
183db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
184db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
185db41cccfSHong Zhang #else
186db41cccfSHong Zhang       proc = rtable[row];
187db41cccfSHong Zhang #endif
188db41cccfSHong Zhang       if (proc != rank) { /* copy to the outgoing buf*/
189db41cccfSHong Zhang         ctr[proc]++;
190db41cccfSHong Zhang         *ptr[proc] = row;
191db41cccfSHong Zhang         ptr[proc]++;
192db41cccfSHong Zhang       }
193db41cccfSHong Zhang     }
194db41cccfSHong Zhang     /* Update the headers for the current IS */
195db41cccfSHong Zhang     for (j=0; j<size; j++) { /* Can Optimise this loop too */
196db41cccfSHong Zhang       if ((ctr_j = ctr[j])) {
197db41cccfSHong Zhang         sbuf1_j        = sbuf1[j];
198db41cccfSHong Zhang         k              = ++sbuf1_j[0];
199db41cccfSHong Zhang         sbuf1_j[2*k]   = ctr_j;
200db41cccfSHong Zhang         sbuf1_j[2*k-1] = i;
201db41cccfSHong Zhang       }
202db41cccfSHong Zhang     }
203db41cccfSHong Zhang   }
204db41cccfSHong Zhang 
205db41cccfSHong Zhang   /*  Now  post the sends */
206db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
207db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
208db41cccfSHong Zhang     j = pa[i];
209db41cccfSHong Zhang     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
210db41cccfSHong Zhang   }
211db41cccfSHong Zhang 
212db41cccfSHong Zhang   /* Post Recieves to capture the buffer size */
213db41cccfSHong Zhang   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
214db41cccfSHong Zhang   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
215db41cccfSHong Zhang   rbuf2[0] = tmp + msz;
216db41cccfSHong Zhang   for (i=1; i<nrqs; ++i) {
217db41cccfSHong Zhang     j        = pa[i];
218db41cccfSHong Zhang     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
219db41cccfSHong Zhang   }
220db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
221db41cccfSHong Zhang     j    = pa[i];
222db41cccfSHong Zhang     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
223db41cccfSHong Zhang   }
224db41cccfSHong Zhang 
225db41cccfSHong Zhang   /* Send to other procs the buf size they should allocate */
226db41cccfSHong Zhang 
227db41cccfSHong Zhang   /* Receive messages*/
228db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
229db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
230db41cccfSHong Zhang   ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
231db41cccfSHong Zhang   {
232db41cccfSHong Zhang     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
233db41cccfSHong Zhang     PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
234db41cccfSHong Zhang 
235db41cccfSHong Zhang     for (i=0; i<nrqr; ++i) {
236db41cccfSHong Zhang       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
237db41cccfSHong Zhang       req_size[idex] = 0;
238db41cccfSHong Zhang       rbuf1_i         = rbuf1[idex];
239db41cccfSHong Zhang       start           = 2*rbuf1_i[0] + 1;
240db41cccfSHong Zhang       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
241db41cccfSHong Zhang       ierr            = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
242db41cccfSHong Zhang       sbuf2_i         = sbuf2[idex];
243db41cccfSHong Zhang       for (j=start; j<end; j++) {
244db41cccfSHong Zhang         id               = rbuf1_i[j] - rstart;
245db41cccfSHong Zhang         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
246db41cccfSHong Zhang         sbuf2_i[j]       = ncols;
247db41cccfSHong Zhang         req_size[idex] += ncols;
248db41cccfSHong Zhang       }
249db41cccfSHong Zhang       req_source[idex] = r_status1[i].MPI_SOURCE;
250db41cccfSHong Zhang       /* form the header */
251db41cccfSHong Zhang       sbuf2_i[0]   = req_size[idex];
252db41cccfSHong Zhang       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
253db41cccfSHong Zhang       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
254db41cccfSHong Zhang     }
255db41cccfSHong Zhang   }
256db41cccfSHong Zhang   ierr = PetscFree(r_status1);CHKERRQ(ierr);
257db41cccfSHong Zhang   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
258db41cccfSHong Zhang 
259db41cccfSHong Zhang   /*  recv buffer sizes */
260db41cccfSHong Zhang   /* Receive messages*/
261db41cccfSHong Zhang 
262db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
263db41cccfSHong Zhang   //ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
264db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
265db41cccfSHong Zhang   //ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
266db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
267db41cccfSHong Zhang 
268db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
269db41cccfSHong Zhang     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
270db41cccfSHong Zhang     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
271db41cccfSHong Zhang     //ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
272db41cccfSHong Zhang     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
273db41cccfSHong Zhang     //ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
274db41cccfSHong Zhang   }
275db41cccfSHong Zhang   ierr = PetscFree(r_status2);CHKERRQ(ierr);
276db41cccfSHong Zhang   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
277db41cccfSHong Zhang 
278db41cccfSHong Zhang   /* Wait on sends1 and sends2 */
279db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
280db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
281db41cccfSHong Zhang 
282db41cccfSHong Zhang   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
283db41cccfSHong Zhang   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
284db41cccfSHong Zhang   ierr = PetscFree(s_status1);CHKERRQ(ierr);
285db41cccfSHong Zhang   ierr = PetscFree(s_status2);CHKERRQ(ierr);
286db41cccfSHong Zhang   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
287db41cccfSHong Zhang   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
288db41cccfSHong Zhang 
289db41cccfSHong Zhang   /* Now allocate buffers for a->j, and send them off */
290db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
291db41cccfSHong Zhang   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
292db41cccfSHong Zhang   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
293db41cccfSHong Zhang   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
294db41cccfSHong Zhang 
295db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
296db41cccfSHong Zhang   {
297db41cccfSHong Zhang      for (i=0; i<nrqr; i++) {
298db41cccfSHong Zhang       rbuf1_i   = rbuf1[i];
299db41cccfSHong Zhang       sbuf_aj_i = sbuf_aj[i];
300db41cccfSHong Zhang       ct1       = 2*rbuf1_i[0] + 1;
301db41cccfSHong Zhang       ct2       = 0;
302db41cccfSHong Zhang       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
303db41cccfSHong Zhang         kmax = rbuf1[i][2*j];
304db41cccfSHong Zhang         for (k=0; k<kmax; k++,ct1++) {
305db41cccfSHong Zhang           row    = rbuf1_i[ct1] - rstart;
306db41cccfSHong Zhang           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
307db41cccfSHong Zhang           ncols  = nzA + nzB;
308db41cccfSHong Zhang           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
309db41cccfSHong Zhang 
310db41cccfSHong Zhang           /* load the column indices for this row into cols*/
311db41cccfSHong Zhang           cols  = sbuf_aj_i + ct2;
312db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
313db41cccfSHong Zhang             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
314db41cccfSHong Zhang             else break;
315db41cccfSHong Zhang           }
316db41cccfSHong Zhang           imark = l;
317db41cccfSHong Zhang           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
318db41cccfSHong Zhang           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
319db41cccfSHong Zhang           ct2 += ncols;
320db41cccfSHong Zhang         }
321db41cccfSHong Zhang       }
322db41cccfSHong Zhang       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
323db41cccfSHong Zhang     }
324db41cccfSHong Zhang   }
325db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
326db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
327db41cccfSHong Zhang 
328db41cccfSHong Zhang #if defined(MV)
329db41cccfSHong Zhang   /* Allocate buffers for a->a, and send them off */
330db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
331db41cccfSHong Zhang   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
332db41cccfSHong Zhang   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
333db41cccfSHong Zhang   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
334db41cccfSHong Zhang 
335db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
336db41cccfSHong Zhang   {
337db41cccfSHong Zhang     for (i=0; i<nrqr; i++) {
338db41cccfSHong Zhang       rbuf1_i   = rbuf1[i];
339db41cccfSHong Zhang       sbuf_aa_i = sbuf_aa[i];
340db41cccfSHong Zhang       ct1       = 2*rbuf1_i[0]+1;
341db41cccfSHong Zhang       ct2       = 0;
342db41cccfSHong Zhang       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
343db41cccfSHong Zhang         kmax = rbuf1_i[2*j];
344db41cccfSHong Zhang         for (k=0; k<kmax; k++,ct1++) {
345db41cccfSHong Zhang           row    = rbuf1_i[ct1] - rstart;
346db41cccfSHong Zhang           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
347db41cccfSHong Zhang           ncols  = nzA + nzB;
348db41cccfSHong Zhang           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
349db41cccfSHong Zhang           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
350db41cccfSHong Zhang 
351db41cccfSHong Zhang           /* load the column values for this row into vals*/
352db41cccfSHong Zhang           vals  = sbuf_aa_i+ct2*bs2;
353db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
354db41cccfSHong Zhang             if ((bmap[cworkB[l]]) < cstart) {
355db41cccfSHong Zhang               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
356db41cccfSHong Zhang             }
357db41cccfSHong Zhang             else break;
358db41cccfSHong Zhang           }
359db41cccfSHong Zhang           imark = l;
360db41cccfSHong Zhang           for (l=0; l<nzA; l++) {
361db41cccfSHong Zhang             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
362db41cccfSHong Zhang           }
363db41cccfSHong Zhang           for (l=imark; l<nzB; l++) {
364db41cccfSHong Zhang             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
365db41cccfSHong Zhang           }
366db41cccfSHong Zhang           ct2 += ncols;
367db41cccfSHong Zhang         }
368db41cccfSHong Zhang       }
369db41cccfSHong Zhang       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
370db41cccfSHong Zhang     }
371db41cccfSHong Zhang   }
372db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
373db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
374db41cccfSHong Zhang #endif
375db41cccfSHong Zhang   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
376db41cccfSHong Zhang   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
377db41cccfSHong Zhang 
378db41cccfSHong Zhang   /* Form the matrix */
379db41cccfSHong Zhang   /* create col map */
380db41cccfSHong Zhang   {
381db41cccfSHong Zhang     const PetscInt *icol_i;
382db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
383db41cccfSHong Zhang     /* Create row map*/
384db41cccfSHong Zhang     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
385db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
386db41cccfSHong Zhang       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
387db41cccfSHong Zhang     }
388db41cccfSHong Zhang #else
389db41cccfSHong Zhang     ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
390db41cccfSHong Zhang     ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
391db41cccfSHong Zhang     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
392db41cccfSHong Zhang #endif
393db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
394db41cccfSHong Zhang       jmax   = ncol[i];
395db41cccfSHong Zhang       icol_i = icol[i];
396db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
397db41cccfSHong Zhang       lcol1_gcol1 = colmaps[i];
398db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
399db41cccfSHong Zhang 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
400db41cccfSHong Zhang       }
401db41cccfSHong Zhang #else
402db41cccfSHong Zhang       cmap_i = cmap[i];
403db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
404db41cccfSHong Zhang         cmap_i[icol_i[j]] = j+1;
405db41cccfSHong Zhang       }
406db41cccfSHong Zhang #endif
407db41cccfSHong Zhang     }
408db41cccfSHong Zhang   }
409db41cccfSHong Zhang 
410db41cccfSHong Zhang   /* Create lens which is required for MatCreate... */
411db41cccfSHong Zhang   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
412db41cccfSHong Zhang   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
413db41cccfSHong Zhang   lens[0] = (PetscInt*)(lens + ismax);
414db41cccfSHong Zhang   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
415db41cccfSHong Zhang   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
416db41cccfSHong Zhang 
417db41cccfSHong Zhang   /* Update lens from local data */
418db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
419db41cccfSHong Zhang     jmax   = nrow[i];
420db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
421db41cccfSHong Zhang     lcol1_gcol1 = colmaps[i];
422db41cccfSHong Zhang #else
423db41cccfSHong Zhang     cmap_i = cmap[i];
424db41cccfSHong Zhang #endif
425db41cccfSHong Zhang     irow_i = irow[i];
426db41cccfSHong Zhang     lens_i = lens[i];
427db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
428db41cccfSHong Zhang       row  = irow_i[j];
429db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
430db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
431db41cccfSHong Zhang #else
432db41cccfSHong Zhang       proc = rtable[row];
433db41cccfSHong Zhang #endif
434db41cccfSHong Zhang       if (proc == rank) {
435db41cccfSHong Zhang         /* Get indices from matA and then from matB */
436db41cccfSHong Zhang         row    = row - rstart;
437db41cccfSHong Zhang         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
438db41cccfSHong Zhang         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
439db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
440db41cccfSHong Zhang        for (k=0; k<nzA; k++) {
441db41cccfSHong Zhang 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
442db41cccfSHong Zhang           if (tt) { lens_i[j]++; }
443db41cccfSHong Zhang         }
444db41cccfSHong Zhang         for (k=0; k<nzB; k++) {
445db41cccfSHong Zhang 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
446db41cccfSHong Zhang           if (tt) { lens_i[j]++; }
447db41cccfSHong Zhang         }
448db41cccfSHong Zhang #else
449db41cccfSHong Zhang         for (k=0; k<nzA; k++) {
450db41cccfSHong Zhang           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
451db41cccfSHong Zhang         }
452db41cccfSHong Zhang         for (k=0; k<nzB; k++) {
453db41cccfSHong Zhang           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
454db41cccfSHong Zhang         }
455db41cccfSHong Zhang #endif
456db41cccfSHong Zhang       }
457db41cccfSHong Zhang     }
458db41cccfSHong Zhang   }
459db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
460db41cccfSHong Zhang   /* Create row map*/
461db41cccfSHong Zhang   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
462db41cccfSHong Zhang   for (i=0; i<ismax; i++){
463db41cccfSHong Zhang     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
464db41cccfSHong Zhang   }
465db41cccfSHong Zhang #else
466db41cccfSHong Zhang   /* Create row map*/
467db41cccfSHong Zhang   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
468db41cccfSHong Zhang   rmap[0] = (PetscInt*)(rmap + ismax);
469db41cccfSHong Zhang   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
470db41cccfSHong Zhang   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
471db41cccfSHong Zhang #endif
472db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
473db41cccfSHong Zhang     irow_i = irow[i];
474db41cccfSHong Zhang     jmax   = nrow[i];
475db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
476db41cccfSHong Zhang     lrow1_grow1 = rowmaps[i];
477db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
478db41cccfSHong Zhang       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
479db41cccfSHong Zhang     }
480db41cccfSHong Zhang #else
481db41cccfSHong Zhang     rmap_i = rmap[i];
482db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
483db41cccfSHong Zhang       rmap_i[irow_i[j]] = j;
484db41cccfSHong Zhang     }
485db41cccfSHong Zhang #endif
486db41cccfSHong Zhang   }
487db41cccfSHong Zhang 
488db41cccfSHong Zhang   /* Update lens from offproc data */
489db41cccfSHong Zhang   {
490db41cccfSHong Zhang     PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
491db41cccfSHong Zhang     PetscMPIInt ii;
492db41cccfSHong Zhang 
493db41cccfSHong Zhang     for (tmp2=0; tmp2<nrqs; tmp2++) {
494db41cccfSHong Zhang       ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
495db41cccfSHong Zhang       idex   = pa[ii];
496db41cccfSHong Zhang       sbuf1_i = sbuf1[idex];
497db41cccfSHong Zhang       jmax    = sbuf1_i[0];
498db41cccfSHong Zhang       ct1     = 2*jmax+1;
499db41cccfSHong Zhang       ct2     = 0;
500db41cccfSHong Zhang       rbuf2_i = rbuf2[ii];
501db41cccfSHong Zhang       rbuf3_i = rbuf3[ii];
502db41cccfSHong Zhang       for (j=1; j<=jmax; j++) {
503db41cccfSHong Zhang         is_no   = sbuf1_i[2*j-1];
504db41cccfSHong Zhang         max1    = sbuf1_i[2*j];
505db41cccfSHong Zhang         lens_i  = lens[is_no];
506db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
507db41cccfSHong Zhang 	lcol1_gcol1 = colmaps[is_no];
508db41cccfSHong Zhang 	lrow1_grow1 = rowmaps[is_no];
509db41cccfSHong Zhang #else
510db41cccfSHong Zhang         cmap_i  = cmap[is_no];
511db41cccfSHong Zhang 	rmap_i  = rmap[is_no];
512db41cccfSHong Zhang #endif
513db41cccfSHong Zhang         for (k=0; k<max1; k++,ct1++) {
514db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
515db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
516db41cccfSHong Zhang           row--;
517db41cccfSHong Zhang           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
518db41cccfSHong Zhang #else
519db41cccfSHong Zhang           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
520db41cccfSHong Zhang #endif
521db41cccfSHong Zhang           max2 = rbuf2_i[ct1];
522db41cccfSHong Zhang           for (l=0; l<max2; l++,ct2++) {
523db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
524db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
525db41cccfSHong Zhang 	    if (tt) {
526db41cccfSHong Zhang               lens_i[row]++;
527db41cccfSHong Zhang             }
528db41cccfSHong Zhang #else
529db41cccfSHong Zhang             if (cmap_i[rbuf3_i[ct2]]) {
530db41cccfSHong Zhang               lens_i[row]++;
531db41cccfSHong Zhang             }
532db41cccfSHong Zhang #endif
533db41cccfSHong Zhang           }
534db41cccfSHong Zhang         }
535db41cccfSHong Zhang       }
536db41cccfSHong Zhang     }
537db41cccfSHong Zhang   }
538db41cccfSHong Zhang   ierr = PetscFree(r_status3);CHKERRQ(ierr);
539db41cccfSHong Zhang   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
540db41cccfSHong Zhang   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
541db41cccfSHong Zhang   ierr = PetscFree(s_status3);CHKERRQ(ierr);
542db41cccfSHong Zhang   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
543db41cccfSHong Zhang 
544db41cccfSHong Zhang   /* Create the submatrices */
545db41cccfSHong Zhang   if (scall == MAT_REUSE_MATRIX) {
546db41cccfSHong Zhang     /*
547db41cccfSHong Zhang         Assumes new rows are same length as the old rows, hence bug!
548db41cccfSHong Zhang     */
549db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
550db41cccfSHong Zhang       mat = (Mat_SeqBAIJ *)(submats[i]->data);
551db41cccfSHong Zhang       //if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
552db41cccfSHong Zhang       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
553db41cccfSHong Zhang       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
554db41cccfSHong Zhang       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
555db41cccfSHong Zhang       /* Initial matrix as if empty */
556db41cccfSHong Zhang       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
557db41cccfSHong Zhang       submats[i]->factortype = C->factortype;
558db41cccfSHong Zhang     }
559db41cccfSHong Zhang   } else {
560db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
561db41cccfSHong Zhang       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
562db41cccfSHong Zhang       //ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
563db41cccfSHong Zhang       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr);
564db41cccfSHong Zhang       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
565db41cccfSHong Zhang       ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
566db41cccfSHong Zhang       //ierr = MatSeqBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
567db41cccfSHong Zhang       ierr = MatSeqSBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
568db41cccfSHong Zhang       //ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by MatGetSubMatrices_MPISBAIJ()*/
569db41cccfSHong Zhang     }
570db41cccfSHong Zhang   }
571db41cccfSHong Zhang 
572db41cccfSHong Zhang   /* Assemble the matrices */
573db41cccfSHong Zhang   /* First assemble the local rows */
574db41cccfSHong Zhang   {
575db41cccfSHong Zhang     PetscInt       ilen_row,*imat_ilen,*imat_j,*imat_i;
576db41cccfSHong Zhang     //MatScalar *imat_a;
577db41cccfSHong Zhang 
578db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
579db41cccfSHong Zhang       mat       = (Mat_SeqBAIJ*)submats[i]->data;
580db41cccfSHong Zhang       imat_ilen = mat->ilen;
581db41cccfSHong Zhang       imat_j    = mat->j;
582db41cccfSHong Zhang       imat_i    = mat->i;
583db41cccfSHong Zhang       //imat_a    = mat->a;
584db41cccfSHong Zhang 
585db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
586db41cccfSHong Zhang       lcol1_gcol1 = colmaps[i];
587db41cccfSHong Zhang       lrow1_grow1 = rowmaps[i];
588db41cccfSHong Zhang #else
589db41cccfSHong Zhang       cmap_i    = cmap[i];
590db41cccfSHong Zhang       rmap_i    = rmap[i];
591db41cccfSHong Zhang #endif
592db41cccfSHong Zhang       irow_i    = irow[i];
593db41cccfSHong Zhang       jmax      = nrow[i];
594db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
595db41cccfSHong Zhang         row      = irow_i[j];
596db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
597db41cccfSHong Zhang 	ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
598db41cccfSHong Zhang #else
599db41cccfSHong Zhang 	proc = rtable[row];
600db41cccfSHong Zhang #endif
601db41cccfSHong Zhang         if (proc == rank) {
602db41cccfSHong Zhang           row      = row - rstart;
603db41cccfSHong Zhang           nzA      = a_i[row+1] - a_i[row];
604db41cccfSHong Zhang           nzB      = b_i[row+1] - b_i[row];
605db41cccfSHong Zhang           cworkA   = a_j + a_i[row];
606db41cccfSHong Zhang           cworkB   = b_j + b_i[row];
607db41cccfSHong Zhang           //vworkA   = a_a + a_i[row]*bs2;
608db41cccfSHong Zhang           //vworkB   = b_a + b_i[row]*bs2;
609db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
610db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
611db41cccfSHong Zhang           row--;
612db41cccfSHong Zhang           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
613db41cccfSHong Zhang #else
614db41cccfSHong Zhang           row      = rmap_i[row + rstart];
615db41cccfSHong Zhang #endif
616db41cccfSHong Zhang           mat_i    = imat_i[row];
617db41cccfSHong Zhang           //mat_a    = imat_a + mat_i*bs2;
618db41cccfSHong Zhang           mat_j    = imat_j + mat_i;
619db41cccfSHong Zhang           ilen_row = imat_ilen[row];
620db41cccfSHong Zhang 
621db41cccfSHong Zhang           /* load the column indices for this row into cols*/
622db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
623db41cccfSHong Zhang 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
624db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
625db41cccfSHong Zhang 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
626db41cccfSHong Zhang 	      if (tcol) {
627db41cccfSHong Zhang #else
628db41cccfSHong Zhang               if ((tcol = cmap_i[ctmp])) {
629db41cccfSHong Zhang #endif
630db41cccfSHong Zhang                 *mat_j++ = tcol - 1;
631db41cccfSHong Zhang                 //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
632db41cccfSHong Zhang                 //mat_a   += bs2;
633db41cccfSHong Zhang                 ilen_row++;
634db41cccfSHong Zhang               }
635db41cccfSHong Zhang             } else break;
636db41cccfSHong Zhang           }
637db41cccfSHong Zhang           imark = l;
638db41cccfSHong Zhang           for (l=0; l<nzA; l++) {
639db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
640db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
641db41cccfSHong Zhang 	    if (tcol) {
642db41cccfSHong Zhang #else
643db41cccfSHong Zhang 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
644db41cccfSHong Zhang #endif
645db41cccfSHong Zhang               *mat_j++ = tcol - 1;
646db41cccfSHong Zhang               //ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
647db41cccfSHong Zhang               //mat_a   += bs2;
648db41cccfSHong Zhang               ilen_row++;
649db41cccfSHong Zhang             }
650db41cccfSHong Zhang           }
651db41cccfSHong Zhang           for (l=imark; l<nzB; l++) {
652db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
653db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
654db41cccfSHong Zhang 	    if (tcol) {
655db41cccfSHong Zhang #else
656db41cccfSHong Zhang             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
657db41cccfSHong Zhang #endif
658db41cccfSHong Zhang               *mat_j++ = tcol - 1;
659db41cccfSHong Zhang               //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
660db41cccfSHong Zhang               //mat_a   += bs2;
661db41cccfSHong Zhang               ilen_row++;
662db41cccfSHong Zhang             }
663db41cccfSHong Zhang           }
664db41cccfSHong Zhang           imat_ilen[row] = ilen_row;
665db41cccfSHong Zhang         }
666db41cccfSHong Zhang       }
667db41cccfSHong Zhang     }
668db41cccfSHong Zhang   }
669db41cccfSHong Zhang 
670db41cccfSHong Zhang   /*   Now assemble the off proc rows*/
671db41cccfSHong Zhang   {
672db41cccfSHong Zhang     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
673db41cccfSHong Zhang     PetscInt    *imat_j,*imat_i;
674db41cccfSHong Zhang     //MatScalar   *imat_a,*rbuf4_i;
675db41cccfSHong Zhang     PetscMPIInt ii;
676db41cccfSHong Zhang 
677db41cccfSHong Zhang     for (tmp2=0; tmp2<nrqs; tmp2++) {
678db41cccfSHong Zhang       //ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
679db41cccfSHong Zhang       ii = tmp2;  //new
680db41cccfSHong Zhang       idex   = pa[ii];
681db41cccfSHong Zhang       sbuf1_i = sbuf1[idex];
682db41cccfSHong Zhang       jmax    = sbuf1_i[0];
683db41cccfSHong Zhang       ct1     = 2*jmax + 1;
684db41cccfSHong Zhang       ct2     = 0;
685db41cccfSHong Zhang       rbuf2_i = rbuf2[ii];
686db41cccfSHong Zhang       rbuf3_i = rbuf3[ii];
687db41cccfSHong Zhang       //rbuf4_i = rbuf4[ii];
688db41cccfSHong Zhang       for (j=1; j<=jmax; j++) {
689db41cccfSHong Zhang         is_no     = sbuf1_i[2*j-1];
690db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
691db41cccfSHong Zhang 	lrow1_grow1 = rowmaps[is_no];
692db41cccfSHong Zhang 	lcol1_gcol1 = colmaps[is_no];
693db41cccfSHong Zhang #else
694db41cccfSHong Zhang         rmap_i    = rmap[is_no];
695db41cccfSHong Zhang         cmap_i    = cmap[is_no];
696db41cccfSHong Zhang #endif
697db41cccfSHong Zhang         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
698db41cccfSHong Zhang         imat_ilen = mat->ilen;
699db41cccfSHong Zhang         imat_j    = mat->j;
700db41cccfSHong Zhang         imat_i    = mat->i;
701db41cccfSHong Zhang         //imat_a    = mat->a;
702db41cccfSHong Zhang         max1      = sbuf1_i[2*j];
703db41cccfSHong Zhang         for (k=0; k<max1; k++,ct1++) {
704db41cccfSHong Zhang           row   = sbuf1_i[ct1];
705db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
706db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
707db41cccfSHong Zhang           row--;
708db41cccfSHong Zhang           if(row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
709db41cccfSHong Zhang #else
710db41cccfSHong Zhang           row   = rmap_i[row];
711db41cccfSHong Zhang #endif
712db41cccfSHong Zhang           ilen  = imat_ilen[row];
713db41cccfSHong Zhang           mat_i = imat_i[row];
714db41cccfSHong Zhang           //mat_a = imat_a + mat_i*bs2;
715db41cccfSHong Zhang           mat_j = imat_j + mat_i;
716db41cccfSHong Zhang           max2 = rbuf2_i[ct1];
717db41cccfSHong Zhang           for (l=0; l<max2; l++,ct2++) {
718db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
719db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
720db41cccfSHong Zhang 	    if (tcol) {
721db41cccfSHong Zhang #else
722db41cccfSHong Zhang 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
723db41cccfSHong Zhang #endif
724db41cccfSHong Zhang               *mat_j++    = tcol - 1;
725db41cccfSHong Zhang               /* *mat_a++= rbuf4_i[ct2]; */
726db41cccfSHong Zhang               //ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
727db41cccfSHong Zhang               //mat_a      += bs2;
728db41cccfSHong Zhang               ilen++;
729db41cccfSHong Zhang             }
730db41cccfSHong Zhang           }
731db41cccfSHong Zhang           imat_ilen[row] = ilen;
732db41cccfSHong Zhang         }
733db41cccfSHong Zhang       }
734db41cccfSHong Zhang     }
735db41cccfSHong Zhang   }
736db41cccfSHong Zhang     //ierr = PetscFree(r_status4);CHKERRQ(ierr);
737db41cccfSHong Zhang     //ierr = PetscFree(r_waits4);CHKERRQ(ierr);
738db41cccfSHong Zhang     //if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
739db41cccfSHong Zhang     //ierr = PetscFree(s_waits4);CHKERRQ(ierr);
740db41cccfSHong Zhang     //ierr = PetscFree(s_status4);CHKERRQ(ierr);
741db41cccfSHong Zhang 
742db41cccfSHong Zhang   /* Restore the indices */
743db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
744db41cccfSHong Zhang     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
745db41cccfSHong Zhang     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
746db41cccfSHong Zhang   }
747db41cccfSHong Zhang 
748db41cccfSHong Zhang   /* Destroy allocated memory */
749db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE)
750db41cccfSHong Zhang   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
751db41cccfSHong Zhang #else
752db41cccfSHong Zhang   ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
753db41cccfSHong Zhang #endif
754db41cccfSHong Zhang   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
755db41cccfSHong Zhang   ierr = PetscFree(pa);CHKERRQ(ierr);
756db41cccfSHong Zhang 
757db41cccfSHong Zhang   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
758db41cccfSHong Zhang   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
759db41cccfSHong Zhang   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
760db41cccfSHong Zhang   for (i=0; i<nrqr; ++i) {
761db41cccfSHong Zhang     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
762db41cccfSHong Zhang   }
763db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
764db41cccfSHong Zhang     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
765db41cccfSHong Zhang     //ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
766db41cccfSHong Zhang   }
767db41cccfSHong Zhang   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
768db41cccfSHong Zhang   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
769db41cccfSHong Zhang   //ierr = PetscFree(rbuf4);CHKERRQ(ierr);
770db41cccfSHong Zhang   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
771db41cccfSHong Zhang   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
772db41cccfSHong Zhang   //ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
773db41cccfSHong Zhang   //ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
774db41cccfSHong Zhang 
775db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
776db41cccfSHong Zhang   for (i=0; i<ismax; i++){
777db41cccfSHong Zhang     ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
778db41cccfSHong Zhang     ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
779db41cccfSHong Zhang   }
780db41cccfSHong Zhang   ierr = PetscFree(colmaps);CHKERRQ(ierr);
781db41cccfSHong Zhang   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
782db41cccfSHong Zhang #else
783db41cccfSHong Zhang   ierr = PetscFree(rmap);CHKERRQ(ierr);
784db41cccfSHong Zhang   ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
785db41cccfSHong Zhang   ierr = PetscFree(cmap);CHKERRQ(ierr);
786db41cccfSHong Zhang #endif
787db41cccfSHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
788db41cccfSHong Zhang 
789db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
790db41cccfSHong Zhang     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
791db41cccfSHong Zhang     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
792db41cccfSHong Zhang   }
793db41cccfSHong Zhang   PetscFunctionReturn(0);
794db41cccfSHong Zhang }
795db41cccfSHong Zhang 
796db41cccfSHong Zhang /* ------------------------------------------------------- */
797db41cccfSHong Zhang 
798632d0f97SHong Zhang #undef __FUNCT__
799632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ"
80013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov)
801632d0f97SHong Zhang {
8026849ba73SBarry Smith   PetscErrorCode ierr;
803d0f46423SBarry Smith   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
804632d0f97SHong Zhang   IS             *is_new;
805632d0f97SHong Zhang 
806632d0f97SHong Zhang   PetscFunctionBegin;
807c910923dSHong Zhang   ierr = PetscMalloc(is_max*sizeof(IS),&is_new);CHKERRQ(ierr);
808632d0f97SHong Zhang   /* Convert the indices into block format */
80927f478b1SHong Zhang   ierr = ISCompressIndicesGeneral(N,bs,is_max,is,is_new);CHKERRQ(ierr);
810e32f2f54SBarry Smith   if (ov < 0){ SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
811db41cccfSHong Zhang 
812db41cccfSHong Zhang   //------- new ----------
813db41cccfSHong Zhang   PetscBool flg=PETSC_FALSE;
814db41cccfSHong Zhang   ierr = PetscOptionsHasName(PETSC_NULL, "-IncreaseOverlap_new", &flg);CHKERRQ(ierr);
815db41cccfSHong Zhang   if (!flg){ /* previous non-scalable implementation */
816632d0f97SHong Zhang     for (i=0; i<ov; ++i) {
817c910923dSHong Zhang       ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
818632d0f97SHong Zhang     }
819db41cccfSHong Zhang   } else { /* scalable implementation using modified BAIJ routines */
820db41cccfSHong Zhang 
821db41cccfSHong Zhang   Mat           *submats;
822db41cccfSHong Zhang   IS            *is_row;
823db41cccfSHong Zhang   PetscInt      M=C->rmap->N,Mbs=M/bs,*nidx,isz,iov;
824db41cccfSHong Zhang   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
825db41cccfSHong Zhang   PetscMPIInt   rank=c->rank;
826db41cccfSHong Zhang 
827db41cccfSHong Zhang   ierr = PetscMalloc((Mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
828db41cccfSHong Zhang 
829db41cccfSHong Zhang   /* Create is_row */
830db41cccfSHong Zhang   ierr = PetscMalloc(is_max*sizeof(IS **),&is_row);CHKERRQ(ierr);
831*487daaacSHong Zhang   ierr = ISCreateStride(PETSC_COMM_SELF,Mbs,0,1,&is_row[0]);CHKERRQ(ierr);
832*487daaacSHong Zhang   for (i=1; i<is_max; i++) is_row[i] = is_row[0]; /* reuse is_row[0] */
833db41cccfSHong Zhang 
834db41cccfSHong Zhang   for (iov=0; iov<ov; ++iov) {
835*487daaacSHong Zhang     /* Get submats for column search - Modified from MatGetSubMatrices_MPIBAIJ() */
836*487daaacSHong Zhang 
837db41cccfSHong Zhang     /* Allocate memory to hold all the submatrices */
838db41cccfSHong Zhang     ierr = PetscMalloc((is_max+1)*sizeof(Mat),&submats);CHKERRQ(ierr);
839db41cccfSHong Zhang     /* Determine the number of stages through which submatrices are done */
840db41cccfSHong Zhang     PetscInt nmax,nstages_local,nstages,max_no,pos;
841db41cccfSHong Zhang     nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
842db41cccfSHong Zhang     if (!nmax) nmax = 1;
843db41cccfSHong Zhang     nstages_local = is_max/nmax + ((is_max % nmax)?1:0);
844db41cccfSHong Zhang 
845db41cccfSHong Zhang     /* Make sure every processor loops through the nstages */
846db41cccfSHong Zhang     ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
847db41cccfSHong Zhang     for (i=0,pos=0; i<nstages; i++) {
848db41cccfSHong Zhang       if (pos+nmax <= is_max) max_no = nmax;
849db41cccfSHong Zhang       else if (pos == is_max) max_no = 0;
850db41cccfSHong Zhang       else                   max_no = is_max-pos;
851db41cccfSHong Zhang 
852db41cccfSHong Zhang       ierr = MatGetSubMatrices_MPIBAIJ_local_ijonly(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr);
853db41cccfSHong Zhang       if (rank == 10 && !i){
854db41cccfSHong Zhang         printf("submats[0]:\n");
855db41cccfSHong Zhang         ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
856db41cccfSHong Zhang       }
857db41cccfSHong Zhang       pos += max_no;
858db41cccfSHong Zhang     }
859db41cccfSHong Zhang 
860db41cccfSHong Zhang     /* Row search */
861db41cccfSHong Zhang     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
862db41cccfSHong Zhang 
863db41cccfSHong Zhang     /* Column search */
864db41cccfSHong Zhang     PetscBT        table;
865db41cccfSHong Zhang     Mat_SeqSBAIJ   *asub_i;
866db41cccfSHong Zhang     PetscInt       *ai,brow,nz,nis,l;
867db41cccfSHong Zhang     const PetscInt *idx;
868db41cccfSHong Zhang 
869db41cccfSHong Zhang     ierr = PetscBTCreate(Mbs,table);CHKERRQ(ierr);
870db41cccfSHong Zhang     for (i=0; i<is_max; i++){
871db41cccfSHong Zhang       asub_i = (Mat_SeqSBAIJ*)submats[i]->data;
872db41cccfSHong Zhang       ai=asub_i->i;;
873db41cccfSHong Zhang 
874db41cccfSHong Zhang       /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
875db41cccfSHong Zhang       ierr = PetscBTMemzero(Mbs,table);CHKERRQ(ierr);
876db41cccfSHong Zhang 
877db41cccfSHong Zhang       ierr = ISGetIndices(is_new[i],&idx);CHKERRQ(ierr);
878db41cccfSHong Zhang       ierr = ISGetLocalSize(is_new[i],&nis);CHKERRQ(ierr);
879db41cccfSHong Zhang       for (l=0; l<nis; l++) {
880db41cccfSHong Zhang         ierr = PetscBTSet(table,idx[l]);CHKERRQ(ierr);
881db41cccfSHong Zhang         nidx[l] = idx[l];
882db41cccfSHong Zhang       }
883db41cccfSHong Zhang       isz = nis;
884db41cccfSHong Zhang 
885db41cccfSHong Zhang       for (brow=0; brow<Mbs; brow++){
886db41cccfSHong Zhang         nz = ai[brow+1] - ai[brow];
887db41cccfSHong Zhang         if (nz) {
888db41cccfSHong Zhang           if (!PetscBTLookupSet(table,brow)) nidx[isz++] = brow;
889db41cccfSHong Zhang         }
890db41cccfSHong Zhang       }
891db41cccfSHong Zhang       ierr = ISRestoreIndices(is_new[i],&idx);CHKERRQ(ierr);
892db41cccfSHong Zhang       ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);
893db41cccfSHong Zhang 
894db41cccfSHong Zhang       /* create updated is_new */
895db41cccfSHong Zhang       ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,PETSC_COPY_VALUES,is_new+i);CHKERRQ(ierr);
896db41cccfSHong Zhang     }
897db41cccfSHong Zhang 
898db41cccfSHong Zhang     /* Free tmp spaces */
899db41cccfSHong Zhang     ierr = PetscBTDestroy(table);CHKERRQ(ierr);
900db41cccfSHong Zhang     for (i=0; i<is_max; i++){
901db41cccfSHong Zhang       ierr = MatDestroy(submats[i]);CHKERRQ(ierr);
902db41cccfSHong Zhang     }
903db41cccfSHong Zhang     ierr = PetscFree(submats);CHKERRQ(ierr);
904db41cccfSHong Zhang   }
905*487daaacSHong Zhang   ierr = ISDestroy(is_row[0]);CHKERRQ(ierr);
906db41cccfSHong Zhang   ierr = PetscFree(is_row);CHKERRQ(ierr);
907db41cccfSHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
908db41cccfSHong Zhang   }
909db41cccfSHong Zhang   //--------end of new----------
910db41cccfSHong Zhang 
911c910923dSHong Zhang   for (i=0; i<is_max; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
91227f478b1SHong Zhang   ierr = ISExpandIndicesGeneral(N,bs,is_max,is_new,is);CHKERRQ(ierr);
913db41cccfSHong Zhang 
914c910923dSHong Zhang   for (i=0; i<is_max; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
915632d0f97SHong Zhang   ierr = PetscFree(is_new);CHKERRQ(ierr);
916632d0f97SHong Zhang   PetscFunctionReturn(0);
917632d0f97SHong Zhang }
918632d0f97SHong Zhang 
9194a69c536SBarry Smith typedef enum {MINE,OTHER} WhoseOwner;
9200472cc68SHong Zhang /*  data1, odata1 and odata2 are packed in the format (for communication):
921a2a9f22aSHong Zhang        data[0]          = is_max, no of is
922a2a9f22aSHong Zhang        data[1]          = size of is[0]
923a2a9f22aSHong Zhang         ...
924a2a9f22aSHong Zhang        data[is_max]     = size of is[is_max-1]
925a2a9f22aSHong Zhang        data[is_max + 1] = data(is[0])
926a2a9f22aSHong Zhang         ...
927a2a9f22aSHong Zhang        data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
928a2a9f22aSHong Zhang         ...
9290472cc68SHong Zhang    data2 is packed in the format (for creating output is[]):
9300472cc68SHong Zhang        data[0]          = is_max, no of is
9310472cc68SHong Zhang        data[1]          = size of is[0]
9320472cc68SHong Zhang         ...
9330472cc68SHong Zhang        data[is_max]     = size of is[is_max-1]
9340472cc68SHong Zhang        data[is_max + 1] = data(is[0])
9350472cc68SHong Zhang         ...
9360472cc68SHong Zhang        data[is_max + 1 + Mbs*i) = data(is[i])
9370472cc68SHong Zhang         ...
938a2a9f22aSHong Zhang */
939632d0f97SHong Zhang #undef __FUNCT__
940632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once"
94113f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])
942632d0f97SHong Zhang {
943632d0f97SHong Zhang   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
9446849ba73SBarry Smith   PetscErrorCode ierr;
94513f74950SBarry Smith   PetscMPIInt    size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len;
9465d0c19d7SBarry Smith   const PetscInt *idx_i;
9475d0c19d7SBarry Smith   PetscInt       idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i,
94813f74950SBarry Smith                  Mbs,i,j,k,*odata1,*odata2,
94913f74950SBarry Smith                  proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est;
95013f74950SBarry Smith   PetscInt       proc_end=0,*iwork,len_unused,nodata2;
95113f74950SBarry Smith   PetscInt       ois_max; /* max no of is[] in each of processor */
952bfc6803cSHong Zhang   char           *t_p;
953632d0f97SHong Zhang   MPI_Comm       comm;
954e8527bf2SHong Zhang   MPI_Request    *s_waits1,*s_waits2,r_req;
955632d0f97SHong Zhang   MPI_Status     *s_status,r_status;
95645f43ab7SHong Zhang   PetscBT        *table;  /* mark indices of this processor's is[] */
957fc70d14eSHong Zhang   PetscBT        table_i;
958bfc6803cSHong Zhang   PetscBT        otable; /* mark indices of other processors' is[] */
959d0f46423SBarry Smith   PetscInt       bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners;
96010eea884SHong Zhang   IS             garray_local,garray_gl;
9615483b11dSHong Zhang 
962632d0f97SHong Zhang   PetscFunctionBegin;
9637adad957SLisandro Dalcin   comm = ((PetscObject)C)->comm;
964632d0f97SHong Zhang   size = c->size;
965632d0f97SHong Zhang   rank = c->rank;
966632d0f97SHong Zhang   Mbs  = c->Mbs;
967632d0f97SHong Zhang 
968c910923dSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
969c910923dSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
970c910923dSHong Zhang 
971430a0127SHong Zhang   /* create tables used in
972430a0127SHong Zhang      step 1: table[i] - mark c->garray of proc [i]
97345f43ab7SHong Zhang      step 3: table[i] - mark indices of is[i] when whose=MINE
974430a0127SHong Zhang              table[0] - mark incideces of is[] when whose=OTHER */
975430a0127SHong Zhang   len = PetscMax(is_max, size);CHKERRQ(ierr);
97674ed9c26SBarry Smith   ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr);
977430a0127SHong Zhang   for (i=0; i<len; i++) {
978430a0127SHong Zhang     table[i]  = t_p  + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
979430a0127SHong Zhang   }
980430a0127SHong Zhang 
98113f74950SBarry Smith   ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
98210eea884SHong Zhang 
9835483b11dSHong Zhang   /* 1. Send this processor's is[] to other processors */
9845483b11dSHong Zhang   /*---------------------------------------------------*/
985e8527bf2SHong Zhang   /* allocate spaces */
98613f74950SBarry Smith   ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr);
9875483b11dSHong Zhang   len = 0;
988c910923dSHong Zhang   for (i=0; i<is_max; i++) {
989632d0f97SHong Zhang     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
990632d0f97SHong Zhang     len += n[i];
991632d0f97SHong Zhang   }
992958c9bccSBarry Smith   if (!len) {
9935483b11dSHong Zhang     is_max = 0;
9945483b11dSHong Zhang   } else {
9955483b11dSHong Zhang     len += 1 + is_max; /* max length of data1 for one processor */
9965483b11dSHong Zhang   }
9975483b11dSHong Zhang 
998430a0127SHong Zhang 
99913f74950SBarry Smith   ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr);
100013f74950SBarry Smith   ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr);
10015483b11dSHong Zhang   for (i=0; i<size; i++) data1_start[i] = data1 + i*len;
10025483b11dSHong Zhang 
100340cb64c9SJed Brown   ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr);
1004e8527bf2SHong Zhang 
100576f244e2SHong Zhang   /* gather c->garray from all processors */
100670b3c8c7SBarry Smith   ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);CHKERRQ(ierr);
100776f244e2SHong Zhang   ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr);
100876f244e2SHong Zhang   ierr = ISDestroy(garray_local);CHKERRQ(ierr);
1009a7cc72afSBarry Smith   ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr);
101076f244e2SHong Zhang   Bowners[0] = 0;
101176f244e2SHong Zhang   for (i=0; i<size; i++) Bowners[i+1] += Bowners[i];
101276f244e2SHong Zhang 
10135483b11dSHong Zhang   if (is_max){
1014430a0127SHong Zhang     /* hash table ctable which maps c->row to proc_id) */
101513f74950SBarry Smith     ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr);
10165483b11dSHong Zhang     for (proc_id=0,j=0; proc_id<size; proc_id++) {
1017288a2dc7SJed Brown       for (; j<C->rmap->range[proc_id+1]/bs; j++) {
10185483b11dSHong Zhang         ctable[j] = proc_id;
10195483b11dSHong Zhang       }
10205483b11dSHong Zhang     }
10215483b11dSHong Zhang 
1022430a0127SHong Zhang     /* hash tables marking c->garray */
102310eea884SHong Zhang     ierr = ISGetIndices(garray_gl,&idx_i);
10245483b11dSHong Zhang     for (i=0; i<size; i++){
1025430a0127SHong Zhang       table_i = table[i];
1026430a0127SHong Zhang       ierr    = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1027430a0127SHong Zhang       for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/
1028430a0127SHong Zhang         ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr);
10295483b11dSHong Zhang       }
10305483b11dSHong Zhang     }
103110eea884SHong Zhang     ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr);
10325483b11dSHong Zhang   }  /* if (is_max) */
103376f244e2SHong Zhang   ierr = ISDestroy(garray_gl);CHKERRQ(ierr);
10345483b11dSHong Zhang 
10355483b11dSHong Zhang   /* evaluate communication - mesg to who, length, and buffer space */
1036e8527bf2SHong Zhang   for (i=0; i<size; i++) len_s[i] = 0;
10375483b11dSHong Zhang 
10385483b11dSHong Zhang   /* header of data1 */
10395483b11dSHong Zhang   for (proc_id=0; proc_id<size; proc_id++){
10405483b11dSHong Zhang     iwork[proc_id] = 0;
10415483b11dSHong Zhang     *data1_start[proc_id] = is_max;
10425483b11dSHong Zhang     data1_start[proc_id]++;
10435483b11dSHong Zhang     for (j=0; j<is_max; j++) {
10445483b11dSHong Zhang       if (proc_id == rank){
10455483b11dSHong Zhang         *data1_start[proc_id] = n[j];
10465483b11dSHong Zhang       } else {
10475483b11dSHong Zhang         *data1_start[proc_id] = 0;
10485483b11dSHong Zhang       }
10495483b11dSHong Zhang       data1_start[proc_id]++;
10505483b11dSHong Zhang     }
10515483b11dSHong Zhang   }
10525483b11dSHong Zhang 
10535483b11dSHong Zhang   for (i=0; i<is_max; i++) {
10545483b11dSHong Zhang     ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr);
10555483b11dSHong Zhang     for (j=0; j<n[i]; j++){
10565483b11dSHong Zhang       idx = idx_i[j];
10575483b11dSHong Zhang       *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */
10585483b11dSHong Zhang       proc_end = ctable[idx];
10595483b11dSHong Zhang       for (proc_id=0;  proc_id<=proc_end; proc_id++){ /* for others to process */
1060e8527bf2SHong Zhang         if (proc_id == rank ) continue; /* done before this loop */
1061430a0127SHong Zhang         if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx))
1062430a0127SHong Zhang           continue;   /* no need for sending idx to [proc_id] */
10635483b11dSHong Zhang         *data1_start[proc_id] = idx; data1_start[proc_id]++;
10645483b11dSHong Zhang         len_s[proc_id]++;
10655483b11dSHong Zhang       }
10665483b11dSHong Zhang     }
10675483b11dSHong Zhang     /* update header data */
10682cfbe0a4SHong Zhang     for (proc_id=0; proc_id<size; proc_id++){
10695483b11dSHong Zhang       if (proc_id== rank) continue;
10705483b11dSHong Zhang       *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id];
10715483b11dSHong Zhang       iwork[proc_id] = len_s[proc_id] ;
10725483b11dSHong Zhang     }
10735483b11dSHong Zhang     ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr);
1074e8527bf2SHong Zhang   }
10755483b11dSHong Zhang 
10765483b11dSHong Zhang   nrqs = 0; nrqr = 0;
10775483b11dSHong Zhang   for (i=0; i<size; i++){
10785483b11dSHong Zhang     data1_start[i] = data1 + i*len;
10795483b11dSHong Zhang     if (len_s[i]){
10805483b11dSHong Zhang       nrqs++;
10815483b11dSHong Zhang       len_s[i] += 1 + is_max; /* add no. of header msg */
10825483b11dSHong Zhang     }
10835483b11dSHong Zhang   }
10845483b11dSHong Zhang 
10855483b11dSHong Zhang   for (i=0; i<is_max; i++) {
1086a2a9f22aSHong Zhang     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
1087632d0f97SHong Zhang   }
1088bfc6803cSHong Zhang   ierr = PetscFree(n);CHKERRQ(ierr);
108905b42c5fSBarry Smith   ierr = PetscFree(ctable);CHKERRQ(ierr);
10905483b11dSHong Zhang 
10915483b11dSHong Zhang   /* Determine the number of messages to expect, their lengths, from from-ids */
10925483b11dSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr);
10935483b11dSHong Zhang   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr);
1094632d0f97SHong Zhang 
1095632d0f97SHong Zhang   /*  Now  post the sends */
109674ed9c26SBarry Smith   ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr);
1097632d0f97SHong Zhang   k = 0;
10985483b11dSHong Zhang   for (proc_id=0; proc_id<size; proc_id++){  /* send data1 to processor [proc_id] */
10995483b11dSHong Zhang     if (len_s[proc_id]){
110013f74950SBarry Smith       ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr);
1101632d0f97SHong Zhang       k++;
1102632d0f97SHong Zhang     }
1103632d0f97SHong Zhang   }
1104632d0f97SHong Zhang 
110545f43ab7SHong Zhang   /* 2. Receive other's is[] and process. Then send back */
1106bfc6803cSHong Zhang   /*-----------------------------------------------------*/
11075483b11dSHong Zhang   len = 0;
11085483b11dSHong Zhang   for (i=0; i<nrqr; i++){
11095483b11dSHong Zhang     if (len_r1[i] > len)len = len_r1[i];
11105483b11dSHong Zhang   }
111145f43ab7SHong Zhang   ierr = PetscFree(len_r1);CHKERRQ(ierr);
111245f43ab7SHong Zhang   ierr = PetscFree(id_r1);CHKERRQ(ierr);
111345f43ab7SHong Zhang 
111445f43ab7SHong Zhang   for (proc_id=0; proc_id<size; proc_id++)
111545f43ab7SHong Zhang     len_s[proc_id] = iwork[proc_id] = 0;
111645f43ab7SHong Zhang 
111713f74950SBarry Smith   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr);
111813f74950SBarry Smith   ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr);
111976f244e2SHong Zhang   ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr);
112010eea884SHong Zhang 
112110eea884SHong Zhang   len_max = ois_max*(Mbs+1);  /* max space storing all is[] for each receive */
1122240e5896SHong Zhang   len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */
112313f74950SBarry Smith   ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
112410eea884SHong Zhang   nodata2 = 0;       /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
1125240e5896SHong Zhang   odata2_ptr[nodata2] = odata2;
112610eea884SHong Zhang   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */
112710eea884SHong Zhang 
1128632d0f97SHong Zhang   k = 0;
11295483b11dSHong Zhang   while (k < nrqr){
1130632d0f97SHong Zhang     /* Receive messages */
1131bfc6803cSHong Zhang     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr);
1132632d0f97SHong Zhang     if (flag){
113313f74950SBarry Smith       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1134632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
113513f74950SBarry Smith       ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
11365483b11dSHong Zhang       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
1137632d0f97SHong Zhang 
1138fc70d14eSHong Zhang       /*  Process messages */
1139240e5896SHong Zhang       /*  make sure there is enough unused space in odata2 array */
114010eea884SHong Zhang       if (len_unused < len_max){ /* allocate more space for odata2 */
114113f74950SBarry Smith         ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1142240e5896SHong Zhang         odata2_ptr[++nodata2] = odata2;
114310eea884SHong Zhang         len_unused = len_est;
114410eea884SHong Zhang       }
114510eea884SHong Zhang 
114610eea884SHong Zhang       ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr);
1147a2a9f22aSHong Zhang       len = 1 + odata2[0];
1148a2a9f22aSHong Zhang       for (i=0; i<odata2[0]; i++){
1149a2a9f22aSHong Zhang         len += odata2[1 + i];
1150fc70d14eSHong Zhang       }
1151632d0f97SHong Zhang 
1152632d0f97SHong Zhang       /* Send messages back */
115313f74950SBarry Smith       ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr);
1154fc70d14eSHong Zhang       k++;
115510eea884SHong Zhang       odata2     += len;
115610eea884SHong Zhang       len_unused -= len;
115745f43ab7SHong Zhang       len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
1158632d0f97SHong Zhang     }
11595483b11dSHong Zhang   }
11605483b11dSHong Zhang   ierr = PetscFree(odata1);CHKERRQ(ierr);
116145f43ab7SHong Zhang   ierr = PetscBTDestroy(otable);CHKERRQ(ierr);
1162632d0f97SHong Zhang 
116345f43ab7SHong Zhang   /* 3. Do local work on this processor's is[] */
116445f43ab7SHong Zhang   /*-------------------------------------------*/
116545f43ab7SHong Zhang   /* make sure there is enough unused space in odata2(=data) array */
116645f43ab7SHong Zhang   len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */
116710eea884SHong Zhang   if (len_unused < len_max){ /* allocate more space for odata2 */
116813f74950SBarry Smith     ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1169240e5896SHong Zhang     odata2_ptr[++nodata2] = odata2;
117010eea884SHong Zhang     len_unused = len_est;
117110eea884SHong Zhang   }
1172bfc6803cSHong Zhang 
1173240e5896SHong Zhang   data = odata2;
117445f43ab7SHong Zhang   ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr);
117545f43ab7SHong Zhang   ierr = PetscFree(data1_start);CHKERRQ(ierr);
117645f43ab7SHong Zhang 
117745f43ab7SHong Zhang   /* 4. Receive work done on other processors, then merge */
117845f43ab7SHong Zhang   /*------------------------------------------------------*/
117945f43ab7SHong Zhang   /* get max number of messages that this processor expects to recv */
118013f74950SBarry Smith   ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
118113f74950SBarry Smith   ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr);
118274ed9c26SBarry Smith   ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr);
118345f43ab7SHong Zhang 
1184632d0f97SHong Zhang   k = 0;
11855483b11dSHong Zhang   while (k < nrqs){
1186632d0f97SHong Zhang     /* Receive messages */
1187c910923dSHong Zhang     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status);
1188632d0f97SHong Zhang     if (flag){
118913f74950SBarry Smith       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1190632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
119113f74950SBarry Smith       ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
11925483b11dSHong Zhang       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
11935483b11dSHong Zhang       if (len > 1+is_max){ /* Add data2 into data */
11940472cc68SHong Zhang         data2_i = data2 + 1 + is_max;
1195fc70d14eSHong Zhang         for (i=0; i<is_max; i++){
1196fc70d14eSHong Zhang           table_i = table[i];
1197bfc6803cSHong Zhang           data_i  = data + 1 + is_max + Mbs*i;
1198bfc6803cSHong Zhang           isz     = data[1+i];
11990472cc68SHong Zhang           for (j=0; j<data2[1+i]; j++){
12000472cc68SHong Zhang             col = data2_i[j];
1201bfc6803cSHong Zhang             if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;}
1202fc70d14eSHong Zhang           }
1203bfc6803cSHong Zhang           data[1+i] = isz;
12040472cc68SHong Zhang           if (i < is_max - 1) data2_i += data2[1+i];
1205fc70d14eSHong Zhang         }
12065483b11dSHong Zhang       }
1207632d0f97SHong Zhang       k++;
1208632d0f97SHong Zhang     }
12095483b11dSHong Zhang   }
121045f43ab7SHong Zhang   ierr = PetscFree(data2);CHKERRQ(ierr);
121174ed9c26SBarry Smith   ierr = PetscFree2(table,t_p);CHKERRQ(ierr);
12125483b11dSHong Zhang 
12135483b11dSHong Zhang   /* phase 1 sends are complete */
12145483b11dSHong Zhang   ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
12150c468ba9SBarry Smith   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
12165483b11dSHong Zhang   ierr = PetscFree(data1);CHKERRQ(ierr);
12175483b11dSHong Zhang 
1218240e5896SHong Zhang   /* phase 2 sends are complete */
12190c468ba9SBarry Smith   if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);}
122074ed9c26SBarry Smith   ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr);
122145f43ab7SHong Zhang   ierr = PetscFree(s_status);CHKERRQ(ierr);
1222632d0f97SHong Zhang 
1223c910923dSHong Zhang   /* 5. Create new is[] */
1224c910923dSHong Zhang   /*--------------------*/
1225c910923dSHong Zhang   for (i=0; i<is_max; i++) {
1226bfc6803cSHong Zhang     data_i = data + 1 + is_max + Mbs*i;
122770b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
1228632d0f97SHong Zhang   }
122945f43ab7SHong Zhang   for (k=0; k<=nodata2; k++){
123045f43ab7SHong Zhang     ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr);
123145f43ab7SHong Zhang   }
123245f43ab7SHong Zhang   ierr = PetscFree(odata2_ptr);CHKERRQ(ierr);
12335483b11dSHong Zhang 
1234632d0f97SHong Zhang   PetscFunctionReturn(0);
1235632d0f97SHong Zhang }
1236632d0f97SHong Zhang 
1237632d0f97SHong Zhang #undef __FUNCT__
1238632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local"
1239632d0f97SHong Zhang /*
1240dc008846SHong Zhang    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
1241632d0f97SHong Zhang        the work on the local processor.
1242632d0f97SHong Zhang 
1243632d0f97SHong Zhang      Inputs:
1244632d0f97SHong Zhang       C      - MAT_MPISBAIJ;
1245bfc6803cSHong Zhang       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
1246bfc6803cSHong Zhang       whose  - whose is[] to be processed,
1247bfc6803cSHong Zhang                MINE:  this processor's is[]
1248bfc6803cSHong Zhang                OTHER: other processor's is[]
1249632d0f97SHong Zhang      Output:
125010eea884SHong Zhang        nidx  - whose = MINE:
12510472cc68SHong Zhang                      holds input and newly found indices in the same format as data
12520472cc68SHong Zhang                whose = OTHER:
12530472cc68SHong Zhang                      only holds the newly found indices
12540472cc68SHong Zhang        table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
1255632d0f97SHong Zhang */
125676f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
125713f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt *data,PetscInt whose,PetscInt *nidx,PetscBT *table)
1258632d0f97SHong Zhang {
1259632d0f97SHong Zhang   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ*)C->data;
1260dc008846SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)(c->A)->data;
1261dc008846SHong Zhang   Mat_SeqBAIJ    *b = (Mat_SeqBAIJ*)(c->B)->data;
1262dfbe8321SBarry Smith   PetscErrorCode ierr;
126313f74950SBarry Smith   PetscInt       row,mbs,Mbs,*nidx_i,col,col_max,isz,isz0,*ai,*aj,*bi,*bj,*garray,rstart,l;
126413f74950SBarry Smith   PetscInt       a_start,a_end,b_start,b_end,i,j,k,is_max,*idx_i,n;
1265bfc6803cSHong Zhang   PetscBT        table0;  /* mark the indices of input is[] for look up */
1266bfc6803cSHong Zhang   PetscBT        table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */
1267632d0f97SHong Zhang 
1268632d0f97SHong Zhang   PetscFunctionBegin;
126931f99336SHong Zhang   Mbs = c->Mbs; mbs = a->mbs;
1270dc008846SHong Zhang   ai = a->i; aj = a->j;
1271dc008846SHong Zhang   bi = b->i; bj = b->j;
1272632d0f97SHong Zhang   garray = c->garray;
1273899cda47SBarry Smith   rstart = c->rstartbs;
1274dc008846SHong Zhang   is_max = data[0];
1275632d0f97SHong Zhang 
1276fc70d14eSHong Zhang   ierr = PetscBTCreate(Mbs,table0);CHKERRQ(ierr);
1277fc70d14eSHong Zhang 
1278fc70d14eSHong Zhang   nidx[0] = is_max;
1279fc70d14eSHong Zhang   idx_i   = data + is_max + 1; /* ptr to input is[0] array */
1280bfc6803cSHong Zhang   nidx_i  = nidx + is_max + 1; /* ptr to output is[0] array */
1281dc008846SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
1282dc008846SHong Zhang     isz  = 0;
1283fc70d14eSHong Zhang     n = data[1+i]; /* size of input is[i] */
1284dc008846SHong Zhang 
128576f244e2SHong Zhang     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
1286bfc6803cSHong Zhang     if (whose == MINE){ /* process this processor's is[] */
1287bfc6803cSHong Zhang       table_i = table[i];
12880472cc68SHong Zhang       nidx_i  = nidx + 1+ is_max + Mbs*i;
1289bfc6803cSHong Zhang     } else {            /* process other processor's is[] - only use one temp table */
1290430a0127SHong Zhang       table_i = table[0];
1291bfc6803cSHong Zhang     }
1292bfc6803cSHong Zhang     ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1293bfc6803cSHong Zhang     ierr = PetscBTMemzero(Mbs,table0);CHKERRQ(ierr);
129476f244e2SHong Zhang     if (n==0) {
129576f244e2SHong Zhang        nidx[1+i] = 0; /* size of new is[i] */
129676f244e2SHong Zhang        continue;
129776f244e2SHong Zhang     }
129876f244e2SHong Zhang 
129976f244e2SHong Zhang     isz0 = 0; col_max = 0;
1300dc008846SHong Zhang     for (j=0; j<n; j++){
1301dc008846SHong Zhang       col = idx_i[j];
1302e32f2f54SBarry Smith       if (col >= Mbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index col %D >= Mbs %D",col,Mbs);
1303bfc6803cSHong Zhang       if(!PetscBTLookupSet(table_i,col)) {
1304bfc6803cSHong Zhang         ierr = PetscBTSet(table0,col);CHKERRQ(ierr);
13050472cc68SHong Zhang         if (whose == MINE) {nidx_i[isz0] = col;}
1306dbe03f88SHong Zhang         if (col_max < col) col_max = col;
1307bfc6803cSHong Zhang         isz0++;
1308bfc6803cSHong Zhang       }
1309632d0f97SHong Zhang     }
1310dc008846SHong Zhang 
13110472cc68SHong Zhang     if (whose == MINE) {isz = isz0;}
1312fc70d14eSHong Zhang     k = 0;  /* no. of indices from input is[i] that have been examined */
1313dc008846SHong Zhang     for (row=0; row<mbs; row++){
1314dc008846SHong Zhang       a_start = ai[row]; a_end = ai[row+1];
1315dc008846SHong Zhang       b_start = bi[row]; b_end = bi[row+1];
13160472cc68SHong Zhang       if (PetscBTLookup(table0,row+rstart)){ /* row is on input is[i]:
13170472cc68SHong Zhang                                                 do row search: collect all col in this row */
1318dc008846SHong Zhang         for (l = a_start; l<a_end ; l++){ /* Amat */
1319dc008846SHong Zhang           col = aj[l] + rstart;
1320fc70d14eSHong Zhang           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1321dc008846SHong Zhang         }
1322dc008846SHong Zhang         for (l = b_start; l<b_end ; l++){ /* Bmat */
1323dc008846SHong Zhang           col = garray[bj[l]];
1324fc70d14eSHong Zhang           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1325dc008846SHong Zhang         }
1326dc008846SHong Zhang         k++;
1327dc008846SHong Zhang         if (k >= isz0) break; /* for (row=0; row<mbs; row++) */
13280472cc68SHong Zhang       } else { /* row is not on input is[i]:
13290472cc68SHong Zhang                   do col serach: add row onto nidx_i if there is a col in nidx_i */
1330dc008846SHong Zhang         for (l = a_start; l<a_end ; l++){ /* Amat */
1331dc008846SHong Zhang           col = aj[l] + rstart;
133276f244e2SHong Zhang           if (col > col_max) break;
1333dc008846SHong Zhang           if (PetscBTLookup(table0,col)){
1334fc70d14eSHong Zhang             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1335dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
1336632d0f97SHong Zhang           }
1337632d0f97SHong Zhang         }
1338dc008846SHong Zhang         for (l = b_start; l<b_end ; l++){ /* Bmat */
1339dc008846SHong Zhang           col = garray[bj[l]];
134076f244e2SHong Zhang           if (col > col_max) break;
1341dc008846SHong Zhang           if (PetscBTLookup(table0,col)){
1342fc70d14eSHong Zhang             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1343dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
1344632d0f97SHong Zhang           }
1345dc008846SHong Zhang         }
1346dc008846SHong Zhang       }
1347bfc6803cSHong Zhang     }
1348dc008846SHong Zhang 
1349dc008846SHong Zhang     if (i < is_max - 1){
1350fc70d14eSHong Zhang       idx_i  += n;   /* ptr to input is[i+1] array */
1351bfc6803cSHong Zhang       nidx_i += isz; /* ptr to output is[i+1] array */
1352dc008846SHong Zhang     }
1353fc70d14eSHong Zhang     nidx[1+i] = isz; /* size of new is[i] */
13541ab97a2aSSatish Balay   } /* for each is */
1355dc008846SHong Zhang   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
1356632d0f97SHong Zhang 
1357632d0f97SHong Zhang   PetscFunctionReturn(0);
1358632d0f97SHong Zhang }
1359632d0f97SHong Zhang 
1360632d0f97SHong Zhang 
1361