xref: /petsc/src/mat/impls/sbaij/mpi/sbaijov.c (revision db41cccff32efbb513e6f5a641add0fbc314552d)
1632d0f97SHong Zhang 
2632d0f97SHong Zhang /*
3632d0f97SHong Zhang    Routines to compute overlapping regions of a parallel MPI matrix.
4632d0f97SHong Zhang    Used for finding submatrices that were shared across processors.
5632d0f97SHong Zhang */
6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7c6db04a5SJed Brown #include <petscbt.h>
8632d0f97SHong Zhang 
913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat,PetscInt,IS*);
1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat,PetscInt*,PetscInt,PetscInt*,PetscBT*);
11632d0f97SHong Zhang 
12*db41cccfSHong Zhang /* -------------------------------------------------------------------------*/
13*db41cccfSHong Zhang /* This code is modified from MatGetSubMatrices_MPIBAIJ_local() */
14*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
15*db41cccfSHong Zhang #undef __FUNCT__
16*db41cccfSHong Zhang #define __FUNCT__ "PetscGetProc_ijonly"
17*db41cccfSHong Zhang PetscErrorCode PetscGetProc_ijonly(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
18*db41cccfSHong Zhang {
19*db41cccfSHong Zhang   PetscInt    nGlobalNd = proc_gnode[size];
20*db41cccfSHong Zhang   PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
21*db41cccfSHong Zhang 
22*db41cccfSHong Zhang   PetscFunctionBegin;
23*db41cccfSHong Zhang   if (fproc > size) fproc = size;
24*db41cccfSHong Zhang   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
25*db41cccfSHong Zhang     if (row < proc_gnode[fproc]) fproc--;
26*db41cccfSHong Zhang     else                         fproc++;
27*db41cccfSHong Zhang   }
28*db41cccfSHong Zhang   *rank = fproc;
29*db41cccfSHong Zhang   PetscFunctionReturn(0);
30*db41cccfSHong Zhang }
31*db41cccfSHong Zhang #endif
32*db41cccfSHong Zhang 
33*db41cccfSHong Zhang #undef __FUNCT__
34*db41cccfSHong Zhang #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local_ijonly"
35*db41cccfSHong Zhang PetscErrorCode MatGetSubMatrices_MPIBAIJ_local_ijonly(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
36*db41cccfSHong Zhang {
37*db41cccfSHong Zhang   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
38*db41cccfSHong Zhang   Mat            A = c->A;
39*db41cccfSHong Zhang   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
40*db41cccfSHong Zhang   const PetscInt **irow,**icol,*irow_i;
41*db41cccfSHong Zhang   PetscInt       *nrow,*ncol,*w3,*w4,start;
42*db41cccfSHong Zhang   PetscErrorCode ierr;
43*db41cccfSHong Zhang   PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
44*db41cccfSHong Zhang   PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
45*db41cccfSHong Zhang   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
46*db41cccfSHong Zhang   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
47*db41cccfSHong Zhang   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
48*db41cccfSHong Zhang   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
49*db41cccfSHong Zhang   PetscInt       *a_j=a->j,*b_j=b->j,*cworkA,*cworkB; //bs=C->rmap->bs,bs2=c->bs2
50*db41cccfSHong Zhang   PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
51*db41cccfSHong Zhang   PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
52*db41cccfSHong Zhang   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
53*db41cccfSHong Zhang   //MPI_Request    *r_waits4,*s_waits4;
54*db41cccfSHong Zhang   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
55*db41cccfSHong Zhang   //MPI_Status     *r_status4,*s_status4;
56*db41cccfSHong Zhang   MPI_Comm       comm;
57*db41cccfSHong Zhang   //MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
58*db41cccfSHong Zhang   //MatScalar      *a_a=a->a,*b_a=b->a;
59*db41cccfSHong Zhang   PetscBool      flag;
60*db41cccfSHong Zhang   PetscMPIInt    *onodes1,*olengths1;
61*db41cccfSHong Zhang 
62*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
63*db41cccfSHong Zhang   PetscInt       tt;
64*db41cccfSHong Zhang   PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
65*db41cccfSHong Zhang #else
66*db41cccfSHong Zhang   PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
67*db41cccfSHong Zhang #endif
68*db41cccfSHong Zhang 
69*db41cccfSHong Zhang   PetscFunctionBegin;
70*db41cccfSHong Zhang   comm   = ((PetscObject)C)->comm;
71*db41cccfSHong Zhang   tag0   = ((PetscObject)C)->tag;
72*db41cccfSHong Zhang   size   = c->size;
73*db41cccfSHong Zhang   rank   = c->rank;
74*db41cccfSHong Zhang 
75*db41cccfSHong Zhang   /* Get some new tags to keep the communication clean */
76*db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
77*db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
78*db41cccfSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
79*db41cccfSHong Zhang 
80*db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE)
81*db41cccfSHong Zhang   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
82*db41cccfSHong Zhang #else
83*db41cccfSHong Zhang   ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
84*db41cccfSHong Zhang   /* Create hash table for the mapping :row -> proc*/
85*db41cccfSHong Zhang   for (i=0,j=0; i<size; i++) {
86*db41cccfSHong Zhang     jmax = c->rowners[i+1];
87*db41cccfSHong Zhang     for (; j<jmax; j++) {
88*db41cccfSHong Zhang       rtable[j] = i;
89*db41cccfSHong Zhang     }
90*db41cccfSHong Zhang   }
91*db41cccfSHong Zhang #endif
92*db41cccfSHong Zhang 
93*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
94*db41cccfSHong Zhang     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
95*db41cccfSHong Zhang     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
96*db41cccfSHong Zhang     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
97*db41cccfSHong Zhang     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
98*db41cccfSHong Zhang   }
99*db41cccfSHong Zhang 
100*db41cccfSHong Zhang   /* evaluate communication - mesg to who,length of mesg,and buffer space
101*db41cccfSHong Zhang      required. Based on this, buffers are allocated, and data copied into them*/
102*db41cccfSHong Zhang   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
103*db41cccfSHong Zhang   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
104*db41cccfSHong Zhang   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
105*db41cccfSHong Zhang   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
106*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
107*db41cccfSHong Zhang     ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
108*db41cccfSHong Zhang     jmax   = nrow[i];
109*db41cccfSHong Zhang     irow_i = irow[i];
110*db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
111*db41cccfSHong Zhang       row  = irow_i[j];
112*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
113*db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
114*db41cccfSHong Zhang #else
115*db41cccfSHong Zhang       proc = rtable[row];
116*db41cccfSHong Zhang #endif
117*db41cccfSHong Zhang       w4[proc]++;
118*db41cccfSHong Zhang     }
119*db41cccfSHong Zhang     for (j=0; j<size; j++) {
120*db41cccfSHong Zhang       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
121*db41cccfSHong Zhang     }
122*db41cccfSHong Zhang   }
123*db41cccfSHong Zhang 
124*db41cccfSHong Zhang   nrqs     = 0;              /* no of outgoing messages */
125*db41cccfSHong Zhang   msz      = 0;              /* total mesg length for all proc */
126*db41cccfSHong Zhang   w1[rank] = 0;              /* no mesg sent to intself */
127*db41cccfSHong Zhang   w3[rank] = 0;
128*db41cccfSHong Zhang   for (i=0; i<size; i++) {
129*db41cccfSHong Zhang     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
130*db41cccfSHong Zhang   }
131*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
132*db41cccfSHong Zhang   for (i=0,j=0; i<size; i++) {
133*db41cccfSHong Zhang     if (w1[i]) { pa[j] = i; j++; }
134*db41cccfSHong Zhang   }
135*db41cccfSHong Zhang 
136*db41cccfSHong Zhang   /* Each message would have a header = 1 + 2*(no of IS) + data */
137*db41cccfSHong Zhang   for (i=0; i<nrqs; i++) {
138*db41cccfSHong Zhang     j     = pa[i];
139*db41cccfSHong Zhang     w1[j] += w2[j] + 2* w3[j];
140*db41cccfSHong Zhang     msz   += w1[j];
141*db41cccfSHong Zhang   }
142*db41cccfSHong Zhang 
143*db41cccfSHong Zhang   /* Determine the number of messages to expect, their lengths, from from-ids */
144*db41cccfSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
145*db41cccfSHong Zhang   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
146*db41cccfSHong Zhang 
147*db41cccfSHong Zhang   /* Now post the Irecvs corresponding to these messages */
148*db41cccfSHong Zhang   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
149*db41cccfSHong Zhang 
150*db41cccfSHong Zhang   ierr = PetscFree(onodes1);CHKERRQ(ierr);
151*db41cccfSHong Zhang   ierr = PetscFree(olengths1);CHKERRQ(ierr);
152*db41cccfSHong Zhang 
153*db41cccfSHong Zhang   /* Allocate Memory for outgoing messages */
154*db41cccfSHong Zhang   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
155*db41cccfSHong Zhang   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
156*db41cccfSHong Zhang   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
157*db41cccfSHong Zhang   {
158*db41cccfSHong Zhang     PetscInt *iptr = tmp,ict = 0;
159*db41cccfSHong Zhang     for (i=0; i<nrqs; i++) {
160*db41cccfSHong Zhang       j         = pa[i];
161*db41cccfSHong Zhang       iptr     += ict;
162*db41cccfSHong Zhang       sbuf1[j]  = iptr;
163*db41cccfSHong Zhang       ict       = w1[j];
164*db41cccfSHong Zhang     }
165*db41cccfSHong Zhang   }
166*db41cccfSHong Zhang 
167*db41cccfSHong Zhang   /* Form the outgoing messages */
168*db41cccfSHong Zhang   /* Initialise the header space */
169*db41cccfSHong Zhang   for (i=0; i<nrqs; i++) {
170*db41cccfSHong Zhang     j           = pa[i];
171*db41cccfSHong Zhang     sbuf1[j][0] = 0;
172*db41cccfSHong Zhang     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
173*db41cccfSHong Zhang     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
174*db41cccfSHong Zhang   }
175*db41cccfSHong Zhang 
176*db41cccfSHong Zhang   /* Parse the isrow and copy data into outbuf */
177*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
178*db41cccfSHong Zhang     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
179*db41cccfSHong Zhang     irow_i = irow[i];
180*db41cccfSHong Zhang     jmax   = nrow[i];
181*db41cccfSHong Zhang     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
182*db41cccfSHong Zhang       row  = irow_i[j];
183*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
184*db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
185*db41cccfSHong Zhang #else
186*db41cccfSHong Zhang       proc = rtable[row];
187*db41cccfSHong Zhang #endif
188*db41cccfSHong Zhang       if (proc != rank) { /* copy to the outgoing buf*/
189*db41cccfSHong Zhang         ctr[proc]++;
190*db41cccfSHong Zhang         *ptr[proc] = row;
191*db41cccfSHong Zhang         ptr[proc]++;
192*db41cccfSHong Zhang       }
193*db41cccfSHong Zhang     }
194*db41cccfSHong Zhang     /* Update the headers for the current IS */
195*db41cccfSHong Zhang     for (j=0; j<size; j++) { /* Can Optimise this loop too */
196*db41cccfSHong Zhang       if ((ctr_j = ctr[j])) {
197*db41cccfSHong Zhang         sbuf1_j        = sbuf1[j];
198*db41cccfSHong Zhang         k              = ++sbuf1_j[0];
199*db41cccfSHong Zhang         sbuf1_j[2*k]   = ctr_j;
200*db41cccfSHong Zhang         sbuf1_j[2*k-1] = i;
201*db41cccfSHong Zhang       }
202*db41cccfSHong Zhang     }
203*db41cccfSHong Zhang   }
204*db41cccfSHong Zhang 
205*db41cccfSHong Zhang   /*  Now  post the sends */
206*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
207*db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
208*db41cccfSHong Zhang     j = pa[i];
209*db41cccfSHong Zhang     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
210*db41cccfSHong Zhang   }
211*db41cccfSHong Zhang 
212*db41cccfSHong Zhang   /* Post Recieves to capture the buffer size */
213*db41cccfSHong Zhang   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
214*db41cccfSHong Zhang   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
215*db41cccfSHong Zhang   rbuf2[0] = tmp + msz;
216*db41cccfSHong Zhang   for (i=1; i<nrqs; ++i) {
217*db41cccfSHong Zhang     j        = pa[i];
218*db41cccfSHong Zhang     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
219*db41cccfSHong Zhang   }
220*db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
221*db41cccfSHong Zhang     j    = pa[i];
222*db41cccfSHong Zhang     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
223*db41cccfSHong Zhang   }
224*db41cccfSHong Zhang 
225*db41cccfSHong Zhang   /* Send to other procs the buf size they should allocate */
226*db41cccfSHong Zhang 
227*db41cccfSHong Zhang   /* Receive messages*/
228*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
229*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
230*db41cccfSHong Zhang   ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
231*db41cccfSHong Zhang   {
232*db41cccfSHong Zhang     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
233*db41cccfSHong Zhang     PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
234*db41cccfSHong Zhang 
235*db41cccfSHong Zhang     for (i=0; i<nrqr; ++i) {
236*db41cccfSHong Zhang       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
237*db41cccfSHong Zhang       req_size[idex] = 0;
238*db41cccfSHong Zhang       rbuf1_i         = rbuf1[idex];
239*db41cccfSHong Zhang       start           = 2*rbuf1_i[0] + 1;
240*db41cccfSHong Zhang       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
241*db41cccfSHong Zhang       ierr            = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
242*db41cccfSHong Zhang       sbuf2_i         = sbuf2[idex];
243*db41cccfSHong Zhang       for (j=start; j<end; j++) {
244*db41cccfSHong Zhang         id               = rbuf1_i[j] - rstart;
245*db41cccfSHong Zhang         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
246*db41cccfSHong Zhang         sbuf2_i[j]       = ncols;
247*db41cccfSHong Zhang         req_size[idex] += ncols;
248*db41cccfSHong Zhang       }
249*db41cccfSHong Zhang       req_source[idex] = r_status1[i].MPI_SOURCE;
250*db41cccfSHong Zhang       /* form the header */
251*db41cccfSHong Zhang       sbuf2_i[0]   = req_size[idex];
252*db41cccfSHong Zhang       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
253*db41cccfSHong Zhang       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
254*db41cccfSHong Zhang     }
255*db41cccfSHong Zhang   }
256*db41cccfSHong Zhang   ierr = PetscFree(r_status1);CHKERRQ(ierr);
257*db41cccfSHong Zhang   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
258*db41cccfSHong Zhang 
259*db41cccfSHong Zhang   /*  recv buffer sizes */
260*db41cccfSHong Zhang   /* Receive messages*/
261*db41cccfSHong Zhang 
262*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
263*db41cccfSHong Zhang   //ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
264*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
265*db41cccfSHong Zhang   //ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
266*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
267*db41cccfSHong Zhang 
268*db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
269*db41cccfSHong Zhang     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
270*db41cccfSHong Zhang     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
271*db41cccfSHong Zhang     //ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
272*db41cccfSHong Zhang     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
273*db41cccfSHong Zhang     //ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
274*db41cccfSHong Zhang   }
275*db41cccfSHong Zhang   ierr = PetscFree(r_status2);CHKERRQ(ierr);
276*db41cccfSHong Zhang   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
277*db41cccfSHong Zhang 
278*db41cccfSHong Zhang   /* Wait on sends1 and sends2 */
279*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
280*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
281*db41cccfSHong Zhang 
282*db41cccfSHong Zhang   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
283*db41cccfSHong Zhang   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
284*db41cccfSHong Zhang   ierr = PetscFree(s_status1);CHKERRQ(ierr);
285*db41cccfSHong Zhang   ierr = PetscFree(s_status2);CHKERRQ(ierr);
286*db41cccfSHong Zhang   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
287*db41cccfSHong Zhang   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
288*db41cccfSHong Zhang 
289*db41cccfSHong Zhang   /* Now allocate buffers for a->j, and send them off */
290*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
291*db41cccfSHong Zhang   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
292*db41cccfSHong Zhang   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
293*db41cccfSHong Zhang   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
294*db41cccfSHong Zhang 
295*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
296*db41cccfSHong Zhang   {
297*db41cccfSHong Zhang      for (i=0; i<nrqr; i++) {
298*db41cccfSHong Zhang       rbuf1_i   = rbuf1[i];
299*db41cccfSHong Zhang       sbuf_aj_i = sbuf_aj[i];
300*db41cccfSHong Zhang       ct1       = 2*rbuf1_i[0] + 1;
301*db41cccfSHong Zhang       ct2       = 0;
302*db41cccfSHong Zhang       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
303*db41cccfSHong Zhang         kmax = rbuf1[i][2*j];
304*db41cccfSHong Zhang         for (k=0; k<kmax; k++,ct1++) {
305*db41cccfSHong Zhang           row    = rbuf1_i[ct1] - rstart;
306*db41cccfSHong Zhang           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
307*db41cccfSHong Zhang           ncols  = nzA + nzB;
308*db41cccfSHong Zhang           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
309*db41cccfSHong Zhang 
310*db41cccfSHong Zhang           /* load the column indices for this row into cols*/
311*db41cccfSHong Zhang           cols  = sbuf_aj_i + ct2;
312*db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
313*db41cccfSHong Zhang             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
314*db41cccfSHong Zhang             else break;
315*db41cccfSHong Zhang           }
316*db41cccfSHong Zhang           imark = l;
317*db41cccfSHong Zhang           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
318*db41cccfSHong Zhang           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
319*db41cccfSHong Zhang           ct2 += ncols;
320*db41cccfSHong Zhang         }
321*db41cccfSHong Zhang       }
322*db41cccfSHong Zhang       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
323*db41cccfSHong Zhang     }
324*db41cccfSHong Zhang   }
325*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
326*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
327*db41cccfSHong Zhang 
328*db41cccfSHong Zhang #if defined(MV)
329*db41cccfSHong Zhang   /* Allocate buffers for a->a, and send them off */
330*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
331*db41cccfSHong Zhang   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
332*db41cccfSHong Zhang   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
333*db41cccfSHong Zhang   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
334*db41cccfSHong Zhang 
335*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
336*db41cccfSHong Zhang   {
337*db41cccfSHong Zhang     for (i=0; i<nrqr; i++) {
338*db41cccfSHong Zhang       rbuf1_i   = rbuf1[i];
339*db41cccfSHong Zhang       sbuf_aa_i = sbuf_aa[i];
340*db41cccfSHong Zhang       ct1       = 2*rbuf1_i[0]+1;
341*db41cccfSHong Zhang       ct2       = 0;
342*db41cccfSHong Zhang       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
343*db41cccfSHong Zhang         kmax = rbuf1_i[2*j];
344*db41cccfSHong Zhang         for (k=0; k<kmax; k++,ct1++) {
345*db41cccfSHong Zhang           row    = rbuf1_i[ct1] - rstart;
346*db41cccfSHong Zhang           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
347*db41cccfSHong Zhang           ncols  = nzA + nzB;
348*db41cccfSHong Zhang           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
349*db41cccfSHong Zhang           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
350*db41cccfSHong Zhang 
351*db41cccfSHong Zhang           /* load the column values for this row into vals*/
352*db41cccfSHong Zhang           vals  = sbuf_aa_i+ct2*bs2;
353*db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
354*db41cccfSHong Zhang             if ((bmap[cworkB[l]]) < cstart) {
355*db41cccfSHong Zhang               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
356*db41cccfSHong Zhang             }
357*db41cccfSHong Zhang             else break;
358*db41cccfSHong Zhang           }
359*db41cccfSHong Zhang           imark = l;
360*db41cccfSHong Zhang           for (l=0; l<nzA; l++) {
361*db41cccfSHong Zhang             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
362*db41cccfSHong Zhang           }
363*db41cccfSHong Zhang           for (l=imark; l<nzB; l++) {
364*db41cccfSHong Zhang             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
365*db41cccfSHong Zhang           }
366*db41cccfSHong Zhang           ct2 += ncols;
367*db41cccfSHong Zhang         }
368*db41cccfSHong Zhang       }
369*db41cccfSHong Zhang       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
370*db41cccfSHong Zhang     }
371*db41cccfSHong Zhang   }
372*db41cccfSHong Zhang   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
373*db41cccfSHong Zhang   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
374*db41cccfSHong Zhang #endif
375*db41cccfSHong Zhang   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
376*db41cccfSHong Zhang   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
377*db41cccfSHong Zhang 
378*db41cccfSHong Zhang   /* Form the matrix */
379*db41cccfSHong Zhang   /* create col map */
380*db41cccfSHong Zhang   {
381*db41cccfSHong Zhang     const PetscInt *icol_i;
382*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
383*db41cccfSHong Zhang     /* Create row map*/
384*db41cccfSHong Zhang     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
385*db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
386*db41cccfSHong Zhang       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
387*db41cccfSHong Zhang     }
388*db41cccfSHong Zhang #else
389*db41cccfSHong Zhang     ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
390*db41cccfSHong Zhang     ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
391*db41cccfSHong Zhang     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
392*db41cccfSHong Zhang #endif
393*db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
394*db41cccfSHong Zhang       jmax   = ncol[i];
395*db41cccfSHong Zhang       icol_i = icol[i];
396*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
397*db41cccfSHong Zhang       lcol1_gcol1 = colmaps[i];
398*db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
399*db41cccfSHong Zhang 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
400*db41cccfSHong Zhang       }
401*db41cccfSHong Zhang #else
402*db41cccfSHong Zhang       cmap_i = cmap[i];
403*db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
404*db41cccfSHong Zhang         cmap_i[icol_i[j]] = j+1;
405*db41cccfSHong Zhang       }
406*db41cccfSHong Zhang #endif
407*db41cccfSHong Zhang     }
408*db41cccfSHong Zhang   }
409*db41cccfSHong Zhang 
410*db41cccfSHong Zhang   /* Create lens which is required for MatCreate... */
411*db41cccfSHong Zhang   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
412*db41cccfSHong Zhang   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
413*db41cccfSHong Zhang   lens[0] = (PetscInt*)(lens + ismax);
414*db41cccfSHong Zhang   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
415*db41cccfSHong Zhang   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
416*db41cccfSHong Zhang 
417*db41cccfSHong Zhang   /* Update lens from local data */
418*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
419*db41cccfSHong Zhang     jmax   = nrow[i];
420*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
421*db41cccfSHong Zhang     lcol1_gcol1 = colmaps[i];
422*db41cccfSHong Zhang #else
423*db41cccfSHong Zhang     cmap_i = cmap[i];
424*db41cccfSHong Zhang #endif
425*db41cccfSHong Zhang     irow_i = irow[i];
426*db41cccfSHong Zhang     lens_i = lens[i];
427*db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
428*db41cccfSHong Zhang       row  = irow_i[j];
429*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
430*db41cccfSHong Zhang       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
431*db41cccfSHong Zhang #else
432*db41cccfSHong Zhang       proc = rtable[row];
433*db41cccfSHong Zhang #endif
434*db41cccfSHong Zhang       if (proc == rank) {
435*db41cccfSHong Zhang         /* Get indices from matA and then from matB */
436*db41cccfSHong Zhang         row    = row - rstart;
437*db41cccfSHong Zhang         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
438*db41cccfSHong Zhang         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
439*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
440*db41cccfSHong Zhang        for (k=0; k<nzA; k++) {
441*db41cccfSHong Zhang 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
442*db41cccfSHong Zhang           if (tt) { lens_i[j]++; }
443*db41cccfSHong Zhang         }
444*db41cccfSHong Zhang         for (k=0; k<nzB; k++) {
445*db41cccfSHong Zhang 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
446*db41cccfSHong Zhang           if (tt) { lens_i[j]++; }
447*db41cccfSHong Zhang         }
448*db41cccfSHong Zhang #else
449*db41cccfSHong Zhang         for (k=0; k<nzA; k++) {
450*db41cccfSHong Zhang           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
451*db41cccfSHong Zhang         }
452*db41cccfSHong Zhang         for (k=0; k<nzB; k++) {
453*db41cccfSHong Zhang           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
454*db41cccfSHong Zhang         }
455*db41cccfSHong Zhang #endif
456*db41cccfSHong Zhang       }
457*db41cccfSHong Zhang     }
458*db41cccfSHong Zhang   }
459*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
460*db41cccfSHong Zhang   /* Create row map*/
461*db41cccfSHong Zhang   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
462*db41cccfSHong Zhang   for (i=0; i<ismax; i++){
463*db41cccfSHong Zhang     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
464*db41cccfSHong Zhang   }
465*db41cccfSHong Zhang #else
466*db41cccfSHong Zhang   /* Create row map*/
467*db41cccfSHong Zhang   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
468*db41cccfSHong Zhang   rmap[0] = (PetscInt*)(rmap + ismax);
469*db41cccfSHong Zhang   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
470*db41cccfSHong Zhang   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
471*db41cccfSHong Zhang #endif
472*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
473*db41cccfSHong Zhang     irow_i = irow[i];
474*db41cccfSHong Zhang     jmax   = nrow[i];
475*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
476*db41cccfSHong Zhang     lrow1_grow1 = rowmaps[i];
477*db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
478*db41cccfSHong Zhang       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
479*db41cccfSHong Zhang     }
480*db41cccfSHong Zhang #else
481*db41cccfSHong Zhang     rmap_i = rmap[i];
482*db41cccfSHong Zhang     for (j=0; j<jmax; j++) {
483*db41cccfSHong Zhang       rmap_i[irow_i[j]] = j;
484*db41cccfSHong Zhang     }
485*db41cccfSHong Zhang #endif
486*db41cccfSHong Zhang   }
487*db41cccfSHong Zhang 
488*db41cccfSHong Zhang   /* Update lens from offproc data */
489*db41cccfSHong Zhang   {
490*db41cccfSHong Zhang     PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
491*db41cccfSHong Zhang     PetscMPIInt ii;
492*db41cccfSHong Zhang 
493*db41cccfSHong Zhang     for (tmp2=0; tmp2<nrqs; tmp2++) {
494*db41cccfSHong Zhang       ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
495*db41cccfSHong Zhang       idex   = pa[ii];
496*db41cccfSHong Zhang       sbuf1_i = sbuf1[idex];
497*db41cccfSHong Zhang       jmax    = sbuf1_i[0];
498*db41cccfSHong Zhang       ct1     = 2*jmax+1;
499*db41cccfSHong Zhang       ct2     = 0;
500*db41cccfSHong Zhang       rbuf2_i = rbuf2[ii];
501*db41cccfSHong Zhang       rbuf3_i = rbuf3[ii];
502*db41cccfSHong Zhang       for (j=1; j<=jmax; j++) {
503*db41cccfSHong Zhang         is_no   = sbuf1_i[2*j-1];
504*db41cccfSHong Zhang         max1    = sbuf1_i[2*j];
505*db41cccfSHong Zhang         lens_i  = lens[is_no];
506*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
507*db41cccfSHong Zhang 	lcol1_gcol1 = colmaps[is_no];
508*db41cccfSHong Zhang 	lrow1_grow1 = rowmaps[is_no];
509*db41cccfSHong Zhang #else
510*db41cccfSHong Zhang         cmap_i  = cmap[is_no];
511*db41cccfSHong Zhang 	rmap_i  = rmap[is_no];
512*db41cccfSHong Zhang #endif
513*db41cccfSHong Zhang         for (k=0; k<max1; k++,ct1++) {
514*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
515*db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
516*db41cccfSHong Zhang           row--;
517*db41cccfSHong Zhang           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
518*db41cccfSHong Zhang #else
519*db41cccfSHong Zhang           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
520*db41cccfSHong Zhang #endif
521*db41cccfSHong Zhang           max2 = rbuf2_i[ct1];
522*db41cccfSHong Zhang           for (l=0; l<max2; l++,ct2++) {
523*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
524*db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
525*db41cccfSHong Zhang 	    if (tt) {
526*db41cccfSHong Zhang               lens_i[row]++;
527*db41cccfSHong Zhang             }
528*db41cccfSHong Zhang #else
529*db41cccfSHong Zhang             if (cmap_i[rbuf3_i[ct2]]) {
530*db41cccfSHong Zhang               lens_i[row]++;
531*db41cccfSHong Zhang             }
532*db41cccfSHong Zhang #endif
533*db41cccfSHong Zhang           }
534*db41cccfSHong Zhang         }
535*db41cccfSHong Zhang       }
536*db41cccfSHong Zhang     }
537*db41cccfSHong Zhang   }
538*db41cccfSHong Zhang   ierr = PetscFree(r_status3);CHKERRQ(ierr);
539*db41cccfSHong Zhang   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
540*db41cccfSHong Zhang   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
541*db41cccfSHong Zhang   ierr = PetscFree(s_status3);CHKERRQ(ierr);
542*db41cccfSHong Zhang   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
543*db41cccfSHong Zhang 
544*db41cccfSHong Zhang   /* Create the submatrices */
545*db41cccfSHong Zhang   if (scall == MAT_REUSE_MATRIX) {
546*db41cccfSHong Zhang     /*
547*db41cccfSHong Zhang         Assumes new rows are same length as the old rows, hence bug!
548*db41cccfSHong Zhang     */
549*db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
550*db41cccfSHong Zhang       mat = (Mat_SeqBAIJ *)(submats[i]->data);
551*db41cccfSHong Zhang       //if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
552*db41cccfSHong Zhang       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
553*db41cccfSHong Zhang       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
554*db41cccfSHong Zhang       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
555*db41cccfSHong Zhang       /* Initial matrix as if empty */
556*db41cccfSHong Zhang       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
557*db41cccfSHong Zhang       submats[i]->factortype = C->factortype;
558*db41cccfSHong Zhang     }
559*db41cccfSHong Zhang   } else {
560*db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
561*db41cccfSHong Zhang       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
562*db41cccfSHong Zhang       //ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
563*db41cccfSHong Zhang       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr);
564*db41cccfSHong Zhang       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
565*db41cccfSHong Zhang       ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
566*db41cccfSHong Zhang       //ierr = MatSeqBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
567*db41cccfSHong Zhang       ierr = MatSeqSBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
568*db41cccfSHong Zhang       //ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by MatGetSubMatrices_MPISBAIJ()*/
569*db41cccfSHong Zhang     }
570*db41cccfSHong Zhang   }
571*db41cccfSHong Zhang 
572*db41cccfSHong Zhang   /* Assemble the matrices */
573*db41cccfSHong Zhang   /* First assemble the local rows */
574*db41cccfSHong Zhang   {
575*db41cccfSHong Zhang     PetscInt       ilen_row,*imat_ilen,*imat_j,*imat_i;
576*db41cccfSHong Zhang     //MatScalar *imat_a;
577*db41cccfSHong Zhang 
578*db41cccfSHong Zhang     for (i=0; i<ismax; i++) {
579*db41cccfSHong Zhang       mat       = (Mat_SeqBAIJ*)submats[i]->data;
580*db41cccfSHong Zhang       imat_ilen = mat->ilen;
581*db41cccfSHong Zhang       imat_j    = mat->j;
582*db41cccfSHong Zhang       imat_i    = mat->i;
583*db41cccfSHong Zhang       //imat_a    = mat->a;
584*db41cccfSHong Zhang 
585*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
586*db41cccfSHong Zhang       lcol1_gcol1 = colmaps[i];
587*db41cccfSHong Zhang       lrow1_grow1 = rowmaps[i];
588*db41cccfSHong Zhang #else
589*db41cccfSHong Zhang       cmap_i    = cmap[i];
590*db41cccfSHong Zhang       rmap_i    = rmap[i];
591*db41cccfSHong Zhang #endif
592*db41cccfSHong Zhang       irow_i    = irow[i];
593*db41cccfSHong Zhang       jmax      = nrow[i];
594*db41cccfSHong Zhang       for (j=0; j<jmax; j++) {
595*db41cccfSHong Zhang         row      = irow_i[j];
596*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
597*db41cccfSHong Zhang 	ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
598*db41cccfSHong Zhang #else
599*db41cccfSHong Zhang 	proc = rtable[row];
600*db41cccfSHong Zhang #endif
601*db41cccfSHong Zhang         if (proc == rank) {
602*db41cccfSHong Zhang           row      = row - rstart;
603*db41cccfSHong Zhang           nzA      = a_i[row+1] - a_i[row];
604*db41cccfSHong Zhang           nzB      = b_i[row+1] - b_i[row];
605*db41cccfSHong Zhang           cworkA   = a_j + a_i[row];
606*db41cccfSHong Zhang           cworkB   = b_j + b_i[row];
607*db41cccfSHong Zhang           //vworkA   = a_a + a_i[row]*bs2;
608*db41cccfSHong Zhang           //vworkB   = b_a + b_i[row]*bs2;
609*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
610*db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
611*db41cccfSHong Zhang           row--;
612*db41cccfSHong Zhang           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
613*db41cccfSHong Zhang #else
614*db41cccfSHong Zhang           row      = rmap_i[row + rstart];
615*db41cccfSHong Zhang #endif
616*db41cccfSHong Zhang           mat_i    = imat_i[row];
617*db41cccfSHong Zhang           //mat_a    = imat_a + mat_i*bs2;
618*db41cccfSHong Zhang           mat_j    = imat_j + mat_i;
619*db41cccfSHong Zhang           ilen_row = imat_ilen[row];
620*db41cccfSHong Zhang 
621*db41cccfSHong Zhang           /* load the column indices for this row into cols*/
622*db41cccfSHong Zhang           for (l=0; l<nzB; l++) {
623*db41cccfSHong Zhang 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
624*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
625*db41cccfSHong Zhang 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
626*db41cccfSHong Zhang 	      if (tcol) {
627*db41cccfSHong Zhang #else
628*db41cccfSHong Zhang               if ((tcol = cmap_i[ctmp])) {
629*db41cccfSHong Zhang #endif
630*db41cccfSHong Zhang                 *mat_j++ = tcol - 1;
631*db41cccfSHong Zhang                 //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
632*db41cccfSHong Zhang                 //mat_a   += bs2;
633*db41cccfSHong Zhang                 ilen_row++;
634*db41cccfSHong Zhang               }
635*db41cccfSHong Zhang             } else break;
636*db41cccfSHong Zhang           }
637*db41cccfSHong Zhang           imark = l;
638*db41cccfSHong Zhang           for (l=0; l<nzA; l++) {
639*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
640*db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
641*db41cccfSHong Zhang 	    if (tcol) {
642*db41cccfSHong Zhang #else
643*db41cccfSHong Zhang 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
644*db41cccfSHong Zhang #endif
645*db41cccfSHong Zhang               *mat_j++ = tcol - 1;
646*db41cccfSHong Zhang               //ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
647*db41cccfSHong Zhang               //mat_a   += bs2;
648*db41cccfSHong Zhang               ilen_row++;
649*db41cccfSHong Zhang             }
650*db41cccfSHong Zhang           }
651*db41cccfSHong Zhang           for (l=imark; l<nzB; l++) {
652*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
653*db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
654*db41cccfSHong Zhang 	    if (tcol) {
655*db41cccfSHong Zhang #else
656*db41cccfSHong Zhang             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
657*db41cccfSHong Zhang #endif
658*db41cccfSHong Zhang               *mat_j++ = tcol - 1;
659*db41cccfSHong Zhang               //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
660*db41cccfSHong Zhang               //mat_a   += bs2;
661*db41cccfSHong Zhang               ilen_row++;
662*db41cccfSHong Zhang             }
663*db41cccfSHong Zhang           }
664*db41cccfSHong Zhang           imat_ilen[row] = ilen_row;
665*db41cccfSHong Zhang         }
666*db41cccfSHong Zhang       }
667*db41cccfSHong Zhang     }
668*db41cccfSHong Zhang   }
669*db41cccfSHong Zhang 
670*db41cccfSHong Zhang   /*   Now assemble the off proc rows*/
671*db41cccfSHong Zhang   {
672*db41cccfSHong Zhang     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
673*db41cccfSHong Zhang     PetscInt    *imat_j,*imat_i;
674*db41cccfSHong Zhang     //MatScalar   *imat_a,*rbuf4_i;
675*db41cccfSHong Zhang     PetscMPIInt ii;
676*db41cccfSHong Zhang 
677*db41cccfSHong Zhang     for (tmp2=0; tmp2<nrqs; tmp2++) {
678*db41cccfSHong Zhang       //ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
679*db41cccfSHong Zhang       ii = tmp2;  //new
680*db41cccfSHong Zhang       idex   = pa[ii];
681*db41cccfSHong Zhang       sbuf1_i = sbuf1[idex];
682*db41cccfSHong Zhang       jmax    = sbuf1_i[0];
683*db41cccfSHong Zhang       ct1     = 2*jmax + 1;
684*db41cccfSHong Zhang       ct2     = 0;
685*db41cccfSHong Zhang       rbuf2_i = rbuf2[ii];
686*db41cccfSHong Zhang       rbuf3_i = rbuf3[ii];
687*db41cccfSHong Zhang       //rbuf4_i = rbuf4[ii];
688*db41cccfSHong Zhang       for (j=1; j<=jmax; j++) {
689*db41cccfSHong Zhang         is_no     = sbuf1_i[2*j-1];
690*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
691*db41cccfSHong Zhang 	lrow1_grow1 = rowmaps[is_no];
692*db41cccfSHong Zhang 	lcol1_gcol1 = colmaps[is_no];
693*db41cccfSHong Zhang #else
694*db41cccfSHong Zhang         rmap_i    = rmap[is_no];
695*db41cccfSHong Zhang         cmap_i    = cmap[is_no];
696*db41cccfSHong Zhang #endif
697*db41cccfSHong Zhang         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
698*db41cccfSHong Zhang         imat_ilen = mat->ilen;
699*db41cccfSHong Zhang         imat_j    = mat->j;
700*db41cccfSHong Zhang         imat_i    = mat->i;
701*db41cccfSHong Zhang         //imat_a    = mat->a;
702*db41cccfSHong Zhang         max1      = sbuf1_i[2*j];
703*db41cccfSHong Zhang         for (k=0; k<max1; k++,ct1++) {
704*db41cccfSHong Zhang           row   = sbuf1_i[ct1];
705*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
706*db41cccfSHong Zhang 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
707*db41cccfSHong Zhang           row--;
708*db41cccfSHong Zhang           if(row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
709*db41cccfSHong Zhang #else
710*db41cccfSHong Zhang           row   = rmap_i[row];
711*db41cccfSHong Zhang #endif
712*db41cccfSHong Zhang           ilen  = imat_ilen[row];
713*db41cccfSHong Zhang           mat_i = imat_i[row];
714*db41cccfSHong Zhang           //mat_a = imat_a + mat_i*bs2;
715*db41cccfSHong Zhang           mat_j = imat_j + mat_i;
716*db41cccfSHong Zhang           max2 = rbuf2_i[ct1];
717*db41cccfSHong Zhang           for (l=0; l<max2; l++,ct2++) {
718*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
719*db41cccfSHong Zhang 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
720*db41cccfSHong Zhang 	    if (tcol) {
721*db41cccfSHong Zhang #else
722*db41cccfSHong Zhang 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
723*db41cccfSHong Zhang #endif
724*db41cccfSHong Zhang               *mat_j++    = tcol - 1;
725*db41cccfSHong Zhang               /* *mat_a++= rbuf4_i[ct2]; */
726*db41cccfSHong Zhang               //ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
727*db41cccfSHong Zhang               //mat_a      += bs2;
728*db41cccfSHong Zhang               ilen++;
729*db41cccfSHong Zhang             }
730*db41cccfSHong Zhang           }
731*db41cccfSHong Zhang           imat_ilen[row] = ilen;
732*db41cccfSHong Zhang         }
733*db41cccfSHong Zhang       }
734*db41cccfSHong Zhang     }
735*db41cccfSHong Zhang   }
736*db41cccfSHong Zhang     //ierr = PetscFree(r_status4);CHKERRQ(ierr);
737*db41cccfSHong Zhang     //ierr = PetscFree(r_waits4);CHKERRQ(ierr);
738*db41cccfSHong Zhang     //if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
739*db41cccfSHong Zhang     //ierr = PetscFree(s_waits4);CHKERRQ(ierr);
740*db41cccfSHong Zhang     //ierr = PetscFree(s_status4);CHKERRQ(ierr);
741*db41cccfSHong Zhang 
742*db41cccfSHong Zhang   /* Restore the indices */
743*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
744*db41cccfSHong Zhang     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
745*db41cccfSHong Zhang     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
746*db41cccfSHong Zhang   }
747*db41cccfSHong Zhang 
748*db41cccfSHong Zhang   /* Destroy allocated memory */
749*db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE)
750*db41cccfSHong Zhang   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
751*db41cccfSHong Zhang #else
752*db41cccfSHong Zhang   ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
753*db41cccfSHong Zhang #endif
754*db41cccfSHong Zhang   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
755*db41cccfSHong Zhang   ierr = PetscFree(pa);CHKERRQ(ierr);
756*db41cccfSHong Zhang 
757*db41cccfSHong Zhang   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
758*db41cccfSHong Zhang   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
759*db41cccfSHong Zhang   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
760*db41cccfSHong Zhang   for (i=0; i<nrqr; ++i) {
761*db41cccfSHong Zhang     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
762*db41cccfSHong Zhang   }
763*db41cccfSHong Zhang   for (i=0; i<nrqs; ++i) {
764*db41cccfSHong Zhang     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
765*db41cccfSHong Zhang     //ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
766*db41cccfSHong Zhang   }
767*db41cccfSHong Zhang   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
768*db41cccfSHong Zhang   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
769*db41cccfSHong Zhang   //ierr = PetscFree(rbuf4);CHKERRQ(ierr);
770*db41cccfSHong Zhang   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
771*db41cccfSHong Zhang   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
772*db41cccfSHong Zhang   //ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
773*db41cccfSHong Zhang   //ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
774*db41cccfSHong Zhang 
775*db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE)
776*db41cccfSHong Zhang   for (i=0; i<ismax; i++){
777*db41cccfSHong Zhang     ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
778*db41cccfSHong Zhang     ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
779*db41cccfSHong Zhang   }
780*db41cccfSHong Zhang   ierr = PetscFree(colmaps);CHKERRQ(ierr);
781*db41cccfSHong Zhang   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
782*db41cccfSHong Zhang #else
783*db41cccfSHong Zhang   ierr = PetscFree(rmap);CHKERRQ(ierr);
784*db41cccfSHong Zhang   ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
785*db41cccfSHong Zhang   ierr = PetscFree(cmap);CHKERRQ(ierr);
786*db41cccfSHong Zhang #endif
787*db41cccfSHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
788*db41cccfSHong Zhang 
789*db41cccfSHong Zhang   for (i=0; i<ismax; i++) {
790*db41cccfSHong Zhang     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
791*db41cccfSHong Zhang     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
792*db41cccfSHong Zhang   }
793*db41cccfSHong Zhang   PetscFunctionReturn(0);
794*db41cccfSHong Zhang }
795*db41cccfSHong Zhang 
796*db41cccfSHong Zhang /* ------------------------------------------------------- */
797*db41cccfSHong Zhang 
798632d0f97SHong Zhang #undef __FUNCT__
799632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ"
80013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov)
801632d0f97SHong Zhang {
8026849ba73SBarry Smith   PetscErrorCode ierr;
803d0f46423SBarry Smith   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
804632d0f97SHong Zhang   IS             *is_new;
805632d0f97SHong Zhang 
806632d0f97SHong Zhang   PetscFunctionBegin;
807c910923dSHong Zhang   ierr = PetscMalloc(is_max*sizeof(IS),&is_new);CHKERRQ(ierr);
808632d0f97SHong Zhang   /* Convert the indices into block format */
80927f478b1SHong Zhang   ierr = ISCompressIndicesGeneral(N,bs,is_max,is,is_new);CHKERRQ(ierr);
810e32f2f54SBarry Smith   if (ov < 0){ SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
811*db41cccfSHong Zhang 
812*db41cccfSHong Zhang   //------- new ----------
813*db41cccfSHong Zhang   PetscBool flg=PETSC_FALSE;
814*db41cccfSHong Zhang   ierr = PetscOptionsHasName(PETSC_NULL, "-IncreaseOverlap_new", &flg);CHKERRQ(ierr);
815*db41cccfSHong Zhang   if (!flg){ /* previous non-scalable implementation */
816632d0f97SHong Zhang     for (i=0; i<ov; ++i) {
817c910923dSHong Zhang       ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
818632d0f97SHong Zhang     }
819*db41cccfSHong Zhang   } else { /* scalable implementation using modified BAIJ routines */
820*db41cccfSHong Zhang 
821*db41cccfSHong Zhang   Mat           *submats;
822*db41cccfSHong Zhang   IS            *is_row;
823*db41cccfSHong Zhang   PetscInt      M=C->rmap->N,Mbs=M/bs,*nidx,isz,iov;
824*db41cccfSHong Zhang   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
825*db41cccfSHong Zhang   PetscMPIInt   rank=c->rank;
826*db41cccfSHong Zhang 
827*db41cccfSHong Zhang 
828*db41cccfSHong Zhang   ierr = PetscMalloc((Mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
829*db41cccfSHong Zhang 
830*db41cccfSHong Zhang   /* Create is_row */
831*db41cccfSHong Zhang   ierr = PetscMalloc(is_max*sizeof(IS **),&is_row);CHKERRQ(ierr);
832*db41cccfSHong Zhang   for (i=0; i<is_max; i++){
833*db41cccfSHong Zhang     ierr = ISCreateStride(PETSC_COMM_SELF,Mbs,0,1,is_row+i);CHKERRQ(ierr);
834*db41cccfSHong Zhang   }
835*db41cccfSHong Zhang 
836*db41cccfSHong Zhang   for (iov=0; iov<ov; ++iov) {
837*db41cccfSHong Zhang     /* Get submats for column search - replace with MatGetSubMatrices_MPIBAIJ_IJ() */
838*db41cccfSHong Zhang     // copied from MatGetSubMatrices_MPIBAIJ() ----------------
839*db41cccfSHong Zhang     /* Allocate memory to hold all the submatrices */
840*db41cccfSHong Zhang     //if (scall != MAT_REUSE_MATRIX) {
841*db41cccfSHong Zhang     ierr = PetscMalloc((is_max+1)*sizeof(Mat),&submats);CHKERRQ(ierr);
842*db41cccfSHong Zhang     //}
843*db41cccfSHong Zhang     /* Determine the number of stages through which submatrices are done */
844*db41cccfSHong Zhang     PetscInt nmax,nstages_local,nstages,max_no,pos;
845*db41cccfSHong Zhang     nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
846*db41cccfSHong Zhang     if (!nmax) nmax = 1;
847*db41cccfSHong Zhang     nstages_local = is_max/nmax + ((is_max % nmax)?1:0);
848*db41cccfSHong Zhang 
849*db41cccfSHong Zhang     /* Make sure every processor loops through the nstages */
850*db41cccfSHong Zhang     ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
851*db41cccfSHong Zhang     for (i=0,pos=0; i<nstages; i++) {
852*db41cccfSHong Zhang       if (pos+nmax <= is_max) max_no = nmax;
853*db41cccfSHong Zhang       else if (pos == is_max) max_no = 0;
854*db41cccfSHong Zhang       else                   max_no = is_max-pos;
855*db41cccfSHong Zhang 
856*db41cccfSHong Zhang       //ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr);
857*db41cccfSHong Zhang       ierr = MatGetSubMatrices_MPIBAIJ_local_ijonly(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr);
858*db41cccfSHong Zhang       if (rank == 10 && !i){
859*db41cccfSHong Zhang         printf("submats[0]:\n");
860*db41cccfSHong Zhang         ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
861*db41cccfSHong Zhang       }
862*db41cccfSHong Zhang 
863*db41cccfSHong Zhang       pos += max_no;
864*db41cccfSHong Zhang     }
865*db41cccfSHong Zhang     // end of copy -----------------------------------
866*db41cccfSHong Zhang 
867*db41cccfSHong Zhang     /* Row search */
868*db41cccfSHong Zhang     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
869*db41cccfSHong Zhang 
870*db41cccfSHong Zhang     /* Column search */
871*db41cccfSHong Zhang     PetscBT        table;
872*db41cccfSHong Zhang     Mat_SeqSBAIJ   *asub_i;
873*db41cccfSHong Zhang     PetscInt       *ai,brow,nz,nis,l;
874*db41cccfSHong Zhang     const PetscInt *idx;
875*db41cccfSHong Zhang 
876*db41cccfSHong Zhang     ierr = PetscBTCreate(Mbs,table);CHKERRQ(ierr);
877*db41cccfSHong Zhang     for (i=0; i<is_max; i++){
878*db41cccfSHong Zhang       asub_i = (Mat_SeqSBAIJ*)submats[i]->data;
879*db41cccfSHong Zhang       ai=asub_i->i;;
880*db41cccfSHong Zhang 
881*db41cccfSHong Zhang       /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
882*db41cccfSHong Zhang       ierr = PetscBTMemzero(Mbs,table);CHKERRQ(ierr);
883*db41cccfSHong Zhang 
884*db41cccfSHong Zhang       ierr = ISGetIndices(is_new[i],&idx);CHKERRQ(ierr);
885*db41cccfSHong Zhang       ierr = ISGetLocalSize(is_new[i],&nis);CHKERRQ(ierr);
886*db41cccfSHong Zhang       for (l=0; l<nis; l++) {
887*db41cccfSHong Zhang         ierr = PetscBTSet(table,idx[l]);CHKERRQ(ierr);
888*db41cccfSHong Zhang         nidx[l] = idx[l];
889*db41cccfSHong Zhang       }
890*db41cccfSHong Zhang       isz = nis;
891*db41cccfSHong Zhang 
892*db41cccfSHong Zhang       for (brow=0; brow<Mbs; brow++){
893*db41cccfSHong Zhang         nz = ai[brow+1] - ai[brow];
894*db41cccfSHong Zhang         if (nz) {
895*db41cccfSHong Zhang           if (!PetscBTLookupSet(table,brow)) nidx[isz++] = brow;
896*db41cccfSHong Zhang         }
897*db41cccfSHong Zhang       }
898*db41cccfSHong Zhang       ierr = ISRestoreIndices(is_new[i],&idx);CHKERRQ(ierr);
899*db41cccfSHong Zhang       ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);
900*db41cccfSHong Zhang 
901*db41cccfSHong Zhang       /* create updated is_new */
902*db41cccfSHong Zhang       ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,PETSC_COPY_VALUES,is_new+i);CHKERRQ(ierr);
903*db41cccfSHong Zhang     }
904*db41cccfSHong Zhang 
905*db41cccfSHong Zhang     /* Free tmp spaces */
906*db41cccfSHong Zhang     ierr = PetscBTDestroy(table);CHKERRQ(ierr);
907*db41cccfSHong Zhang     for (i=0; i<is_max; i++){
908*db41cccfSHong Zhang       if (rank == 10 && !i){
909*db41cccfSHong Zhang         printf("submats[0]:\n");
910*db41cccfSHong Zhang         ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
911*db41cccfSHong Zhang       }
912*db41cccfSHong Zhang       ierr = MatDestroy(submats[i]);CHKERRQ(ierr);
913*db41cccfSHong Zhang     }
914*db41cccfSHong Zhang     ierr = PetscFree(submats);CHKERRQ(ierr);
915*db41cccfSHong Zhang   }
916*db41cccfSHong Zhang 
917*db41cccfSHong Zhang   for (i=0; i<is_max; i++){
918*db41cccfSHong Zhang     ierr = ISDestroy(is_row[i]);CHKERRQ(ierr);
919*db41cccfSHong Zhang   }
920*db41cccfSHong Zhang   ierr = PetscFree(is_row);CHKERRQ(ierr);
921*db41cccfSHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
922*db41cccfSHong Zhang   }
923*db41cccfSHong Zhang   //--------end of new----------
924*db41cccfSHong Zhang 
925c910923dSHong Zhang   for (i=0; i<is_max; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
92627f478b1SHong Zhang   ierr = ISExpandIndicesGeneral(N,bs,is_max,is_new,is);CHKERRQ(ierr);
927*db41cccfSHong Zhang 
928c910923dSHong Zhang   for (i=0; i<is_max; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
929632d0f97SHong Zhang   ierr = PetscFree(is_new);CHKERRQ(ierr);
930632d0f97SHong Zhang   PetscFunctionReturn(0);
931632d0f97SHong Zhang }
932632d0f97SHong Zhang 
9334a69c536SBarry Smith typedef enum {MINE,OTHER} WhoseOwner;
9340472cc68SHong Zhang /*  data1, odata1 and odata2 are packed in the format (for communication):
935a2a9f22aSHong Zhang        data[0]          = is_max, no of is
936a2a9f22aSHong Zhang        data[1]          = size of is[0]
937a2a9f22aSHong Zhang         ...
938a2a9f22aSHong Zhang        data[is_max]     = size of is[is_max-1]
939a2a9f22aSHong Zhang        data[is_max + 1] = data(is[0])
940a2a9f22aSHong Zhang         ...
941a2a9f22aSHong Zhang        data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
942a2a9f22aSHong Zhang         ...
9430472cc68SHong Zhang    data2 is packed in the format (for creating output is[]):
9440472cc68SHong Zhang        data[0]          = is_max, no of is
9450472cc68SHong Zhang        data[1]          = size of is[0]
9460472cc68SHong Zhang         ...
9470472cc68SHong Zhang        data[is_max]     = size of is[is_max-1]
9480472cc68SHong Zhang        data[is_max + 1] = data(is[0])
9490472cc68SHong Zhang         ...
9500472cc68SHong Zhang        data[is_max + 1 + Mbs*i) = data(is[i])
9510472cc68SHong Zhang         ...
952a2a9f22aSHong Zhang */
953632d0f97SHong Zhang #undef __FUNCT__
954632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once"
95513f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])
956632d0f97SHong Zhang {
957632d0f97SHong Zhang   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
9586849ba73SBarry Smith   PetscErrorCode ierr;
95913f74950SBarry Smith   PetscMPIInt    size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len;
9605d0c19d7SBarry Smith   const PetscInt *idx_i;
9615d0c19d7SBarry Smith   PetscInt       idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i,
96213f74950SBarry Smith                  Mbs,i,j,k,*odata1,*odata2,
96313f74950SBarry Smith                  proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est;
96413f74950SBarry Smith   PetscInt       proc_end=0,*iwork,len_unused,nodata2;
96513f74950SBarry Smith   PetscInt       ois_max; /* max no of is[] in each of processor */
966bfc6803cSHong Zhang   char           *t_p;
967632d0f97SHong Zhang   MPI_Comm       comm;
968e8527bf2SHong Zhang   MPI_Request    *s_waits1,*s_waits2,r_req;
969632d0f97SHong Zhang   MPI_Status     *s_status,r_status;
97045f43ab7SHong Zhang   PetscBT        *table;  /* mark indices of this processor's is[] */
971fc70d14eSHong Zhang   PetscBT        table_i;
972bfc6803cSHong Zhang   PetscBT        otable; /* mark indices of other processors' is[] */
973d0f46423SBarry Smith   PetscInt       bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners;
97410eea884SHong Zhang   IS             garray_local,garray_gl;
9755483b11dSHong Zhang 
976632d0f97SHong Zhang   PetscFunctionBegin;
9777adad957SLisandro Dalcin   comm = ((PetscObject)C)->comm;
978632d0f97SHong Zhang   size = c->size;
979632d0f97SHong Zhang   rank = c->rank;
980632d0f97SHong Zhang   Mbs  = c->Mbs;
981632d0f97SHong Zhang 
982c910923dSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
983c910923dSHong Zhang   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
984c910923dSHong Zhang 
985430a0127SHong Zhang   /* create tables used in
986430a0127SHong Zhang      step 1: table[i] - mark c->garray of proc [i]
98745f43ab7SHong Zhang      step 3: table[i] - mark indices of is[i] when whose=MINE
988430a0127SHong Zhang              table[0] - mark incideces of is[] when whose=OTHER */
989430a0127SHong Zhang   len = PetscMax(is_max, size);CHKERRQ(ierr);
99074ed9c26SBarry Smith   ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr);
991430a0127SHong Zhang   for (i=0; i<len; i++) {
992430a0127SHong Zhang     table[i]  = t_p  + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
993430a0127SHong Zhang   }
994430a0127SHong Zhang 
99513f74950SBarry Smith   ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
99610eea884SHong Zhang 
9975483b11dSHong Zhang   /* 1. Send this processor's is[] to other processors */
9985483b11dSHong Zhang   /*---------------------------------------------------*/
999e8527bf2SHong Zhang   /* allocate spaces */
100013f74950SBarry Smith   ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr);
10015483b11dSHong Zhang   len = 0;
1002c910923dSHong Zhang   for (i=0; i<is_max; i++) {
1003632d0f97SHong Zhang     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
1004632d0f97SHong Zhang     len += n[i];
1005632d0f97SHong Zhang   }
1006958c9bccSBarry Smith   if (!len) {
10075483b11dSHong Zhang     is_max = 0;
10085483b11dSHong Zhang   } else {
10095483b11dSHong Zhang     len += 1 + is_max; /* max length of data1 for one processor */
10105483b11dSHong Zhang   }
10115483b11dSHong Zhang 
1012430a0127SHong Zhang 
101313f74950SBarry Smith   ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr);
101413f74950SBarry Smith   ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr);
10155483b11dSHong Zhang   for (i=0; i<size; i++) data1_start[i] = data1 + i*len;
10165483b11dSHong Zhang 
101740cb64c9SJed Brown   ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr);
1018e8527bf2SHong Zhang 
101976f244e2SHong Zhang   /* gather c->garray from all processors */
102070b3c8c7SBarry Smith   ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);CHKERRQ(ierr);
102176f244e2SHong Zhang   ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr);
102276f244e2SHong Zhang   ierr = ISDestroy(garray_local);CHKERRQ(ierr);
1023a7cc72afSBarry Smith   ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr);
102476f244e2SHong Zhang   Bowners[0] = 0;
102576f244e2SHong Zhang   for (i=0; i<size; i++) Bowners[i+1] += Bowners[i];
102676f244e2SHong Zhang 
10275483b11dSHong Zhang   if (is_max){
1028430a0127SHong Zhang     /* hash table ctable which maps c->row to proc_id) */
102913f74950SBarry Smith     ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr);
10305483b11dSHong Zhang     for (proc_id=0,j=0; proc_id<size; proc_id++) {
1031288a2dc7SJed Brown       for (; j<C->rmap->range[proc_id+1]/bs; j++) {
10325483b11dSHong Zhang         ctable[j] = proc_id;
10335483b11dSHong Zhang       }
10345483b11dSHong Zhang     }
10355483b11dSHong Zhang 
1036430a0127SHong Zhang     /* hash tables marking c->garray */
103710eea884SHong Zhang     ierr = ISGetIndices(garray_gl,&idx_i);
10385483b11dSHong Zhang     for (i=0; i<size; i++){
1039430a0127SHong Zhang       table_i = table[i];
1040430a0127SHong Zhang       ierr    = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1041430a0127SHong Zhang       for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/
1042430a0127SHong Zhang         ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr);
10435483b11dSHong Zhang       }
10445483b11dSHong Zhang     }
104510eea884SHong Zhang     ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr);
10465483b11dSHong Zhang   }  /* if (is_max) */
104776f244e2SHong Zhang   ierr = ISDestroy(garray_gl);CHKERRQ(ierr);
10485483b11dSHong Zhang 
10495483b11dSHong Zhang   /* evaluate communication - mesg to who, length, and buffer space */
1050e8527bf2SHong Zhang   for (i=0; i<size; i++) len_s[i] = 0;
10515483b11dSHong Zhang 
10525483b11dSHong Zhang   /* header of data1 */
10535483b11dSHong Zhang   for (proc_id=0; proc_id<size; proc_id++){
10545483b11dSHong Zhang     iwork[proc_id] = 0;
10555483b11dSHong Zhang     *data1_start[proc_id] = is_max;
10565483b11dSHong Zhang     data1_start[proc_id]++;
10575483b11dSHong Zhang     for (j=0; j<is_max; j++) {
10585483b11dSHong Zhang       if (proc_id == rank){
10595483b11dSHong Zhang         *data1_start[proc_id] = n[j];
10605483b11dSHong Zhang       } else {
10615483b11dSHong Zhang         *data1_start[proc_id] = 0;
10625483b11dSHong Zhang       }
10635483b11dSHong Zhang       data1_start[proc_id]++;
10645483b11dSHong Zhang     }
10655483b11dSHong Zhang   }
10665483b11dSHong Zhang 
10675483b11dSHong Zhang   for (i=0; i<is_max; i++) {
10685483b11dSHong Zhang     ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr);
10695483b11dSHong Zhang     for (j=0; j<n[i]; j++){
10705483b11dSHong Zhang       idx = idx_i[j];
10715483b11dSHong Zhang       *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */
10725483b11dSHong Zhang       proc_end = ctable[idx];
10735483b11dSHong Zhang       for (proc_id=0;  proc_id<=proc_end; proc_id++){ /* for others to process */
1074e8527bf2SHong Zhang         if (proc_id == rank ) continue; /* done before this loop */
1075430a0127SHong Zhang         if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx))
1076430a0127SHong Zhang           continue;   /* no need for sending idx to [proc_id] */
10775483b11dSHong Zhang         *data1_start[proc_id] = idx; data1_start[proc_id]++;
10785483b11dSHong Zhang         len_s[proc_id]++;
10795483b11dSHong Zhang       }
10805483b11dSHong Zhang     }
10815483b11dSHong Zhang     /* update header data */
10822cfbe0a4SHong Zhang     for (proc_id=0; proc_id<size; proc_id++){
10835483b11dSHong Zhang       if (proc_id== rank) continue;
10845483b11dSHong Zhang       *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id];
10855483b11dSHong Zhang       iwork[proc_id] = len_s[proc_id] ;
10865483b11dSHong Zhang     }
10875483b11dSHong Zhang     ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr);
1088e8527bf2SHong Zhang   }
10895483b11dSHong Zhang 
10905483b11dSHong Zhang   nrqs = 0; nrqr = 0;
10915483b11dSHong Zhang   for (i=0; i<size; i++){
10925483b11dSHong Zhang     data1_start[i] = data1 + i*len;
10935483b11dSHong Zhang     if (len_s[i]){
10945483b11dSHong Zhang       nrqs++;
10955483b11dSHong Zhang       len_s[i] += 1 + is_max; /* add no. of header msg */
10965483b11dSHong Zhang     }
10975483b11dSHong Zhang   }
10985483b11dSHong Zhang 
10995483b11dSHong Zhang   for (i=0; i<is_max; i++) {
1100a2a9f22aSHong Zhang     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
1101632d0f97SHong Zhang   }
1102bfc6803cSHong Zhang   ierr = PetscFree(n);CHKERRQ(ierr);
110305b42c5fSBarry Smith   ierr = PetscFree(ctable);CHKERRQ(ierr);
11045483b11dSHong Zhang 
11055483b11dSHong Zhang   /* Determine the number of messages to expect, their lengths, from from-ids */
11065483b11dSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr);
11075483b11dSHong Zhang   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr);
1108632d0f97SHong Zhang 
1109632d0f97SHong Zhang   /*  Now  post the sends */
111074ed9c26SBarry Smith   ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr);
1111632d0f97SHong Zhang   k = 0;
11125483b11dSHong Zhang   for (proc_id=0; proc_id<size; proc_id++){  /* send data1 to processor [proc_id] */
11135483b11dSHong Zhang     if (len_s[proc_id]){
111413f74950SBarry Smith       ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr);
1115632d0f97SHong Zhang       k++;
1116632d0f97SHong Zhang     }
1117632d0f97SHong Zhang   }
1118632d0f97SHong Zhang 
111945f43ab7SHong Zhang   /* 2. Receive other's is[] and process. Then send back */
1120bfc6803cSHong Zhang   /*-----------------------------------------------------*/
11215483b11dSHong Zhang   len = 0;
11225483b11dSHong Zhang   for (i=0; i<nrqr; i++){
11235483b11dSHong Zhang     if (len_r1[i] > len)len = len_r1[i];
11245483b11dSHong Zhang   }
112545f43ab7SHong Zhang   ierr = PetscFree(len_r1);CHKERRQ(ierr);
112645f43ab7SHong Zhang   ierr = PetscFree(id_r1);CHKERRQ(ierr);
112745f43ab7SHong Zhang 
112845f43ab7SHong Zhang   for (proc_id=0; proc_id<size; proc_id++)
112945f43ab7SHong Zhang     len_s[proc_id] = iwork[proc_id] = 0;
113045f43ab7SHong Zhang 
113113f74950SBarry Smith   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr);
113213f74950SBarry Smith   ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr);
113376f244e2SHong Zhang   ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr);
113410eea884SHong Zhang 
113510eea884SHong Zhang   len_max = ois_max*(Mbs+1);  /* max space storing all is[] for each receive */
1136240e5896SHong Zhang   len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */
113713f74950SBarry Smith   ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
113810eea884SHong Zhang   nodata2 = 0;       /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
1139240e5896SHong Zhang   odata2_ptr[nodata2] = odata2;
114010eea884SHong Zhang   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */
114110eea884SHong Zhang 
1142632d0f97SHong Zhang   k = 0;
11435483b11dSHong Zhang   while (k < nrqr){
1144632d0f97SHong Zhang     /* Receive messages */
1145bfc6803cSHong Zhang     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr);
1146632d0f97SHong Zhang     if (flag){
114713f74950SBarry Smith       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1148632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
114913f74950SBarry Smith       ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
11505483b11dSHong Zhang       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
1151632d0f97SHong Zhang 
1152fc70d14eSHong Zhang       /*  Process messages */
1153240e5896SHong Zhang       /*  make sure there is enough unused space in odata2 array */
115410eea884SHong Zhang       if (len_unused < len_max){ /* allocate more space for odata2 */
115513f74950SBarry Smith         ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1156240e5896SHong Zhang         odata2_ptr[++nodata2] = odata2;
115710eea884SHong Zhang         len_unused = len_est;
115810eea884SHong Zhang       }
115910eea884SHong Zhang 
116010eea884SHong Zhang       ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr);
1161a2a9f22aSHong Zhang       len = 1 + odata2[0];
1162a2a9f22aSHong Zhang       for (i=0; i<odata2[0]; i++){
1163a2a9f22aSHong Zhang         len += odata2[1 + i];
1164fc70d14eSHong Zhang       }
1165632d0f97SHong Zhang 
1166632d0f97SHong Zhang       /* Send messages back */
116713f74950SBarry Smith       ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr);
1168fc70d14eSHong Zhang       k++;
116910eea884SHong Zhang       odata2     += len;
117010eea884SHong Zhang       len_unused -= len;
117145f43ab7SHong Zhang       len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
1172632d0f97SHong Zhang     }
11735483b11dSHong Zhang   }
11745483b11dSHong Zhang   ierr = PetscFree(odata1);CHKERRQ(ierr);
117545f43ab7SHong Zhang   ierr = PetscBTDestroy(otable);CHKERRQ(ierr);
1176632d0f97SHong Zhang 
117745f43ab7SHong Zhang   /* 3. Do local work on this processor's is[] */
117845f43ab7SHong Zhang   /*-------------------------------------------*/
117945f43ab7SHong Zhang   /* make sure there is enough unused space in odata2(=data) array */
118045f43ab7SHong Zhang   len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */
118110eea884SHong Zhang   if (len_unused < len_max){ /* allocate more space for odata2 */
118213f74950SBarry Smith     ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1183240e5896SHong Zhang     odata2_ptr[++nodata2] = odata2;
118410eea884SHong Zhang     len_unused = len_est;
118510eea884SHong Zhang   }
1186bfc6803cSHong Zhang 
1187240e5896SHong Zhang   data = odata2;
118845f43ab7SHong Zhang   ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr);
118945f43ab7SHong Zhang   ierr = PetscFree(data1_start);CHKERRQ(ierr);
119045f43ab7SHong Zhang 
119145f43ab7SHong Zhang   /* 4. Receive work done on other processors, then merge */
119245f43ab7SHong Zhang   /*------------------------------------------------------*/
119345f43ab7SHong Zhang   /* get max number of messages that this processor expects to recv */
119413f74950SBarry Smith   ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
119513f74950SBarry Smith   ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr);
119674ed9c26SBarry Smith   ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr);
119745f43ab7SHong Zhang 
1198632d0f97SHong Zhang   k = 0;
11995483b11dSHong Zhang   while (k < nrqs){
1200632d0f97SHong Zhang     /* Receive messages */
1201c910923dSHong Zhang     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status);
1202632d0f97SHong Zhang     if (flag){
120313f74950SBarry Smith       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1204632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
120513f74950SBarry Smith       ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
12065483b11dSHong Zhang       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
12075483b11dSHong Zhang       if (len > 1+is_max){ /* Add data2 into data */
12080472cc68SHong Zhang         data2_i = data2 + 1 + is_max;
1209fc70d14eSHong Zhang         for (i=0; i<is_max; i++){
1210fc70d14eSHong Zhang           table_i = table[i];
1211bfc6803cSHong Zhang           data_i  = data + 1 + is_max + Mbs*i;
1212bfc6803cSHong Zhang           isz     = data[1+i];
12130472cc68SHong Zhang           for (j=0; j<data2[1+i]; j++){
12140472cc68SHong Zhang             col = data2_i[j];
1215bfc6803cSHong Zhang             if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;}
1216fc70d14eSHong Zhang           }
1217bfc6803cSHong Zhang           data[1+i] = isz;
12180472cc68SHong Zhang           if (i < is_max - 1) data2_i += data2[1+i];
1219fc70d14eSHong Zhang         }
12205483b11dSHong Zhang       }
1221632d0f97SHong Zhang       k++;
1222632d0f97SHong Zhang     }
12235483b11dSHong Zhang   }
122445f43ab7SHong Zhang   ierr = PetscFree(data2);CHKERRQ(ierr);
122574ed9c26SBarry Smith   ierr = PetscFree2(table,t_p);CHKERRQ(ierr);
12265483b11dSHong Zhang 
12275483b11dSHong Zhang   /* phase 1 sends are complete */
12285483b11dSHong Zhang   ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
12290c468ba9SBarry Smith   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
12305483b11dSHong Zhang   ierr = PetscFree(data1);CHKERRQ(ierr);
12315483b11dSHong Zhang 
1232240e5896SHong Zhang   /* phase 2 sends are complete */
12330c468ba9SBarry Smith   if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);}
123474ed9c26SBarry Smith   ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr);
123545f43ab7SHong Zhang   ierr = PetscFree(s_status);CHKERRQ(ierr);
1236632d0f97SHong Zhang 
1237c910923dSHong Zhang   /* 5. Create new is[] */
1238c910923dSHong Zhang   /*--------------------*/
1239c910923dSHong Zhang   for (i=0; i<is_max; i++) {
1240bfc6803cSHong Zhang     data_i = data + 1 + is_max + Mbs*i;
124170b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
1242632d0f97SHong Zhang   }
124345f43ab7SHong Zhang   for (k=0; k<=nodata2; k++){
124445f43ab7SHong Zhang     ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr);
124545f43ab7SHong Zhang   }
124645f43ab7SHong Zhang   ierr = PetscFree(odata2_ptr);CHKERRQ(ierr);
12475483b11dSHong Zhang 
1248632d0f97SHong Zhang   PetscFunctionReturn(0);
1249632d0f97SHong Zhang }
1250632d0f97SHong Zhang 
1251632d0f97SHong Zhang #undef __FUNCT__
1252632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local"
1253632d0f97SHong Zhang /*
1254dc008846SHong Zhang    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
1255632d0f97SHong Zhang        the work on the local processor.
1256632d0f97SHong Zhang 
1257632d0f97SHong Zhang      Inputs:
1258632d0f97SHong Zhang       C      - MAT_MPISBAIJ;
1259bfc6803cSHong Zhang       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
1260bfc6803cSHong Zhang       whose  - whose is[] to be processed,
1261bfc6803cSHong Zhang                MINE:  this processor's is[]
1262bfc6803cSHong Zhang                OTHER: other processor's is[]
1263632d0f97SHong Zhang      Output:
126410eea884SHong Zhang        nidx  - whose = MINE:
12650472cc68SHong Zhang                      holds input and newly found indices in the same format as data
12660472cc68SHong Zhang                whose = OTHER:
12670472cc68SHong Zhang                      only holds the newly found indices
12680472cc68SHong Zhang        table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
1269632d0f97SHong Zhang */
127076f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
127113f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt *data,PetscInt whose,PetscInt *nidx,PetscBT *table)
1272632d0f97SHong Zhang {
1273632d0f97SHong Zhang   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ*)C->data;
1274dc008846SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)(c->A)->data;
1275dc008846SHong Zhang   Mat_SeqBAIJ    *b = (Mat_SeqBAIJ*)(c->B)->data;
1276dfbe8321SBarry Smith   PetscErrorCode ierr;
127713f74950SBarry Smith   PetscInt       row,mbs,Mbs,*nidx_i,col,col_max,isz,isz0,*ai,*aj,*bi,*bj,*garray,rstart,l;
127813f74950SBarry Smith   PetscInt       a_start,a_end,b_start,b_end,i,j,k,is_max,*idx_i,n;
1279bfc6803cSHong Zhang   PetscBT        table0;  /* mark the indices of input is[] for look up */
1280bfc6803cSHong Zhang   PetscBT        table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */
1281632d0f97SHong Zhang 
1282632d0f97SHong Zhang   PetscFunctionBegin;
128331f99336SHong Zhang   Mbs = c->Mbs; mbs = a->mbs;
1284dc008846SHong Zhang   ai = a->i; aj = a->j;
1285dc008846SHong Zhang   bi = b->i; bj = b->j;
1286632d0f97SHong Zhang   garray = c->garray;
1287899cda47SBarry Smith   rstart = c->rstartbs;
1288dc008846SHong Zhang   is_max = data[0];
1289632d0f97SHong Zhang 
1290fc70d14eSHong Zhang   ierr = PetscBTCreate(Mbs,table0);CHKERRQ(ierr);
1291fc70d14eSHong Zhang 
1292fc70d14eSHong Zhang   nidx[0] = is_max;
1293fc70d14eSHong Zhang   idx_i   = data + is_max + 1; /* ptr to input is[0] array */
1294bfc6803cSHong Zhang   nidx_i  = nidx + is_max + 1; /* ptr to output is[0] array */
1295dc008846SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
1296dc008846SHong Zhang     isz  = 0;
1297fc70d14eSHong Zhang     n = data[1+i]; /* size of input is[i] */
1298dc008846SHong Zhang 
129976f244e2SHong Zhang     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
1300bfc6803cSHong Zhang     if (whose == MINE){ /* process this processor's is[] */
1301bfc6803cSHong Zhang       table_i = table[i];
13020472cc68SHong Zhang       nidx_i  = nidx + 1+ is_max + Mbs*i;
1303bfc6803cSHong Zhang     } else {            /* process other processor's is[] - only use one temp table */
1304430a0127SHong Zhang       table_i = table[0];
1305bfc6803cSHong Zhang     }
1306bfc6803cSHong Zhang     ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1307bfc6803cSHong Zhang     ierr = PetscBTMemzero(Mbs,table0);CHKERRQ(ierr);
130876f244e2SHong Zhang     if (n==0) {
130976f244e2SHong Zhang        nidx[1+i] = 0; /* size of new is[i] */
131076f244e2SHong Zhang        continue;
131176f244e2SHong Zhang     }
131276f244e2SHong Zhang 
131376f244e2SHong Zhang     isz0 = 0; col_max = 0;
1314dc008846SHong Zhang     for (j=0; j<n; j++){
1315dc008846SHong Zhang       col = idx_i[j];
1316e32f2f54SBarry Smith       if (col >= Mbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index col %D >= Mbs %D",col,Mbs);
1317bfc6803cSHong Zhang       if(!PetscBTLookupSet(table_i,col)) {
1318bfc6803cSHong Zhang         ierr = PetscBTSet(table0,col);CHKERRQ(ierr);
13190472cc68SHong Zhang         if (whose == MINE) {nidx_i[isz0] = col;}
1320dbe03f88SHong Zhang         if (col_max < col) col_max = col;
1321bfc6803cSHong Zhang         isz0++;
1322bfc6803cSHong Zhang       }
1323632d0f97SHong Zhang     }
1324dc008846SHong Zhang 
13250472cc68SHong Zhang     if (whose == MINE) {isz = isz0;}
1326fc70d14eSHong Zhang     k = 0;  /* no. of indices from input is[i] that have been examined */
1327dc008846SHong Zhang     for (row=0; row<mbs; row++){
1328dc008846SHong Zhang       a_start = ai[row]; a_end = ai[row+1];
1329dc008846SHong Zhang       b_start = bi[row]; b_end = bi[row+1];
13300472cc68SHong Zhang       if (PetscBTLookup(table0,row+rstart)){ /* row is on input is[i]:
13310472cc68SHong Zhang                                                 do row search: collect all col in this row */
1332dc008846SHong Zhang         for (l = a_start; l<a_end ; l++){ /* Amat */
1333dc008846SHong Zhang           col = aj[l] + rstart;
1334fc70d14eSHong Zhang           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1335dc008846SHong Zhang         }
1336dc008846SHong Zhang         for (l = b_start; l<b_end ; l++){ /* Bmat */
1337dc008846SHong Zhang           col = garray[bj[l]];
1338fc70d14eSHong Zhang           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1339dc008846SHong Zhang         }
1340dc008846SHong Zhang         k++;
1341dc008846SHong Zhang         if (k >= isz0) break; /* for (row=0; row<mbs; row++) */
13420472cc68SHong Zhang       } else { /* row is not on input is[i]:
13430472cc68SHong Zhang                   do col serach: add row onto nidx_i if there is a col in nidx_i */
1344dc008846SHong Zhang         for (l = a_start; l<a_end ; l++){ /* Amat */
1345dc008846SHong Zhang           col = aj[l] + rstart;
134676f244e2SHong Zhang           if (col > col_max) break;
1347dc008846SHong Zhang           if (PetscBTLookup(table0,col)){
1348fc70d14eSHong Zhang             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1349dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
1350632d0f97SHong Zhang           }
1351632d0f97SHong Zhang         }
1352dc008846SHong Zhang         for (l = b_start; l<b_end ; l++){ /* Bmat */
1353dc008846SHong Zhang           col = garray[bj[l]];
135476f244e2SHong Zhang           if (col > col_max) break;
1355dc008846SHong Zhang           if (PetscBTLookup(table0,col)){
1356fc70d14eSHong Zhang             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1357dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
1358632d0f97SHong Zhang           }
1359dc008846SHong Zhang         }
1360dc008846SHong Zhang       }
1361bfc6803cSHong Zhang     }
1362dc008846SHong Zhang 
1363dc008846SHong Zhang     if (i < is_max - 1){
1364fc70d14eSHong Zhang       idx_i  += n;   /* ptr to input is[i+1] array */
1365bfc6803cSHong Zhang       nidx_i += isz; /* ptr to output is[i+1] array */
1366dc008846SHong Zhang     }
1367fc70d14eSHong Zhang     nidx[1+i] = isz; /* size of new is[i] */
13681ab97a2aSSatish Balay   } /* for each is */
1369dc008846SHong Zhang   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
1370632d0f97SHong Zhang 
1371632d0f97SHong Zhang   PetscFunctionReturn(0);
1372632d0f97SHong Zhang }
1373632d0f97SHong Zhang 
1374632d0f97SHong Zhang 
1375