xref: /petsc/src/mat/impls/sbaij/mpi/sbaijov.c (revision db41cccff32efbb513e6f5a641add0fbc314552d)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix.
4    Used for finding submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat,PetscInt*,PetscInt,PetscInt*,PetscBT*);
11 
12 /* -------------------------------------------------------------------------*/
13 /* This code is modified from MatGetSubMatrices_MPIBAIJ_local() */
14 #if defined (PETSC_USE_CTABLE)
15 #undef __FUNCT__
16 #define __FUNCT__ "PetscGetProc_ijonly"
17 PetscErrorCode PetscGetProc_ijonly(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
18 {
19   PetscInt    nGlobalNd = proc_gnode[size];
20   PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
21 
22   PetscFunctionBegin;
23   if (fproc > size) fproc = size;
24   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
25     if (row < proc_gnode[fproc]) fproc--;
26     else                         fproc++;
27   }
28   *rank = fproc;
29   PetscFunctionReturn(0);
30 }
31 #endif
32 
33 #undef __FUNCT__
34 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local_ijonly"
35 PetscErrorCode MatGetSubMatrices_MPIBAIJ_local_ijonly(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
36 {
37   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
38   Mat            A = c->A;
39   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
40   const PetscInt **irow,**icol,*irow_i;
41   PetscInt       *nrow,*ncol,*w3,*w4,start;
42   PetscErrorCode ierr;
43   PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
44   PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
45   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
46   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
47   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
48   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
49   PetscInt       *a_j=a->j,*b_j=b->j,*cworkA,*cworkB; //bs=C->rmap->bs,bs2=c->bs2
50   PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
51   PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
52   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
53   //MPI_Request    *r_waits4,*s_waits4;
54   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
55   //MPI_Status     *r_status4,*s_status4;
56   MPI_Comm       comm;
57   //MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
58   //MatScalar      *a_a=a->a,*b_a=b->a;
59   PetscBool      flag;
60   PetscMPIInt    *onodes1,*olengths1;
61 
62 #if defined (PETSC_USE_CTABLE)
63   PetscInt       tt;
64   PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
65 #else
66   PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
67 #endif
68 
69   PetscFunctionBegin;
70   comm   = ((PetscObject)C)->comm;
71   tag0   = ((PetscObject)C)->tag;
72   size   = c->size;
73   rank   = c->rank;
74 
75   /* Get some new tags to keep the communication clean */
76   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
77   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
79 
80 #if defined(PETSC_USE_CTABLE)
81   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
82 #else
83   ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
84   /* Create hash table for the mapping :row -> proc*/
85   for (i=0,j=0; i<size; i++) {
86     jmax = c->rowners[i+1];
87     for (; j<jmax; j++) {
88       rtable[j] = i;
89     }
90   }
91 #endif
92 
93   for (i=0; i<ismax; i++) {
94     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
95     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
96     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
97     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
98   }
99 
100   /* evaluate communication - mesg to who,length of mesg,and buffer space
101      required. Based on this, buffers are allocated, and data copied into them*/
102   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
103   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
104   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
105   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
106   for (i=0; i<ismax; i++) {
107     ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
108     jmax   = nrow[i];
109     irow_i = irow[i];
110     for (j=0; j<jmax; j++) {
111       row  = irow_i[j];
112 #if defined (PETSC_USE_CTABLE)
113       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
114 #else
115       proc = rtable[row];
116 #endif
117       w4[proc]++;
118     }
119     for (j=0; j<size; j++) {
120       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
121     }
122   }
123 
124   nrqs     = 0;              /* no of outgoing messages */
125   msz      = 0;              /* total mesg length for all proc */
126   w1[rank] = 0;              /* no mesg sent to intself */
127   w3[rank] = 0;
128   for (i=0; i<size; i++) {
129     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
130   }
131   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
132   for (i=0,j=0; i<size; i++) {
133     if (w1[i]) { pa[j] = i; j++; }
134   }
135 
136   /* Each message would have a header = 1 + 2*(no of IS) + data */
137   for (i=0; i<nrqs; i++) {
138     j     = pa[i];
139     w1[j] += w2[j] + 2* w3[j];
140     msz   += w1[j];
141   }
142 
143   /* Determine the number of messages to expect, their lengths, from from-ids */
144   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
145   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
146 
147   /* Now post the Irecvs corresponding to these messages */
148   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
149 
150   ierr = PetscFree(onodes1);CHKERRQ(ierr);
151   ierr = PetscFree(olengths1);CHKERRQ(ierr);
152 
153   /* Allocate Memory for outgoing messages */
154   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
155   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
156   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
157   {
158     PetscInt *iptr = tmp,ict = 0;
159     for (i=0; i<nrqs; i++) {
160       j         = pa[i];
161       iptr     += ict;
162       sbuf1[j]  = iptr;
163       ict       = w1[j];
164     }
165   }
166 
167   /* Form the outgoing messages */
168   /* Initialise the header space */
169   for (i=0; i<nrqs; i++) {
170     j           = pa[i];
171     sbuf1[j][0] = 0;
172     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
173     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
174   }
175 
176   /* Parse the isrow and copy data into outbuf */
177   for (i=0; i<ismax; i++) {
178     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
179     irow_i = irow[i];
180     jmax   = nrow[i];
181     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
182       row  = irow_i[j];
183 #if defined (PETSC_USE_CTABLE)
184       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
185 #else
186       proc = rtable[row];
187 #endif
188       if (proc != rank) { /* copy to the outgoing buf*/
189         ctr[proc]++;
190         *ptr[proc] = row;
191         ptr[proc]++;
192       }
193     }
194     /* Update the headers for the current IS */
195     for (j=0; j<size; j++) { /* Can Optimise this loop too */
196       if ((ctr_j = ctr[j])) {
197         sbuf1_j        = sbuf1[j];
198         k              = ++sbuf1_j[0];
199         sbuf1_j[2*k]   = ctr_j;
200         sbuf1_j[2*k-1] = i;
201       }
202     }
203   }
204 
205   /*  Now  post the sends */
206   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
207   for (i=0; i<nrqs; ++i) {
208     j = pa[i];
209     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
210   }
211 
212   /* Post Recieves to capture the buffer size */
213   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
214   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
215   rbuf2[0] = tmp + msz;
216   for (i=1; i<nrqs; ++i) {
217     j        = pa[i];
218     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
219   }
220   for (i=0; i<nrqs; ++i) {
221     j    = pa[i];
222     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
223   }
224 
225   /* Send to other procs the buf size they should allocate */
226 
227   /* Receive messages*/
228   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
229   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
230   ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
231   {
232     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
233     PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
234 
235     for (i=0; i<nrqr; ++i) {
236       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
237       req_size[idex] = 0;
238       rbuf1_i         = rbuf1[idex];
239       start           = 2*rbuf1_i[0] + 1;
240       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
241       ierr            = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
242       sbuf2_i         = sbuf2[idex];
243       for (j=start; j<end; j++) {
244         id               = rbuf1_i[j] - rstart;
245         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
246         sbuf2_i[j]       = ncols;
247         req_size[idex] += ncols;
248       }
249       req_source[idex] = r_status1[i].MPI_SOURCE;
250       /* form the header */
251       sbuf2_i[0]   = req_size[idex];
252       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
253       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
254     }
255   }
256   ierr = PetscFree(r_status1);CHKERRQ(ierr);
257   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
258 
259   /*  recv buffer sizes */
260   /* Receive messages*/
261 
262   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
263   //ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
264   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
265   //ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
266   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
267 
268   for (i=0; i<nrqs; ++i) {
269     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
270     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
271     //ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
272     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
273     //ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
274   }
275   ierr = PetscFree(r_status2);CHKERRQ(ierr);
276   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
277 
278   /* Wait on sends1 and sends2 */
279   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
280   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
281 
282   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
283   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
284   ierr = PetscFree(s_status1);CHKERRQ(ierr);
285   ierr = PetscFree(s_status2);CHKERRQ(ierr);
286   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
287   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
288 
289   /* Now allocate buffers for a->j, and send them off */
290   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
291   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
292   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
293   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
294 
295   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
296   {
297      for (i=0; i<nrqr; i++) {
298       rbuf1_i   = rbuf1[i];
299       sbuf_aj_i = sbuf_aj[i];
300       ct1       = 2*rbuf1_i[0] + 1;
301       ct2       = 0;
302       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
303         kmax = rbuf1[i][2*j];
304         for (k=0; k<kmax; k++,ct1++) {
305           row    = rbuf1_i[ct1] - rstart;
306           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
307           ncols  = nzA + nzB;
308           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
309 
310           /* load the column indices for this row into cols*/
311           cols  = sbuf_aj_i + ct2;
312           for (l=0; l<nzB; l++) {
313             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
314             else break;
315           }
316           imark = l;
317           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
318           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
319           ct2 += ncols;
320         }
321       }
322       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
323     }
324   }
325   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
326   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
327 
328 #if defined(MV)
329   /* Allocate buffers for a->a, and send them off */
330   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
331   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
332   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
333   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
334 
335   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
336   {
337     for (i=0; i<nrqr; i++) {
338       rbuf1_i   = rbuf1[i];
339       sbuf_aa_i = sbuf_aa[i];
340       ct1       = 2*rbuf1_i[0]+1;
341       ct2       = 0;
342       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
343         kmax = rbuf1_i[2*j];
344         for (k=0; k<kmax; k++,ct1++) {
345           row    = rbuf1_i[ct1] - rstart;
346           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
347           ncols  = nzA + nzB;
348           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
349           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
350 
351           /* load the column values for this row into vals*/
352           vals  = sbuf_aa_i+ct2*bs2;
353           for (l=0; l<nzB; l++) {
354             if ((bmap[cworkB[l]]) < cstart) {
355               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
356             }
357             else break;
358           }
359           imark = l;
360           for (l=0; l<nzA; l++) {
361             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
362           }
363           for (l=imark; l<nzB; l++) {
364             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
365           }
366           ct2 += ncols;
367         }
368       }
369       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
370     }
371   }
372   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
373   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
374 #endif
375   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
376   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
377 
378   /* Form the matrix */
379   /* create col map */
380   {
381     const PetscInt *icol_i;
382 #if defined (PETSC_USE_CTABLE)
383     /* Create row map*/
384     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
385     for (i=0; i<ismax; i++) {
386       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
387     }
388 #else
389     ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
390     ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
391     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
392 #endif
393     for (i=0; i<ismax; i++) {
394       jmax   = ncol[i];
395       icol_i = icol[i];
396 #if defined (PETSC_USE_CTABLE)
397       lcol1_gcol1 = colmaps[i];
398       for (j=0; j<jmax; j++) {
399 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
400       }
401 #else
402       cmap_i = cmap[i];
403       for (j=0; j<jmax; j++) {
404         cmap_i[icol_i[j]] = j+1;
405       }
406 #endif
407     }
408   }
409 
410   /* Create lens which is required for MatCreate... */
411   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
412   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
413   lens[0] = (PetscInt*)(lens + ismax);
414   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
415   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
416 
417   /* Update lens from local data */
418   for (i=0; i<ismax; i++) {
419     jmax   = nrow[i];
420 #if defined (PETSC_USE_CTABLE)
421     lcol1_gcol1 = colmaps[i];
422 #else
423     cmap_i = cmap[i];
424 #endif
425     irow_i = irow[i];
426     lens_i = lens[i];
427     for (j=0; j<jmax; j++) {
428       row  = irow_i[j];
429 #if defined (PETSC_USE_CTABLE)
430       ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
431 #else
432       proc = rtable[row];
433 #endif
434       if (proc == rank) {
435         /* Get indices from matA and then from matB */
436         row    = row - rstart;
437         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
438         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
439 #if defined (PETSC_USE_CTABLE)
440        for (k=0; k<nzA; k++) {
441 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
442           if (tt) { lens_i[j]++; }
443         }
444         for (k=0; k<nzB; k++) {
445 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
446           if (tt) { lens_i[j]++; }
447         }
448 #else
449         for (k=0; k<nzA; k++) {
450           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
451         }
452         for (k=0; k<nzB; k++) {
453           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
454         }
455 #endif
456       }
457     }
458   }
459 #if defined (PETSC_USE_CTABLE)
460   /* Create row map*/
461   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
462   for (i=0; i<ismax; i++){
463     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
464   }
465 #else
466   /* Create row map*/
467   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
468   rmap[0] = (PetscInt*)(rmap + ismax);
469   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
470   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
471 #endif
472   for (i=0; i<ismax; i++) {
473     irow_i = irow[i];
474     jmax   = nrow[i];
475 #if defined (PETSC_USE_CTABLE)
476     lrow1_grow1 = rowmaps[i];
477     for (j=0; j<jmax; j++) {
478       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
479     }
480 #else
481     rmap_i = rmap[i];
482     for (j=0; j<jmax; j++) {
483       rmap_i[irow_i[j]] = j;
484     }
485 #endif
486   }
487 
488   /* Update lens from offproc data */
489   {
490     PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
491     PetscMPIInt ii;
492 
493     for (tmp2=0; tmp2<nrqs; tmp2++) {
494       ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
495       idex   = pa[ii];
496       sbuf1_i = sbuf1[idex];
497       jmax    = sbuf1_i[0];
498       ct1     = 2*jmax+1;
499       ct2     = 0;
500       rbuf2_i = rbuf2[ii];
501       rbuf3_i = rbuf3[ii];
502       for (j=1; j<=jmax; j++) {
503         is_no   = sbuf1_i[2*j-1];
504         max1    = sbuf1_i[2*j];
505         lens_i  = lens[is_no];
506 #if defined (PETSC_USE_CTABLE)
507 	lcol1_gcol1 = colmaps[is_no];
508 	lrow1_grow1 = rowmaps[is_no];
509 #else
510         cmap_i  = cmap[is_no];
511 	rmap_i  = rmap[is_no];
512 #endif
513         for (k=0; k<max1; k++,ct1++) {
514 #if defined (PETSC_USE_CTABLE)
515 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
516           row--;
517           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
518 #else
519           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
520 #endif
521           max2 = rbuf2_i[ct1];
522           for (l=0; l<max2; l++,ct2++) {
523 #if defined (PETSC_USE_CTABLE)
524 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
525 	    if (tt) {
526               lens_i[row]++;
527             }
528 #else
529             if (cmap_i[rbuf3_i[ct2]]) {
530               lens_i[row]++;
531             }
532 #endif
533           }
534         }
535       }
536     }
537   }
538   ierr = PetscFree(r_status3);CHKERRQ(ierr);
539   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
540   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
541   ierr = PetscFree(s_status3);CHKERRQ(ierr);
542   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
543 
544   /* Create the submatrices */
545   if (scall == MAT_REUSE_MATRIX) {
546     /*
547         Assumes new rows are same length as the old rows, hence bug!
548     */
549     for (i=0; i<ismax; i++) {
550       mat = (Mat_SeqBAIJ *)(submats[i]->data);
551       //if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
552       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
553       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
554       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
555       /* Initial matrix as if empty */
556       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
557       submats[i]->factortype = C->factortype;
558     }
559   } else {
560     for (i=0; i<ismax; i++) {
561       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
562       //ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
563       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr);
564       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
565       ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
566       //ierr = MatSeqBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
567       ierr = MatSeqSBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr);
568       //ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by MatGetSubMatrices_MPISBAIJ()*/
569     }
570   }
571 
572   /* Assemble the matrices */
573   /* First assemble the local rows */
574   {
575     PetscInt       ilen_row,*imat_ilen,*imat_j,*imat_i;
576     //MatScalar *imat_a;
577 
578     for (i=0; i<ismax; i++) {
579       mat       = (Mat_SeqBAIJ*)submats[i]->data;
580       imat_ilen = mat->ilen;
581       imat_j    = mat->j;
582       imat_i    = mat->i;
583       //imat_a    = mat->a;
584 
585 #if defined (PETSC_USE_CTABLE)
586       lcol1_gcol1 = colmaps[i];
587       lrow1_grow1 = rowmaps[i];
588 #else
589       cmap_i    = cmap[i];
590       rmap_i    = rmap[i];
591 #endif
592       irow_i    = irow[i];
593       jmax      = nrow[i];
594       for (j=0; j<jmax; j++) {
595         row      = irow_i[j];
596 #if defined (PETSC_USE_CTABLE)
597 	ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr);
598 #else
599 	proc = rtable[row];
600 #endif
601         if (proc == rank) {
602           row      = row - rstart;
603           nzA      = a_i[row+1] - a_i[row];
604           nzB      = b_i[row+1] - b_i[row];
605           cworkA   = a_j + a_i[row];
606           cworkB   = b_j + b_i[row];
607           //vworkA   = a_a + a_i[row]*bs2;
608           //vworkB   = b_a + b_i[row]*bs2;
609 #if defined (PETSC_USE_CTABLE)
610 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
611           row--;
612           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
613 #else
614           row      = rmap_i[row + rstart];
615 #endif
616           mat_i    = imat_i[row];
617           //mat_a    = imat_a + mat_i*bs2;
618           mat_j    = imat_j + mat_i;
619           ilen_row = imat_ilen[row];
620 
621           /* load the column indices for this row into cols*/
622           for (l=0; l<nzB; l++) {
623 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
624 #if defined (PETSC_USE_CTABLE)
625 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
626 	      if (tcol) {
627 #else
628               if ((tcol = cmap_i[ctmp])) {
629 #endif
630                 *mat_j++ = tcol - 1;
631                 //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
632                 //mat_a   += bs2;
633                 ilen_row++;
634               }
635             } else break;
636           }
637           imark = l;
638           for (l=0; l<nzA; l++) {
639 #if defined (PETSC_USE_CTABLE)
640 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
641 	    if (tcol) {
642 #else
643 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
644 #endif
645               *mat_j++ = tcol - 1;
646               //ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
647               //mat_a   += bs2;
648               ilen_row++;
649             }
650           }
651           for (l=imark; l<nzB; l++) {
652 #if defined (PETSC_USE_CTABLE)
653 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
654 	    if (tcol) {
655 #else
656             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
657 #endif
658               *mat_j++ = tcol - 1;
659               //ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
660               //mat_a   += bs2;
661               ilen_row++;
662             }
663           }
664           imat_ilen[row] = ilen_row;
665         }
666       }
667     }
668   }
669 
670   /*   Now assemble the off proc rows*/
671   {
672     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
673     PetscInt    *imat_j,*imat_i;
674     //MatScalar   *imat_a,*rbuf4_i;
675     PetscMPIInt ii;
676 
677     for (tmp2=0; tmp2<nrqs; tmp2++) {
678       //ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
679       ii = tmp2;  //new
680       idex   = pa[ii];
681       sbuf1_i = sbuf1[idex];
682       jmax    = sbuf1_i[0];
683       ct1     = 2*jmax + 1;
684       ct2     = 0;
685       rbuf2_i = rbuf2[ii];
686       rbuf3_i = rbuf3[ii];
687       //rbuf4_i = rbuf4[ii];
688       for (j=1; j<=jmax; j++) {
689         is_no     = sbuf1_i[2*j-1];
690 #if defined (PETSC_USE_CTABLE)
691 	lrow1_grow1 = rowmaps[is_no];
692 	lcol1_gcol1 = colmaps[is_no];
693 #else
694         rmap_i    = rmap[is_no];
695         cmap_i    = cmap[is_no];
696 #endif
697         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
698         imat_ilen = mat->ilen;
699         imat_j    = mat->j;
700         imat_i    = mat->i;
701         //imat_a    = mat->a;
702         max1      = sbuf1_i[2*j];
703         for (k=0; k<max1; k++,ct1++) {
704           row   = sbuf1_i[ct1];
705 #if defined (PETSC_USE_CTABLE)
706 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
707           row--;
708           if(row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
709 #else
710           row   = rmap_i[row];
711 #endif
712           ilen  = imat_ilen[row];
713           mat_i = imat_i[row];
714           //mat_a = imat_a + mat_i*bs2;
715           mat_j = imat_j + mat_i;
716           max2 = rbuf2_i[ct1];
717           for (l=0; l<max2; l++,ct2++) {
718 #if defined (PETSC_USE_CTABLE)
719 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
720 	    if (tcol) {
721 #else
722 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
723 #endif
724               *mat_j++    = tcol - 1;
725               /* *mat_a++= rbuf4_i[ct2]; */
726               //ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
727               //mat_a      += bs2;
728               ilen++;
729             }
730           }
731           imat_ilen[row] = ilen;
732         }
733       }
734     }
735   }
736     //ierr = PetscFree(r_status4);CHKERRQ(ierr);
737     //ierr = PetscFree(r_waits4);CHKERRQ(ierr);
738     //if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
739     //ierr = PetscFree(s_waits4);CHKERRQ(ierr);
740     //ierr = PetscFree(s_status4);CHKERRQ(ierr);
741 
742   /* Restore the indices */
743   for (i=0; i<ismax; i++) {
744     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
745     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
746   }
747 
748   /* Destroy allocated memory */
749 #if defined(PETSC_USE_CTABLE)
750   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
751 #else
752   ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
753 #endif
754   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
755   ierr = PetscFree(pa);CHKERRQ(ierr);
756 
757   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
758   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
759   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
760   for (i=0; i<nrqr; ++i) {
761     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
762   }
763   for (i=0; i<nrqs; ++i) {
764     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
765     //ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
766   }
767   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
768   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
769   //ierr = PetscFree(rbuf4);CHKERRQ(ierr);
770   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
771   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
772   //ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
773   //ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
774 
775 #if defined (PETSC_USE_CTABLE)
776   for (i=0; i<ismax; i++){
777     ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
778     ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
779   }
780   ierr = PetscFree(colmaps);CHKERRQ(ierr);
781   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
782 #else
783   ierr = PetscFree(rmap);CHKERRQ(ierr);
784   ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
785   ierr = PetscFree(cmap);CHKERRQ(ierr);
786 #endif
787   ierr = PetscFree(lens);CHKERRQ(ierr);
788 
789   for (i=0; i<ismax; i++) {
790     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
791     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
792   }
793   PetscFunctionReturn(0);
794 }
795 
796 /* ------------------------------------------------------- */
797 
798 #undef __FUNCT__
799 #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ"
800 PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov)
801 {
802   PetscErrorCode ierr;
803   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
804   IS             *is_new;
805 
806   PetscFunctionBegin;
807   ierr = PetscMalloc(is_max*sizeof(IS),&is_new);CHKERRQ(ierr);
808   /* Convert the indices into block format */
809   ierr = ISCompressIndicesGeneral(N,bs,is_max,is,is_new);CHKERRQ(ierr);
810   if (ov < 0){ SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
811 
812   //------- new ----------
813   PetscBool flg=PETSC_FALSE;
814   ierr = PetscOptionsHasName(PETSC_NULL, "-IncreaseOverlap_new", &flg);CHKERRQ(ierr);
815   if (!flg){ /* previous non-scalable implementation */
816     for (i=0; i<ov; ++i) {
817       ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
818     }
819   } else { /* scalable implementation using modified BAIJ routines */
820 
821   Mat           *submats;
822   IS            *is_row;
823   PetscInt      M=C->rmap->N,Mbs=M/bs,*nidx,isz,iov;
824   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
825   PetscMPIInt   rank=c->rank;
826 
827 
828   ierr = PetscMalloc((Mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
829 
830   /* Create is_row */
831   ierr = PetscMalloc(is_max*sizeof(IS **),&is_row);CHKERRQ(ierr);
832   for (i=0; i<is_max; i++){
833     ierr = ISCreateStride(PETSC_COMM_SELF,Mbs,0,1,is_row+i);CHKERRQ(ierr);
834   }
835 
836   for (iov=0; iov<ov; ++iov) {
837     /* Get submats for column search - replace with MatGetSubMatrices_MPIBAIJ_IJ() */
838     // copied from MatGetSubMatrices_MPIBAIJ() ----------------
839     /* Allocate memory to hold all the submatrices */
840     //if (scall != MAT_REUSE_MATRIX) {
841     ierr = PetscMalloc((is_max+1)*sizeof(Mat),&submats);CHKERRQ(ierr);
842     //}
843     /* Determine the number of stages through which submatrices are done */
844     PetscInt nmax,nstages_local,nstages,max_no,pos;
845     nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
846     if (!nmax) nmax = 1;
847     nstages_local = is_max/nmax + ((is_max % nmax)?1:0);
848 
849     /* Make sure every processor loops through the nstages */
850     ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
851     for (i=0,pos=0; i<nstages; i++) {
852       if (pos+nmax <= is_max) max_no = nmax;
853       else if (pos == is_max) max_no = 0;
854       else                   max_no = is_max-pos;
855 
856       //ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr);
857       ierr = MatGetSubMatrices_MPIBAIJ_local_ijonly(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr);
858       if (rank == 10 && !i){
859         printf("submats[0]:\n");
860         ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
861       }
862 
863       pos += max_no;
864     }
865     // end of copy -----------------------------------
866 
867     /* Row search */
868     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr);
869 
870     /* Column search */
871     PetscBT        table;
872     Mat_SeqSBAIJ   *asub_i;
873     PetscInt       *ai,brow,nz,nis,l;
874     const PetscInt *idx;
875 
876     ierr = PetscBTCreate(Mbs,table);CHKERRQ(ierr);
877     for (i=0; i<is_max; i++){
878       asub_i = (Mat_SeqSBAIJ*)submats[i]->data;
879       ai=asub_i->i;;
880 
881       /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
882       ierr = PetscBTMemzero(Mbs,table);CHKERRQ(ierr);
883 
884       ierr = ISGetIndices(is_new[i],&idx);CHKERRQ(ierr);
885       ierr = ISGetLocalSize(is_new[i],&nis);CHKERRQ(ierr);
886       for (l=0; l<nis; l++) {
887         ierr = PetscBTSet(table,idx[l]);CHKERRQ(ierr);
888         nidx[l] = idx[l];
889       }
890       isz = nis;
891 
892       for (brow=0; brow<Mbs; brow++){
893         nz = ai[brow+1] - ai[brow];
894         if (nz) {
895           if (!PetscBTLookupSet(table,brow)) nidx[isz++] = brow;
896         }
897       }
898       ierr = ISRestoreIndices(is_new[i],&idx);CHKERRQ(ierr);
899       ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);
900 
901       /* create updated is_new */
902       ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,PETSC_COPY_VALUES,is_new+i);CHKERRQ(ierr);
903     }
904 
905     /* Free tmp spaces */
906     ierr = PetscBTDestroy(table);CHKERRQ(ierr);
907     for (i=0; i<is_max; i++){
908       if (rank == 10 && !i){
909         printf("submats[0]:\n");
910         ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
911       }
912       ierr = MatDestroy(submats[i]);CHKERRQ(ierr);
913     }
914     ierr = PetscFree(submats);CHKERRQ(ierr);
915   }
916 
917   for (i=0; i<is_max; i++){
918     ierr = ISDestroy(is_row[i]);CHKERRQ(ierr);
919   }
920   ierr = PetscFree(is_row);CHKERRQ(ierr);
921   ierr = PetscFree(nidx);CHKERRQ(ierr);
922   }
923   //--------end of new----------
924 
925   for (i=0; i<is_max; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
926   ierr = ISExpandIndicesGeneral(N,bs,is_max,is_new,is);CHKERRQ(ierr);
927 
928   for (i=0; i<is_max; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
929   ierr = PetscFree(is_new);CHKERRQ(ierr);
930   PetscFunctionReturn(0);
931 }
932 
933 typedef enum {MINE,OTHER} WhoseOwner;
934 /*  data1, odata1 and odata2 are packed in the format (for communication):
935        data[0]          = is_max, no of is
936        data[1]          = size of is[0]
937         ...
938        data[is_max]     = size of is[is_max-1]
939        data[is_max + 1] = data(is[0])
940         ...
941        data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
942         ...
943    data2 is packed in the format (for creating output is[]):
944        data[0]          = is_max, no of is
945        data[1]          = size of is[0]
946         ...
947        data[is_max]     = size of is[is_max-1]
948        data[is_max + 1] = data(is[0])
949         ...
950        data[is_max + 1 + Mbs*i) = data(is[i])
951         ...
952 */
953 #undef __FUNCT__
954 #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once"
955 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])
956 {
957   Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
958   PetscErrorCode ierr;
959   PetscMPIInt    size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len;
960   const PetscInt *idx_i;
961   PetscInt       idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i,
962                  Mbs,i,j,k,*odata1,*odata2,
963                  proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est;
964   PetscInt       proc_end=0,*iwork,len_unused,nodata2;
965   PetscInt       ois_max; /* max no of is[] in each of processor */
966   char           *t_p;
967   MPI_Comm       comm;
968   MPI_Request    *s_waits1,*s_waits2,r_req;
969   MPI_Status     *s_status,r_status;
970   PetscBT        *table;  /* mark indices of this processor's is[] */
971   PetscBT        table_i;
972   PetscBT        otable; /* mark indices of other processors' is[] */
973   PetscInt       bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners;
974   IS             garray_local,garray_gl;
975 
976   PetscFunctionBegin;
977   comm = ((PetscObject)C)->comm;
978   size = c->size;
979   rank = c->rank;
980   Mbs  = c->Mbs;
981 
982   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
983   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
984 
985   /* create tables used in
986      step 1: table[i] - mark c->garray of proc [i]
987      step 3: table[i] - mark indices of is[i] when whose=MINE
988              table[0] - mark incideces of is[] when whose=OTHER */
989   len = PetscMax(is_max, size);CHKERRQ(ierr);
990   ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr);
991   for (i=0; i<len; i++) {
992     table[i]  = t_p  + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
993   }
994 
995   ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
996 
997   /* 1. Send this processor's is[] to other processors */
998   /*---------------------------------------------------*/
999   /* allocate spaces */
1000   ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr);
1001   len = 0;
1002   for (i=0; i<is_max; i++) {
1003     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
1004     len += n[i];
1005   }
1006   if (!len) {
1007     is_max = 0;
1008   } else {
1009     len += 1 + is_max; /* max length of data1 for one processor */
1010   }
1011 
1012 
1013   ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr);
1014   ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr);
1015   for (i=0; i<size; i++) data1_start[i] = data1 + i*len;
1016 
1017   ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr);
1018 
1019   /* gather c->garray from all processors */
1020   ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);CHKERRQ(ierr);
1021   ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr);
1022   ierr = ISDestroy(garray_local);CHKERRQ(ierr);
1023   ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr);
1024   Bowners[0] = 0;
1025   for (i=0; i<size; i++) Bowners[i+1] += Bowners[i];
1026 
1027   if (is_max){
1028     /* hash table ctable which maps c->row to proc_id) */
1029     ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr);
1030     for (proc_id=0,j=0; proc_id<size; proc_id++) {
1031       for (; j<C->rmap->range[proc_id+1]/bs; j++) {
1032         ctable[j] = proc_id;
1033       }
1034     }
1035 
1036     /* hash tables marking c->garray */
1037     ierr = ISGetIndices(garray_gl,&idx_i);
1038     for (i=0; i<size; i++){
1039       table_i = table[i];
1040       ierr    = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1041       for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/
1042         ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr);
1043       }
1044     }
1045     ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr);
1046   }  /* if (is_max) */
1047   ierr = ISDestroy(garray_gl);CHKERRQ(ierr);
1048 
1049   /* evaluate communication - mesg to who, length, and buffer space */
1050   for (i=0; i<size; i++) len_s[i] = 0;
1051 
1052   /* header of data1 */
1053   for (proc_id=0; proc_id<size; proc_id++){
1054     iwork[proc_id] = 0;
1055     *data1_start[proc_id] = is_max;
1056     data1_start[proc_id]++;
1057     for (j=0; j<is_max; j++) {
1058       if (proc_id == rank){
1059         *data1_start[proc_id] = n[j];
1060       } else {
1061         *data1_start[proc_id] = 0;
1062       }
1063       data1_start[proc_id]++;
1064     }
1065   }
1066 
1067   for (i=0; i<is_max; i++) {
1068     ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr);
1069     for (j=0; j<n[i]; j++){
1070       idx = idx_i[j];
1071       *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */
1072       proc_end = ctable[idx];
1073       for (proc_id=0;  proc_id<=proc_end; proc_id++){ /* for others to process */
1074         if (proc_id == rank ) continue; /* done before this loop */
1075         if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx))
1076           continue;   /* no need for sending idx to [proc_id] */
1077         *data1_start[proc_id] = idx; data1_start[proc_id]++;
1078         len_s[proc_id]++;
1079       }
1080     }
1081     /* update header data */
1082     for (proc_id=0; proc_id<size; proc_id++){
1083       if (proc_id== rank) continue;
1084       *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id];
1085       iwork[proc_id] = len_s[proc_id] ;
1086     }
1087     ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr);
1088   }
1089 
1090   nrqs = 0; nrqr = 0;
1091   for (i=0; i<size; i++){
1092     data1_start[i] = data1 + i*len;
1093     if (len_s[i]){
1094       nrqs++;
1095       len_s[i] += 1 + is_max; /* add no. of header msg */
1096     }
1097   }
1098 
1099   for (i=0; i<is_max; i++) {
1100     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
1101   }
1102   ierr = PetscFree(n);CHKERRQ(ierr);
1103   ierr = PetscFree(ctable);CHKERRQ(ierr);
1104 
1105   /* Determine the number of messages to expect, their lengths, from from-ids */
1106   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr);
1107   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr);
1108 
1109   /*  Now  post the sends */
1110   ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr);
1111   k = 0;
1112   for (proc_id=0; proc_id<size; proc_id++){  /* send data1 to processor [proc_id] */
1113     if (len_s[proc_id]){
1114       ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr);
1115       k++;
1116     }
1117   }
1118 
1119   /* 2. Receive other's is[] and process. Then send back */
1120   /*-----------------------------------------------------*/
1121   len = 0;
1122   for (i=0; i<nrqr; i++){
1123     if (len_r1[i] > len)len = len_r1[i];
1124   }
1125   ierr = PetscFree(len_r1);CHKERRQ(ierr);
1126   ierr = PetscFree(id_r1);CHKERRQ(ierr);
1127 
1128   for (proc_id=0; proc_id<size; proc_id++)
1129     len_s[proc_id] = iwork[proc_id] = 0;
1130 
1131   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr);
1132   ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr);
1133   ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr);
1134 
1135   len_max = ois_max*(Mbs+1);  /* max space storing all is[] for each receive */
1136   len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */
1137   ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1138   nodata2 = 0;       /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
1139   odata2_ptr[nodata2] = odata2;
1140   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */
1141 
1142   k = 0;
1143   while (k < nrqr){
1144     /* Receive messages */
1145     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr);
1146     if (flag){
1147       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1148       proc_id = r_status.MPI_SOURCE;
1149       ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
1150       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
1151 
1152       /*  Process messages */
1153       /*  make sure there is enough unused space in odata2 array */
1154       if (len_unused < len_max){ /* allocate more space for odata2 */
1155         ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1156         odata2_ptr[++nodata2] = odata2;
1157         len_unused = len_est;
1158       }
1159 
1160       ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr);
1161       len = 1 + odata2[0];
1162       for (i=0; i<odata2[0]; i++){
1163         len += odata2[1 + i];
1164       }
1165 
1166       /* Send messages back */
1167       ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr);
1168       k++;
1169       odata2     += len;
1170       len_unused -= len;
1171       len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
1172     }
1173   }
1174   ierr = PetscFree(odata1);CHKERRQ(ierr);
1175   ierr = PetscBTDestroy(otable);CHKERRQ(ierr);
1176 
1177   /* 3. Do local work on this processor's is[] */
1178   /*-------------------------------------------*/
1179   /* make sure there is enough unused space in odata2(=data) array */
1180   len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */
1181   if (len_unused < len_max){ /* allocate more space for odata2 */
1182     ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
1183     odata2_ptr[++nodata2] = odata2;
1184     len_unused = len_est;
1185   }
1186 
1187   data = odata2;
1188   ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr);
1189   ierr = PetscFree(data1_start);CHKERRQ(ierr);
1190 
1191   /* 4. Receive work done on other processors, then merge */
1192   /*------------------------------------------------------*/
1193   /* get max number of messages that this processor expects to recv */
1194   ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
1195   ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr);
1196   ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr);
1197 
1198   k = 0;
1199   while (k < nrqs){
1200     /* Receive messages */
1201     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status);
1202     if (flag){
1203       ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
1204       proc_id = r_status.MPI_SOURCE;
1205       ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
1206       ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
1207       if (len > 1+is_max){ /* Add data2 into data */
1208         data2_i = data2 + 1 + is_max;
1209         for (i=0; i<is_max; i++){
1210           table_i = table[i];
1211           data_i  = data + 1 + is_max + Mbs*i;
1212           isz     = data[1+i];
1213           for (j=0; j<data2[1+i]; j++){
1214             col = data2_i[j];
1215             if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;}
1216           }
1217           data[1+i] = isz;
1218           if (i < is_max - 1) data2_i += data2[1+i];
1219         }
1220       }
1221       k++;
1222     }
1223   }
1224   ierr = PetscFree(data2);CHKERRQ(ierr);
1225   ierr = PetscFree2(table,t_p);CHKERRQ(ierr);
1226 
1227   /* phase 1 sends are complete */
1228   ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
1229   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
1230   ierr = PetscFree(data1);CHKERRQ(ierr);
1231 
1232   /* phase 2 sends are complete */
1233   if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);}
1234   ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr);
1235   ierr = PetscFree(s_status);CHKERRQ(ierr);
1236 
1237   /* 5. Create new is[] */
1238   /*--------------------*/
1239   for (i=0; i<is_max; i++) {
1240     data_i = data + 1 + is_max + Mbs*i;
1241     ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
1242   }
1243   for (k=0; k<=nodata2; k++){
1244     ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr);
1245   }
1246   ierr = PetscFree(odata2_ptr);CHKERRQ(ierr);
1247 
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNCT__
1252 #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local"
1253 /*
1254    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
1255        the work on the local processor.
1256 
1257      Inputs:
1258       C      - MAT_MPISBAIJ;
1259       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
1260       whose  - whose is[] to be processed,
1261                MINE:  this processor's is[]
1262                OTHER: other processor's is[]
1263      Output:
1264        nidx  - whose = MINE:
1265                      holds input and newly found indices in the same format as data
1266                whose = OTHER:
1267                      only holds the newly found indices
1268        table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
1269 */
1270 /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
1271 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt *data,PetscInt whose,PetscInt *nidx,PetscBT *table)
1272 {
1273   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ*)C->data;
1274   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)(c->A)->data;
1275   Mat_SeqBAIJ    *b = (Mat_SeqBAIJ*)(c->B)->data;
1276   PetscErrorCode ierr;
1277   PetscInt       row,mbs,Mbs,*nidx_i,col,col_max,isz,isz0,*ai,*aj,*bi,*bj,*garray,rstart,l;
1278   PetscInt       a_start,a_end,b_start,b_end,i,j,k,is_max,*idx_i,n;
1279   PetscBT        table0;  /* mark the indices of input is[] for look up */
1280   PetscBT        table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */
1281 
1282   PetscFunctionBegin;
1283   Mbs = c->Mbs; mbs = a->mbs;
1284   ai = a->i; aj = a->j;
1285   bi = b->i; bj = b->j;
1286   garray = c->garray;
1287   rstart = c->rstartbs;
1288   is_max = data[0];
1289 
1290   ierr = PetscBTCreate(Mbs,table0);CHKERRQ(ierr);
1291 
1292   nidx[0] = is_max;
1293   idx_i   = data + is_max + 1; /* ptr to input is[0] array */
1294   nidx_i  = nidx + is_max + 1; /* ptr to output is[0] array */
1295   for (i=0; i<is_max; i++) { /* for each is */
1296     isz  = 0;
1297     n = data[1+i]; /* size of input is[i] */
1298 
1299     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
1300     if (whose == MINE){ /* process this processor's is[] */
1301       table_i = table[i];
1302       nidx_i  = nidx + 1+ is_max + Mbs*i;
1303     } else {            /* process other processor's is[] - only use one temp table */
1304       table_i = table[0];
1305     }
1306     ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
1307     ierr = PetscBTMemzero(Mbs,table0);CHKERRQ(ierr);
1308     if (n==0) {
1309        nidx[1+i] = 0; /* size of new is[i] */
1310        continue;
1311     }
1312 
1313     isz0 = 0; col_max = 0;
1314     for (j=0; j<n; j++){
1315       col = idx_i[j];
1316       if (col >= Mbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index col %D >= Mbs %D",col,Mbs);
1317       if(!PetscBTLookupSet(table_i,col)) {
1318         ierr = PetscBTSet(table0,col);CHKERRQ(ierr);
1319         if (whose == MINE) {nidx_i[isz0] = col;}
1320         if (col_max < col) col_max = col;
1321         isz0++;
1322       }
1323     }
1324 
1325     if (whose == MINE) {isz = isz0;}
1326     k = 0;  /* no. of indices from input is[i] that have been examined */
1327     for (row=0; row<mbs; row++){
1328       a_start = ai[row]; a_end = ai[row+1];
1329       b_start = bi[row]; b_end = bi[row+1];
1330       if (PetscBTLookup(table0,row+rstart)){ /* row is on input is[i]:
1331                                                 do row search: collect all col in this row */
1332         for (l = a_start; l<a_end ; l++){ /* Amat */
1333           col = aj[l] + rstart;
1334           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1335         }
1336         for (l = b_start; l<b_end ; l++){ /* Bmat */
1337           col = garray[bj[l]];
1338           if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;}
1339         }
1340         k++;
1341         if (k >= isz0) break; /* for (row=0; row<mbs; row++) */
1342       } else { /* row is not on input is[i]:
1343                   do col serach: add row onto nidx_i if there is a col in nidx_i */
1344         for (l = a_start; l<a_end ; l++){ /* Amat */
1345           col = aj[l] + rstart;
1346           if (col > col_max) break;
1347           if (PetscBTLookup(table0,col)){
1348             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1349             break; /* for l = start; l<end ; l++) */
1350           }
1351         }
1352         for (l = b_start; l<b_end ; l++){ /* Bmat */
1353           col = garray[bj[l]];
1354           if (col > col_max) break;
1355           if (PetscBTLookup(table0,col)){
1356             if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;}
1357             break; /* for l = start; l<end ; l++) */
1358           }
1359         }
1360       }
1361     }
1362 
1363     if (i < is_max - 1){
1364       idx_i  += n;   /* ptr to input is[i+1] array */
1365       nidx_i += isz; /* ptr to output is[i+1] array */
1366     }
1367     nidx[1+i] = isz; /* size of new is[i] */
1368   } /* for each is */
1369   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
1370 
1371   PetscFunctionReturn(0);
1372 }
1373 
1374 
1375