xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 22ff0165bac93da38892b9f033347e61537570af)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
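
/*
   Illustrative usage sketch (an assumption about a typical caller, not code
   from this file): MatIncreaseOverlap() dispatches to the routine above for
   MPIAIJ matrices.  Growing one set of global row indices by two layers of
   overlap might look like

     IS is[1];
     ierr = ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,2);CHKERRQ(ierr);   (ov = 2 runs _Once twice)

   where nlocal and rows are caller-provided; each pass of the loop above
   widens every IS by one layer of matrix connectivity, and is[0] is
   replaced by the enlarged set.
*/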
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
56 {
57   PetscErrorCode   ierr;
58   MPI_Comm         comm;
59   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
60   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
61   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
62   const PetscInt  *indices_i,**indices;
63   PetscLayout      rmap;
64   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
65   PetscSF          sf;
66   PetscSFNode     *remote;
67 
68   PetscFunctionBegin;
69   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
70   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
71   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
72   /* get row map to determine where rows should be going */
73   ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
74   /* retrieve IS data and put it all together so that we
75    * can optimize communication
76    */
77   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
78   for (i=0,tlength=0; i<nidx; i++){
79     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
80     tlength += length[i];
81     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
82   }
83   /* find these rows on remote processors */
84   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
85   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
86   nrrows = 0;
87   for (i=0; i<nidx; i++){
88     length_i     = length[i];
89     indices_i    = indices[i];
90     for (j=0; j<length_i; j++){
91       owner = -1;
92       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
93       /* remote processors */
94       if (owner != rank){
95         tosizes_temp[owner]++; /* number of rows to owner */
96         rrow_ranks[nrrows]  = owner; /* processor */
97         rrow_isids[nrrows]   = i; /* is id */
98         remoterows[nrrows++] = indices_i[j]; /* row */
99       }
100     }
101     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
102   }
103   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
104   /* test if we need to exchange messages;
105    * generally speaking, we do not need to exchange
106    * data when the overlap is 1
107    */
108   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
109   /* we do not have any messages;
110    * this usually corresponds to overlap 1
111    */
112   if (!reducednrrows){
113     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
114     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
115     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118   nto = 0;
119   /* send sizes and ranks for building a two-sided communication */
120   for (i=0; i<size; i++){
121    if (tosizes_temp[i]){
122      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
123      tosizes_temp[i] = nto; /* a map from processor to index */
124      toranks[nto++]  = i; /* processor */
125    }
126   }
127   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
128   for (i=0; i<nto; i++){
129     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
130     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
131   }
132   /* send information to other processors */
133   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
134   nrecvrows = 0;
135   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
136   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
137   nrecvrows = 0;
138   for (i=0; i<nfrom; i++){
139     for (j=0; j<fromsizes[2*i]; j++){
140       remote[nrecvrows].rank    = fromranks[i];
141       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
142     }
143   }
144   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
145   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
146   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
147   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
148   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
149   /* message pair <IS id, row> */
150   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
151   for (i=0; i<nrrows; i++){
152     owner = rrow_ranks[i]; /* processor */
153     j     = tosizes_temp[owner]; /* index */
154     todata[toffsets[j]++] = rrow_isids[i];
155     todata[toffsets[j]++] = remoterows[i];
156   }
157   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
158   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
159   ierr = PetscFree(toffsets);CHKERRQ(ierr);
160   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
162   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
163   /* send rows that belong to remote processors so that we can get the overlapping data back */
164   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
165   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
166   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
167   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
168   ierr = PetscFree(fromranks);CHKERRQ(ierr);
169   nrecvrows = 0;
170   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
171   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
172   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
173   nrecvrows = 0;
174   for (i=0; i<nto; i++){
175     for (j=0; j<tosizes[2*i]; j++){
176       remote[nrecvrows].rank    = toranks[i];
177       remote[nrecvrows++].index = tosizes[2*i+1]+j;
178     }
179   }
180   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
181   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
182   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
183   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
184   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
185   /* overlap communication and computation */
186   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
187   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
188   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
189   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
190   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
191   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
192   ierr = PetscFree(toranks);CHKERRQ(ierr);
193   ierr = PetscFree(tosizes);CHKERRQ(ierr);
194   ierr = PetscFree(todata);CHKERRQ(ierr);
195   PetscFunctionReturn(0);
196 }
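
/*
   Minimal sketch of the PetscCommBuildTwoSided() handshake used above, with
   made-up values for illustration.  With count = 2 and MPIU_INT, each
   destination rank receives one <size,offset> pair per sender:

     on rank 0: nto = 1, toranks = {2}, tosizes = {6,0}   (6 ints destined for rank 2)
     on rank 2: after the call, nfrom = 1, fromranks = {0}, fromsizes = {6,0}

   The receiver then sizes its PetscSF leaves from fromsizes[2*i] and uses
   fromsizes[2*i+1]+j as the index of leaf j into the sender's packed todata
   buffer.
*/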
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
201 {
202   PetscInt         *isz,isz_i,i,j,is_id, data_size;
203   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
204   const PetscInt   *indices_i_temp;
205   PetscErrorCode    ierr;
206 
207   PetscFunctionBegin;
208   max_lsize = 0;
209   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
212     max_lsize = lsize>max_lsize ? lsize:max_lsize;
213     isz[i]    = lsize;
214   }
215   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
216   for (i=0; i<nidx; i++){
217     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
218     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
219     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
220     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
221   }
222   /* retrieve information to get row id and its overlap */
223   for (i=0; i<nrecvs; ){
224     is_id      = recvdata[i++];
225     data_size  = recvdata[i++];
226     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
227     isz_i      = isz[is_id];
228     for (j=0; j< data_size; j++){
229       col = recvdata[i++];
230       indices_i[isz_i++] = col;
231     }
232     isz[is_id] = isz_i;
233   }
234   /* remove duplicate entries */
235   for (i=0; i<nidx; i++){
236     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
237     isz_i      = isz[i];
238     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
239     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
240   }
241   ierr = PetscFree(isz);CHKERRQ(ierr);
242   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
243   PetscFunctionReturn(0);
244 }
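
/*
   Worked example of the stream decoded above (illustrative values): with
   nrecvs = 7, the buffer

     recvdata = { 0, 2, 11, 42,   1, 1, 7 }

   appends global indices 11 and 42 to is[0] and index 7 to is[1];
   duplicates are then removed by PetscSortRemoveDupsInt() before the ISs
   are recreated.
*/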
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
249 {
250   PetscLayout       rmap,cmap;
251   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
252   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
253   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
254   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
255   Mat               amat,bmat;
256   PetscMPIInt       rank;
257   PetscBool         done;
258   MPI_Comm          comm;
259   PetscErrorCode    ierr;
260 
261   PetscFunctionBegin;
262   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
263   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
264   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
265   /* Even if the matrix is symmetric, we still treat it as if it were nonsymmetric */
266   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
267   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
268   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
269   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
270   /* total number of nonzero values is used to estimate the memory usage in the next step */
271   tnz  = ai[an]+bi[bn];
272   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
273   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
274   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
275   /* to find the longest message */
276   max_fszs = 0;
277   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
278   /* is there a better way to estimate the number of nonzeros in the matrix? */
279   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
280   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
281   rows_pos  = 0;
282   totalrows = 0;
283   for (i=0; i<nfrom; i++){
284     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
285     /* group data together */
286     for (j=0; j<fromsizes[2*i]; j+=2){
287       is_id                       = fromrows[rows_pos++];/* IS id */
288       rows_i                      = rows_data[is_id];
289       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
290     }
291     /* estimate the space needed to avoid repeated allocations */
292     for (j=0; j<nidx; j++){
293       indvc_ij = 0;
294       rows_i   = rows_data[j];
295       for (l=0; l<rows_pos_i[j]; l++){
296         row    = rows_i[l]-rstart;
297         start  = ai[row];
298         end    = ai[row+1];
299         for (k=start; k<end; k++){ /* Amat */
300           col = aj[k] + cstart;
301           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
302         }
303         start = bi[row];
304         end   = bi[row+1];
305         for (k=start; k<end; k++) { /* Bmat */
306           col = gcols[bj[k]];
307           indices_tmp[indvc_ij++] = col;
308         }
309       }
310       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
311       indv_counts[i*nidx+j] = indvc_ij;
312       totalrows            += indvc_ij;
313     }
314   }
315   /* message triple <IS id, number of indices, indices> */
316   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
317   totalrows = 0;
318   rows_pos  = 0;
319   /* regroup the data, reusing the code from above */
320   for (i=0;i<nfrom;i++){
321     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
322     for (j=0; j<fromsizes[2*i]; j+=2){
323       is_id                       = fromrows[rows_pos++];
324       rows_i                      = rows_data[is_id];
325       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
326     }
327     /* add data  */
328     for (j=0; j<nidx; j++){
329       if (!indv_counts[i*nidx+j]) continue;
330       indvc_ij = 0;
331       sbdata[totalrows++] = j;
332       sbdata[totalrows++] = indv_counts[i*nidx+j];
333       sbsizes[2*i]       += 2;
334       rows_i              = rows_data[j];
335       for (l=0; l<rows_pos_i[j]; l++){
336         row   = rows_i[l]-rstart;
337         start = ai[row];
338         end   = ai[row+1];
339         for (k=start; k<end; k++){ /* Amat */
340           col = aj[k] + cstart;
341           indices_tmp[indvc_ij++] = col;
342         }
343         start = bi[row];
344         end   = bi[row+1];
345         for (k=start; k<end; k++) { /* Bmat */
346           col = gcols[bj[k]];
347           indices_tmp[indvc_ij++] = col;
348         }
349       }
350       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
351       sbsizes[2*i]  += indvc_ij;
352       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
353       totalrows += indvc_ij;
354     }
355   }
356   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
357   for (i=0; i<nfrom; i++){
358     offsets[i+1]   = offsets[i] + sbsizes[2*i];
359     sbsizes[2*i+1] = offsets[i];
360   }
361   ierr = PetscFree(offsets);CHKERRQ(ierr);
362   if (sbrowsizes) *sbrowsizes = sbsizes;
363   if (sbrows) *sbrows = sbdata;
364   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
365   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
366   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
367   PetscFunctionReturn(0);
368 }
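
/*
   Shape of the reply assembled above, one block per requested IS (a sketch
   using the names from the code):

     sbdata = { ..., j, indv_counts[i*nidx+j], c_0, ..., c_(n-1), ... }

   where j is the IS id and the c_* are the sorted, de-duplicated global
   column indices found in the requested local rows; sbsizes[2*i] is the
   total length sent back to fromranks[i] and sbsizes[2*i+1] its offset
   into sbdata.
*/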
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
373 {
374   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
375   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
376   PetscInt          lsize,lsize_tmp,owner;
377   PetscMPIInt       rank;
378   Mat               amat,bmat;
379   PetscBool         done;
380   PetscLayout       cmap,rmap;
381   MPI_Comm          comm;
382   PetscErrorCode    ierr;
383 
384   PetscFunctionBegin;
385   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
386   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
387   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
388   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
389   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
390   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
391   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
392   /* is this a safe way to compute the number of nonzero values? */
393   tnz  = ai[an]+bi[bn];
394   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
395   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
397   /* this estimates memory better than the old implementation,
398    * which used the global size of the matrix
399    */
400   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
401   for (i=0; i<nidx; i++) {
402     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
403     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
404     lsize_tmp = 0;
405     for (j=0; j<lsize; j++) {
406       owner = -1;
407       row   = indices[j];
408       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
409       if (owner != rank) continue;
410       /* local number */
411       row  -= rstart;
412       start = ai[row];
413       end   = ai[row+1];
414       for (k=start; k<end; k++) { /* Amat */
415         col = aj[k] + cstart;
416         indices_temp[lsize_tmp++] = col;
417       }
418       start = bi[row];
419       end   = bi[row+1];
420       for (k=start; k<end; k++) { /* Bmat */
421         col = gcols[bj[k]];
422         indices_temp[lsize_tmp++] = col;
423       }
424     }
425    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
426    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
427    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
428    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
429   }
430   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
431   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 
437 /*
438   Sample message format:
439   If processor A wants processor B to process some elements corresponding
440   to index sets is[1] and is[5], the message looks like
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5] ...
450   mesg [n]   = data(is[1])
451   -----------
452   mesg [n+1] ...
453   mesg [m]   = data(is[5])
454   -----------
455 
456   Notes:
457   nrqs - no of requests sent (or to be sent out)
458   nrqr - no of requests received (which have to be, or which have been, processed)
459 */
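
/*
   Concrete instance of the format above (illustrative): to ask another
   process for 3 rows of is[1] and 2 rows of is[5], a rank sends

     mesg = { 2,  1,3,  5,2,  r0,r1,r2,  s0,s1 }

   i.e. a header of 1 + 2*(no of IS) entries followed by the row lists.
*/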
460 #undef __FUNCT__
461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
463 {
464   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
465   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
466   const PetscInt **idx,*idx_i;
467   PetscInt       *n,**data,len;
468 #if defined(PETSC_USE_CTABLE)
469   PetscTable     *table_data,table_data_i;
470   PetscInt       *tdata,tcount,tcount_max;
471 #else
472   PetscInt       *data_i,*d_p;
473 #endif
474   PetscErrorCode ierr;
475   PetscMPIInt    size,rank,tag1,tag2;
476   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
477   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
478   PetscBT        *table;
479   MPI_Comm       comm;
480   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
481   MPI_Status     *s_status,*recv_status;
482   char           *t_p;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
486   size = c->size;
487   rank = c->rank;
488   M    = C->rmap->N;
489 
490   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
491   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
492 
493   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
494 
495   for (i=0; i<imax; i++) {
496     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
497     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
498   }
499 
500   /* evaluate communication - mesg to who, length of mesg, and buffer space
501      required. Based on this, buffers are allocated, and data copied into them */
502   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
503   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
504   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
505   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
506   for (i=0; i<imax; i++) {
507     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
508     idx_i = idx[i];
509     len   = n[i];
510     for (j=0; j<len; j++) {
511       row = idx_i[j];
512       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
513       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
514       w4[proc]++;
515     }
516     for (j=0; j<size; j++) {
517       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
518     }
519   }
520 
521   nrqs     = 0;              /* no of outgoing messages */
522   msz      = 0;              /* total mesg length (for all procs) */
523   w1[rank] = 0;              /* no mesg sent to itself */
524   w3[rank] = 0;
525   for (i=0; i<size; i++) {
526     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
527   }
528   /* pa - is list of processors to communicate with */
529   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
530   for (i=0,j=0; i<size; i++) {
531     if (w1[i]) {pa[j] = i; j++;}
532   }
533 
534   /* Each message would have a header = 1 + 2*(no of IS) + data */
535   for (i=0; i<nrqs; i++) {
536     j      = pa[i];
537     w1[j] += w2[j] + 2*w3[j];
538     msz   += w1[j];
539   }
540 
541   /* Determine the number of messages to expect, their lengths, and the from-ids */
542   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
543   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
544 
545   /* Now post the Irecvs corresponding to these messages */
546   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
547 
548   /* Allocate Memory for outgoing messages */
549   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
550   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
551   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
552 
553   {
554     PetscInt *iptr = tmp,ict  = 0;
555     for (i=0; i<nrqs; i++) {
556       j         = pa[i];
557       iptr     +=  ict;
558       outdat[j] = iptr;
559       ict       = w1[j];
560     }
561   }
562 
563   /* Form the outgoing messages */
564   /* plug in the headers */
565   for (i=0; i<nrqs; i++) {
566     j            = pa[i];
567     outdat[j][0] = 0;
568     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
569     ptr[j]       = outdat[j] + 2*w3[j] + 1;
570   }
571 
572   /* Memory for doing local proc's work */
573   {
574 #if defined(PETSC_USE_CTABLE)
575     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
578     }
579     ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582     }
583 #else
584     PetscInt Mimax = 0,M_BPB_imax = 0;
585     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
586     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
587     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
589     for (i=0; i<imax; i++) {
590       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
591       data[i]  = d_p + M*i;
592     }
593 #endif
594   }
595 
596   /* Parse the IS and update local tables and the outgoing buf with the data */
597   {
598     PetscInt n_i,isz_i,*outdat_j,ctr_j;
599     PetscBT  table_i;
600 
601     for (i=0; i<imax; i++) {
602       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
603       n_i     = n[i];
604       table_i = table[i];
605       idx_i   = idx[i];
606 #if defined(PETSC_USE_CTABLE)
607       table_data_i = table_data[i];
608 #else
609       data_i  = data[i];
610 #endif
611       isz_i   = isz[i];
612       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
613         row  = idx_i[j];
614         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
615         if (proc != rank) { /* copy to the outgoing buffer */
616           ctr[proc]++;
617           *ptr[proc] = row;
618           ptr[proc]++;
619         } else if (!PetscBTLookupSet(table_i,row)) {
620 #if defined(PETSC_USE_CTABLE)
621           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
622 #else
623           data_i[isz_i] = row; /* Update the local table */
624 #endif
625           isz_i++;
626         }
627       }
628       /* Update the headers for the current IS */
629       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
630         if ((ctr_j = ctr[j])) {
631           outdat_j        = outdat[j];
632           k               = ++outdat_j[0];
633           outdat_j[2*k]   = ctr_j;
634           outdat_j[2*k-1] = i;
635         }
636       }
637       isz[i] = isz_i;
638     }
639   }
640 
641   /*  Now  post the sends */
642   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
643   for (i=0; i<nrqs; ++i) {
644     j    = pa[i];
645     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
646   }
647 
648   /* No longer need the original indices */
649   for (i=0; i<imax; ++i) {
650     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
651   }
652   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
653 
654   for (i=0; i<imax; ++i) {
655     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
656   }
657 
658   /* Do Local work */
659 #if defined(PETSC_USE_CTABLE)
660   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
661 #else
662   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
663 #endif
664 
665   /* Receive messages */
666   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
667   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
668 
669   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
670   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
671 
672   /* Phase 1 sends are complete - deallocate buffers */
673   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
674   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
675 
676   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
677   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
678   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
679   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
680   ierr = PetscFree(rbuf);CHKERRQ(ierr);
681 
682 
683   /* Send the data back */
684   /* Do a global reduction to know the buffer space req for incoming messages */
685   {
686     PetscMPIInt *rw1;
687 
688     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
689 
690     for (i=0; i<nrqr; ++i) {
691       proc = recv_status[i].MPI_SOURCE;
692 
693       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
694       rw1[proc] = isz1[i];
695     }
696     ierr = PetscFree(onodes1);CHKERRQ(ierr);
697     ierr = PetscFree(olengths1);CHKERRQ(ierr);
698 
699     /* Determine the number of messages to expect, their lengths, and the from-ids */
700     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
701     ierr = PetscFree(rw1);CHKERRQ(ierr);
702   }
703   /* Now post the Irecvs corresponding to these messages */
704   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
705 
706   /* Now  post the sends */
707   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
708   for (i=0; i<nrqr; ++i) {
709     j    = recv_status[i].MPI_SOURCE;
710     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
711   }
712 
713   /* receive work done on other processors */
714   {
715     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
716     PetscMPIInt idex;
717     PetscBT     table_i;
718     MPI_Status  *status2;
719 
720     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
721     for (i=0; i<nrqs; ++i) {
722       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
723       /* Process the message */
724       rbuf2_i = rbuf2[idex];
725       ct1     = 2*rbuf2_i[0]+1;
726       jmax    = rbuf2[idex][0];
727       for (j=1; j<=jmax; j++) {
728         max     = rbuf2_i[2*j];
729         is_no   = rbuf2_i[2*j-1];
730         isz_i   = isz[is_no];
731         table_i = table[is_no];
732 #if defined(PETSC_USE_CTABLE)
733         table_data_i = table_data[is_no];
734 #else
735         data_i  = data[is_no];
736 #endif
737         for (k=0; k<max; k++,ct1++) {
738           row = rbuf2_i[ct1];
739           if (!PetscBTLookupSet(table_i,row)) {
740 #if defined(PETSC_USE_CTABLE)
741             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
742 #else
743             data_i[isz_i] = row;
744 #endif
745             isz_i++;
746           }
747         }
748         isz[is_no] = isz_i;
749       }
750     }
751 
752     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
753     ierr = PetscFree(status2);CHKERRQ(ierr);
754   }
755 
756 #if defined(PETSC_USE_CTABLE)
757   tcount_max = 0;
758   for (i=0; i<imax; ++i) {
759     table_data_i = table_data[i];
760     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
761     if (tcount_max < tcount) tcount_max = tcount;
762   }
763   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
764 #endif
765 
766   for (i=0; i<imax; ++i) {
767 #if defined(PETSC_USE_CTABLE)
768     PetscTablePosition tpos;
769     table_data_i = table_data[i];
770 
771     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
772     while (tpos) {
773       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
774       tdata[--j] = --k;
775     }
776     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
777 #else
778     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
779 #endif
780   }
781 
782   ierr = PetscFree(onodes2);CHKERRQ(ierr);
783   ierr = PetscFree(olengths2);CHKERRQ(ierr);
784 
785   ierr = PetscFree(pa);CHKERRQ(ierr);
786   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
787   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
790   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
791   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
792   ierr = PetscFree(s_status);CHKERRQ(ierr);
793   ierr = PetscFree(recv_status);CHKERRQ(ierr);
794   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
795   ierr = PetscFree(xdata);CHKERRQ(ierr);
796   ierr = PetscFree(isz1);CHKERRQ(ierr);
797 #if defined(PETSC_USE_CTABLE)
798   for (i=0; i<imax; i++) {
799     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
800   }
801   ierr = PetscFree(table_data);CHKERRQ(ierr);
802   ierr = PetscFree(tdata);CHKERRQ(ierr);
803   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
804 #else
805   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
806 #endif
807   PetscFunctionReturn(0);
808 }
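
/*
   Note on the PETSC_USE_CTABLE branch above (a restatement of the
   convention, not new API): PetscTable cannot store a zero key or value,
   so rows and table positions are shifted up by one on insertion,

     ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);

   and shifted back down when the table is drained into tdata[]
   (tdata[--j] = --k).
*/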
809 
810 #undef __FUNCT__
811 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
812 /*
813    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
814        the work on the local processor.
815 
816      Inputs:
817       C      - MAT_MPIAIJ;
818       imax - total no of index sets processed at a time;
819       table  - an array of char - size = m bits.
820 
821      Output:
822       isz    - array containing the count of the solution elements corresponding
823                to each index set;
824       data or table_data  - pointer to the solutions
825 */
826 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
827 {
828   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
829   Mat        A  = c->A,B = c->B;
830   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
831   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
832   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
833   PetscBT    table_i;
834 #if defined(PETSC_USE_CTABLE)
835   PetscTable         table_data_i;
836   PetscErrorCode     ierr;
837   PetscTablePosition tpos;
838   PetscInt           tcount,*tdata;
839 #else
840   PetscInt           *data_i;
841 #endif
842 
843   PetscFunctionBegin;
844   rstart = C->rmap->rstart;
845   cstart = C->cmap->rstart;
846   ai     = a->i;
847   aj     = a->j;
848   bi     = b->i;
849   bj     = b->j;
850   garray = c->garray;
851 
852   for (i=0; i<imax; i++) {
853 #if defined(PETSC_USE_CTABLE)
854     /* copy existing entries of table_data_i into tdata[] */
855     table_data_i = table_data[i];
856     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
857     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
858 
859     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
860     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
861     while (tpos) {
862       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
863       tdata[--j] = --row;
864       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
865     }
866 #else
867     data_i  = data[i];
868 #endif
869     table_i = table[i];
870     isz_i   = isz[i];
871     max     = isz[i];
872 
873     for (j=0; j<max; j++) {
874 #if defined(PETSC_USE_CTABLE)
875       row   = tdata[j] - rstart;
876 #else
877       row   = data_i[j] - rstart;
878 #endif
879       start = ai[row];
880       end   = ai[row+1];
881       for (k=start; k<end; k++) { /* Amat */
882         val = aj[k] + cstart;
883         if (!PetscBTLookupSet(table_i,val)) {
884 #if defined(PETSC_USE_CTABLE)
885           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
886 #else
887           data_i[isz_i] = val;
888 #endif
889           isz_i++;
890         }
891       }
892       start = bi[row];
893       end   = bi[row+1];
894       for (k=start; k<end; k++) { /* Bmat */
895         val = garray[bj[k]];
896         if (!PetscBTLookupSet(table_i,val)) {
897 #if defined(PETSC_USE_CTABLE)
898           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
899 #else
900           data_i[isz_i] = val;
901 #endif
902           isz_i++;
903         }
904       }
905     }
906     isz[i] = isz_i;
907 
908 #if defined(PETSC_USE_CTABLE)
909     ierr = PetscFree(tdata);CHKERRQ(ierr);
910 #endif
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 #undef __FUNCT__
916 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
917 /*
918       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
919          and return the output
920 
921          Input:
922            C    - the matrix
923            nrqr - no of messages being processed.
924            rbuf - an array of pointers to the received requests
925 
926          Output:
927            xdata - array of messages to be sent back
928            isz1  - size of each message
929 
930   For better efficiency perhaps we should malloc each xdata[i] separately;
931 then if a realloc is required we need only copy the data for that one message
932 rather than all previous ones, as it is now where a single large chunk of
933 memory is used.
934 
935 */
936 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
937 {
938   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
939   Mat            A  = c->A,B = c->B;
940   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
941   PetscErrorCode ierr;
942   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
943   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
944   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
945   PetscInt       *rbuf_i,kmax,rbuf_0;
946   PetscBT        xtable;
947 
948   PetscFunctionBegin;
949   m      = C->rmap->N;
950   rstart = C->rmap->rstart;
951   cstart = C->cmap->rstart;
952   ai     = a->i;
953   aj     = a->j;
954   bi     = b->i;
955   bj     = b->j;
956   garray = c->garray;
957 
958 
959   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
960     rbuf_i =  rbuf[i];
961     rbuf_0 =  rbuf_i[0];
962     ct    += rbuf_0;
963     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
964   }
965 
966   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
967   else max1 = 1;
968   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
969   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
970   ++no_malloc;
971   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
972   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
973 
974   ct3 = 0;
975   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
976     rbuf_i =  rbuf[i];
977     rbuf_0 =  rbuf_i[0];
978     ct1    =  2*rbuf_0+1;
979     ct2    =  ct1;
980     ct3   += ct1;
981     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
982       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
983       oct2 = ct2;
984       kmax = rbuf_i[2*j];
985       for (k=0; k<kmax; k++,ct1++) {
986         row = rbuf_i[ct1];
987         if (!PetscBTLookupSet(xtable,row)) {
988           if (!(ct3 < mem_estimate)) {
989             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
990             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
991             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
992             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
993             xdata[0]     = tmp;
994             mem_estimate = new_estimate; ++no_malloc;
995             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
996           }
997           xdata[i][ct2++] = row;
998           ct3++;
999         }
1000       }
1001       for (k=oct2,max2=ct2; k<max2; k++) {
1002         row   = xdata[i][k] - rstart;
1003         start = ai[row];
1004         end   = ai[row+1];
1005         for (l=start; l<end; l++) {
1006           val = aj[l] + cstart;
1007           if (!PetscBTLookupSet(xtable,val)) {
1008             if (!(ct3 < mem_estimate)) {
1009               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1010               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1011               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1012               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1013               xdata[0]     = tmp;
1014               mem_estimate = new_estimate; ++no_malloc;
1015               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1016             }
1017             xdata[i][ct2++] = val;
1018             ct3++;
1019           }
1020         }
1021         start = bi[row];
1022         end   = bi[row+1];
1023         for (l=start; l<end; l++) {
1024           val = garray[bj[l]];
1025           if (!PetscBTLookupSet(xtable,val)) {
1026             if (!(ct3 < mem_estimate)) {
1027               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1028               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1029               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1030               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1031               xdata[0]     = tmp;
1032               mem_estimate = new_estimate; ++no_malloc;
1033               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1034             }
1035             xdata[i][ct2++] = val;
1036             ct3++;
1037           }
1038         }
1039       }
1040       /* Update the header */
1041       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
1042       xdata[i][2*j-1] = rbuf_i[2*j-1];
1043     }
1044     xdata[i][0] = rbuf_0;
1045     xdata[i+1]  = xdata[i] + ct2;
1046     isz1[i]     = ct2; /* size of each message */
1047   }
1048   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1049   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
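
/*
   Growth strategy used above, restated as a sketch: xdata[] is one
   contiguous slab that is grown geometrically whenever the running count
   ct3 reaches the current estimate,

     new_estimate = (PetscInt)(1.5*mem_estimate)+1;

   after which the per-message pointers xdata[1..i] are rebased onto the new
   slab using isz1[].  The note above suggests per-message allocations as an
   alternative that would copy less on each regrowth.
*/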
1052 /* -------------------------------------------------------------------------*/
1053 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1054 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1055 /*
1056     Every processor gets the entire matrix
1057 */
1058 #undef __FUNCT__
1059 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
1060 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1061 {
1062   Mat            B;
1063   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1064   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1065   PetscErrorCode ierr;
1066   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1067   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1068   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1069   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1070 
1071   PetscFunctionBegin;
1072   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1073   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1074 
1075   if (scall == MAT_INITIAL_MATRIX) {
1076     /* ----------------------------------------------------------------
1077          Tell every processor the number of nonzeros per row
1078     */
1079     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1080     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1081       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1082     }
1083     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1084     for (i=0; i<size; i++) {
1085       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1086       displs[i]     = A->rmap->range[i];
1087     }
1088 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1089     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1090 #else
1091     sendcount = A->rmap->rend - A->rmap->rstart;
1092     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1093 #endif
1094     /* ---------------------------------------------------------------
1095          Create the sequential matrix of the same type as the local block diagonal
1096     */
1097     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1098     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1099     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1100     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1101     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1102     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1103     **Bin = B;
1104     b     = (Mat_SeqAIJ*)B->data;
1105 
1106     /*--------------------------------------------------------------------
1107        Copy my part of matrix column indices over
1108     */
1109     sendcount  = ad->nz + bd->nz;
1110     jsendbuf   = b->j + b->i[rstarts[rank]];
1111     a_jsendbuf = ad->j;
1112     b_jsendbuf = bd->j;
1113     n          = A->rmap->rend - A->rmap->rstart;
1114     cnt        = 0;
1115     for (i=0; i<n; i++) {
1116 
1117       /* put in lower diagonal portion */
1118       m = bd->i[i+1] - bd->i[i];
1119       while (m > 0) {
1120         /* is it above diagonal (in bd (compressed) numbering) */
1121         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1122         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1123         m--;
1124       }
1125 
1126       /* put in diagonal portion */
1127       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1128         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1129       }
1130 
1131       /* put in upper diagonal portion */
1132       while (m-- > 0) {
1133         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134       }
1135     }
1136     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1137 
1138     /*--------------------------------------------------------------------
1139        Gather all column indices to all processors
1140     */
1141     for (i=0; i<size; i++) {
1142       recvcounts[i] = 0;
1143       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1144         recvcounts[i] += lens[j];
1145       }
1146     }
1147     displs[0] = 0;
1148     for (i=1; i<size; i++) {
1149       displs[i] = displs[i-1] + recvcounts[i-1];
1150     }
1151 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1152     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1153 #else
1154     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1155 #endif
1156     /*--------------------------------------------------------------------
1157         Assemble the matrix into usable form (note: numerical values not yet set)
1158     */
1159     /* set the b->ilen (length of each row) values */
1160     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1161     /* set the b->i indices */
1162     b->i[0] = 0;
1163     for (i=1; i<=A->rmap->N; i++) {
1164       b->i[i] = b->i[i-1] + lens[i-1];
1165     }
1166     ierr = PetscFree(lens);CHKERRQ(ierr);
1167     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1168     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1169 
1170   } else {
1171     B = **Bin;
1172     b = (Mat_SeqAIJ*)B->data;
1173   }
1174 
1175   /*--------------------------------------------------------------------
1176        Copy my part of matrix numerical values into the values location
1177   */
1178   if (flag == MAT_GET_VALUES) {
1179     sendcount = ad->nz + bd->nz;
1180     sendbuf   = b->a + b->i[rstarts[rank]];
1181     a_sendbuf = ad->a;
1182     b_sendbuf = bd->a;
1183     b_sendj   = bd->j;
1184     n         = A->rmap->rend - A->rmap->rstart;
1185     cnt       = 0;
1186     for (i=0; i<n; i++) {
1187 
1188       /* put in lower diagonal portion */
1189       m = bd->i[i+1] - bd->i[i];
1190       while (m > 0) {
1191         /* is it above diagonal (in bd (compressed) numbering) */
1192         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1193         sendbuf[cnt++] = *b_sendbuf++;
1194         m--;
1195         b_sendj++;
1196       }
1197 
1198       /* put in diagonal portion */
1199       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1200         sendbuf[cnt++] = *a_sendbuf++;
1201       }
1202 
1203       /* put in upper diagonal portion */
1204       while (m-- > 0) {
1205         sendbuf[cnt++] = *b_sendbuf++;
1206         b_sendj++;
1207       }
1208     }
1209     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1210 
1211     /* -----------------------------------------------------------------
1212        Gather all numerical values to all processors
1213     */
1214     if (!recvcounts) {
1215       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1216     }
1217     for (i=0; i<size; i++) {
1218       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1219     }
1220     displs[0] = 0;
1221     for (i=1; i<size; i++) {
1222       displs[i] = displs[i-1] + recvcounts[i-1];
1223     }
1224     recvbuf = b->a;
1225 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1226     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1227 #else
1228     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1229 #endif
1230   }  /* end of (flag == MAT_GET_VALUES) */
1231   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1232 
1233   if (A->symmetric) {
1234     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1235   } else if (A->hermitian) {
1236     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1237   } else if (A->structurally_symmetric) {
1238     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1239   }
1240   PetscFunctionReturn(0);
1241 }
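
/*
   Pattern used in the routine above (illustrative restatement): with
   MPI_IN_PLACE each rank already holds its own slice of the result at
   displs[rank] inside recvbuf, so

     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,
                    recvbuf,recvcounts,displs,MPIU_INT,comm);

   gathers without a separate send buffer (likewise with MPIU_SCALAR for the
   values); the #else branch passes the local slice explicitly as
   sendbuf/sendcount.
*/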
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
1244 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1245 {
1246   PetscErrorCode ierr;
1247   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1248   Mat_SubMat     *submatj = c->submatis1;
1249   PetscInt       i;
1250 
1251   PetscFunctionBegin;
1252   ierr = submatj->destroy(C);CHKERRQ(ierr);
1253 
1254   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1255 
1256   for (i=0; i<submatj->nrqr; ++i) {
1257     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1258   }
1259   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1260 
1261   ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1262   ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1263 
1264   for (i=0; i<submatj->nrqs; ++i) {
1265     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1266   }
1267   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1268   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1269 
1270   ierr = PetscFree(submatj->r2pmap);CHKERRQ(ierr);
1271 
1272 #if defined(PETSC_USE_CTABLE)
1273   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1274   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1275   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1276 #else
1277   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1278 #endif
1279 
1280   if (!submatj->allcolumns) {
1281 #if defined(PETSC_USE_CTABLE)
1282     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1283 #else
1284     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1285 #endif
1286   }
1287 
1288   ierr = PetscFree(submatj);CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 #undef __FUNCT__
1293 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
1294 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1295 {
1296   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1297   Mat            submat,A = c->A,B = c->B;
1298   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1299   PetscInt       *ai = a->i,*bi = b->i,nzA,nzB;
1300   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1301   const PetscInt *icol,*irow;
1302   PetscInt       nrow,ncol,start;
1303   PetscErrorCode ierr;
1304   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
1305   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1306   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1307   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1308   PetscInt       *lens,ncols,*cols,Crow;
1309 #if defined(PETSC_USE_CTABLE)
1310   PetscTable     cmap,rmap;
1311   PetscInt       *cmap_loc,*rmap_loc;
1312 #else
1313   PetscInt       *cmap,*rmap;
1314 #endif
1315   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1316   PetscInt       *cworkB,lwrite,*bj = b->j,*subcols,*r2pmap;
1317   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1318   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1319   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1320   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1321   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1322   MPI_Comm       comm;
1323   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1324   PetscMPIInt    *onodes1,*olengths1,idex,end;
1325   Mat_SubMat     *smatis1;
1326 #if defined(PETSC_USE_LOG)
1327   PetscLogStage stage[5];
1328 #endif
1329 
1330   PetscFunctionBegin;
1331   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1332 
1333   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1334   size = c->size;
1335   rank = c->rank;
1336 
1337   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1338   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1339   if (allcolumns) {
1340     icol = NULL;
1341     ncol = C->cmap->N;
1342   } else {
1343     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1344     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1345   }
1346 
1347   if (scall == MAT_INITIAL_MATRIX) {
1348     PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp;
1349 
1350     ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr);
1351     ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr);
1352     ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr);
1353     ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr);
1354     ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr);
1355 
1356     ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr);
1357 
1358     /* Get some new tags to keep the communication clean */
1359     tag1 = ((PetscObject)C)->tag;
1360     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1361     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1362 
1363     /* evaluate communication - message to whom, length of each message, and buffer space
1364      required. Based on this, buffers are allocated and data is copied into them */
1365     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
1366     ierr = PetscMalloc1(nrow,&r2pmap);CHKERRQ(ierr);
1367 
1368     /* w4[proc] = number of requested rows owned by proc */
1369     for (j=0; j<nrow; j++) {
1370       proc = 0;
1371       row  = irow[j];
1372       while (row >= C->rmap->range[proc+1]) proc++;
1373       w4[proc]++;
1374       r2pmap[j] = proc; /* map row to proc */
1375     }
1376     for (proc=0; proc<size; proc++) {
1377       if (w4[proc]) { w1[proc] += w4[proc];  w3[proc]++;}
1378     }
1379 
1380     nrqs     = 0;              /* no of outgoing messages */
1381     msz      = 0;              /* total mesg length (for all procs) */
1382     w1[rank] = 0;              /* no mesg sent to self */
1383     w3[rank] = 0;
1384     for (proc=0; proc<size; proc++) {
1385       if (w1[proc])  { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */
1386     }
1387 
1388     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1389     for (i=0,j=0; i<size; i++) {
1390       if (w1[i]) { pa[j] = i; j++; }
1391     }
1392 
1393     /* Each message would have a header = 1 + 2*(no of IS) + data */
1394     for (i=0; i<nrqs; i++) {
1395       proc      = pa[i];
1396       w1[proc] += w2[proc] + 2* w3[proc];
1397       msz      += w1[proc];
1398     }
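    /* A sketch of the resulting message layout (illustrative): a message to proc p
       carrying 3 rows is laid out as
         [ 1, 0, 3, row0, row1, row2 ]
       i.e. the number of IS blocks (always 1 in this single-IS routine), then the
       pair (is_no = 0, number of rows), then the requested global row indices,
       giving the header size 1 + 2*(no of IS) used above. */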
1399     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1400 
1401     /* Determine nrqr, the number of messages to expect, their lengths, and their sources (from-ids) */
1402     /* if w2[i]=1, a message of length w1[i] will be sent to proc[i]; */
1403     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1404 
1405     /* nrqs: nsend; nrqr: nrecv; w1: send message length; onodes1: recv id; olengths1: recv message length */
1406     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1407 
1408     /* Now post the Irecvs corresponding to these messages */
1409     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1410 
1411     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1412     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1413 
1414     /* Allocate Memory for outgoing messages */
1415     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1416     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1417     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1418 
1419     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1420     iptr = tmp;
1421     k    = 0;
1422     for (i=0; i<nrqs; i++) {
1423       proc     = pa[i];
1424       iptr    += k;
1425       sbuf1[proc] = iptr;
1426       k           = w1[proc];
1427     }
1428 
1429     /* Form the outgoing messages */
1430     /* Initialize the header space */
1431     for (i=0; i<nrqs; i++) {
1432       proc           = pa[i];
1433       sbuf1[proc][0] = 0;
1434       ierr           = PetscMemzero(sbuf1[proc]+1,2*w3[proc]*sizeof(PetscInt));CHKERRQ(ierr);
1435       ptr[proc]      = sbuf1[proc] + 2*w3[proc] + 1;
1436     }
1437 
1438     /* Parse the isrow and copy data into outbuf */
1439     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1440     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1441       proc  = 0;
1442       row = irow[j];
1443       while (row >= C->rmap->range[proc+1]) proc++;
1444       if (proc != rank) { /* copy to the outgoing buf */
1445         ctr[proc]++;
1446         *ptr[proc] = row;
1447         ptr[proc]++;
1448       }
1449     }
1450 
1451     /* Update the headers for the current IS */
1452     for (j=0; j<size; j++) { /* Could optimize this loop too */
1453       if ((ctr_j = ctr[j])) {
1454         sbuf1_j        = sbuf1[j];
1455         k              = ++sbuf1_j[0];
1456         sbuf1_j[2*k]   = ctr_j;
1457         sbuf1_j[2*k-1] = 0;
1458       }
1459     }
1460 
1461     /* Now post the sends */
1462     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1463     for (i=0; i<nrqs; ++i) {
1464       proc = pa[i];
1465       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1466     }
1467 
1468     /* Post Receives to capture the buffer size */
1469     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1470     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1471 
1472     rbuf2[0] = tmp + msz;
1473     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
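    /* Note: the PetscMalloc4() above sized tmp at 2*msz entries; the first msz hold
       the outgoing sbuf1 payloads, and the second half is reused here for the rbuf2
       receive buffers, so no separate allocation is needed. */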
1474 
1475     for (i=0; i<nrqs; ++i) {
1476       proc = pa[i];
1477       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1478     }
1479 
1480     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1481 
1482     /* Send to other procs the buf size they should allocate */
1483     /* Receive messages */
1484     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1485     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1486 
1487     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1488     for (i=0; i<nrqr; ++i) {
1489       req_size[i] = 0;
1490       rbuf1_i        = rbuf1[i];
1491       start          = 2*rbuf1_i[0] + 1;
1492       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1493       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1494       sbuf2_i        = sbuf2[i];
1495       for (j=start; j<end; j++) {
1496         k            = rbuf1_i[j] - rstart;
1497         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1498         sbuf2_i[j]   = ncols;
1499         req_size[i] += ncols;
1500       }
1501       req_source1[i] = r_status1[i].MPI_SOURCE;
1502 
1503       /* form the header */
1504       sbuf2_i[0] = req_size[i];
1505       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1506 
1507       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1508     }
1509 
1510     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1511     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1512 
1513     /* rbuf2 is received; post receives for the column indices a->j */
1514     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1515 
1516     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1517     for (i=0; i<nrqs; ++i) {
1518       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1519       req_source2[i] = r_status2[i].MPI_SOURCE;
1520       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1521     }
1522 
1523     /* Wait on sends1 and sends2 */
1524     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1525     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1526     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1527     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1528 
1529     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1530     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1531 
1532     ierr = PetscLogStagePop();CHKERRQ(ierr);
1533     ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr);
1534 
1535     /* Now allocate sending buffers for a->j, and send them off */
1536     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1537     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1538     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1539     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
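    /* sbuf_aj uses the packed-buffer idiom: a single allocation of sum(req_size[i])
       entries, with sbuf_aj[i] pointing at the running offset, so one
       PetscFree(sbuf_aj[0]) later releases every per-request send buffer. */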
1540 
1541     for (i=0; i<nrqr; i++) { /* for each requested message */
1542       rbuf1_i   = rbuf1[i];
1543       sbuf_aj_i = sbuf_aj[i];
1544       ct1       = 2*rbuf1_i[0] + 1;
1545       ct2       = 0;
1546       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1547 
1548       kmax = rbuf1[i][2];
1549       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1550         row    = rbuf1_i[ct1] - rstart;
1551         nzA    = ai[row+1] - ai[row];     nzB = bi[row+1] - bi[row];
1552         ncols  = nzA + nzB;
1553         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1554 
1555         /* load the column indices for this row into cols */
1556         cols = sbuf_aj_i + ct2;
1557 
1558         lwrite = 0;
1559         for (l=0; l<nzB; l++) {
1560           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1561         }
1562         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1563         for (l=0; l<nzB; l++) {
1564           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1565         }
1566 
1567         ct2 += ncols;
1568       }
1569       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1570     }
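    /* The three passes above emit each row's columns in increasing global order:
       off-diagonal columns below cstart, then diagonal columns in [cstart,cend),
       then off-diagonal columns at or beyond cend; this relies on bmap (garray)
       being sorted, which MPIAIJ maintains. */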
1571 
1572     /* create column map (cmap): global col of C -> local col of submat */
1573 #if defined(PETSC_USE_CTABLE)
1574     if (!allcolumns) {
1575       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1576       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1577       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1578         if (icol[j] >= cstart && icol[j] <cend) {
1579           cmap_loc[icol[j] - cstart] = j+1;
1580         } else { /* use PetscTable for non-local col indices */
1581           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1582         }
1583       }
1584     } else {
1585       cmap     = NULL;
1586       cmap_loc = NULL;
1587     }
1588     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1589 #else
1590     if (!allcolumns) {
1591       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1592       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1593     } else {
1594       cmap = NULL;
1595     }
1596 #endif
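    /* cmap stores j+1 rather than j so that a lookup result of 0 can mean "column
       not in the submatrix". For example (hypothetical), with icol = {7,3} we get
       cmap: 7 -> 1, 3 -> 2; any other global column yields 0 and is skipped, and
       hits are decremented (tcol-1 or --tcol) before use. */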
1597 
1598     /* Create lens for MatSeqAIJSetPreallocation() */
1599     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1600 
1601     /* Compute lens from local part of C */
1602     for (j=0; j<nrow; j++) {
1603       proc = 0;
1604       row  = irow[j];
1605       while (row >= C->rmap->range[proc+1]) proc++;
1606       if (proc == rank) {
1607         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1608         if (!allcolumns) {
1609           for (k=0; k<ncols; k++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             if (cols[k] >= cstart && cols[k] <cend) {
1612               tcol = cmap_loc[cols[k] - cstart];
1613             } else {
1614               ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1615             }
1616 #else
1617             tcol = cmap[cols[k]];
1618 #endif
1619             if (tcol) lens[j]++;
1620           }
1621         } else { /* allcolumns */
1622           lens[j] = ncols;
1623         }
1624         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1625       }
1626     }
1627 
1628     /* Create row map (rmap): global row of C -> local row of submat */
1629 #if defined(PETSC_USE_CTABLE)
1630     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1631     for (j=0; j<nrow; j++) {
1632       row  = irow[j];
1633       /*
1634       proc = 0;
1635       while (row >= C->rmap->range[proc+1]) proc++;
1636        */
1637       proc = r2pmap[j];
1638       if (proc == rank) { /* a local row */
1639         rmap_loc[row - rstart] = j;
1640       } else {
1641         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1642       }
1643     }
1644 #else
1645     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1646     for (j=0; j<nrow; j++) {
1647       rmap[irow[j]] = j;
1648     }
1649 #endif
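    /* Note the asymmetry: rmap_loc and the plain-array rmap store j directly, but
       PetscTableFind() returns 0 on a miss, so the table stores j+1 and every
       lookup below decrements the result (row--). */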
1650 
1651     /* Update lens from off-proc data */
1652     /* recv a->j is done */
1653     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1654     for (i=0; i<nrqs; i++) {
1655       proc    = pa[i];
1656       sbuf1_i = sbuf1[proc];
1657       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1658       ct1     = 2 + 1;
1659       ct2     = 0;
1660       rbuf2_i = rbuf2[i]; /* received length of C->j */
1661       rbuf3_i = rbuf3[i]; /* received C->j */
1662 
1663       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1664       max1   = sbuf1_i[2];
1665       for (k=0; k<max1; k++,ct1++) {
1666 #if defined(PETSC_USE_CTABLE)
1667         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1668         row--;
1669         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1670 #else
1671         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1672 #endif
1673         /* Now, store row index of submat in sbuf1_i[ct1] */
1674         sbuf1_i[ct1] = row;
1675 
1676         nnz = rbuf2_i[ct1];
1677         if (!allcolumns) {
1678           for (l=0; l<nnz; l++,ct2++) {
1679 #if defined(PETSC_USE_CTABLE)
1680             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1681               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1682             } else {
1683               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1684             }
1685 #else
1686             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1687 #endif
1688             if (tcol) {
1689               lens[row]++;
1690 
1691               /* We receive the column indices of an entire row of C, but only a subset needs to be
1692                inserted into submat; for reuse, received C->j is replaced with the positions to insert (done in the assembly pass below) */
1693               /* rbuf3_i[ct3++] = ct2; ??? */
1694             }
1695           }
1696         } else { /* allcolumns */
1697           lens[row] += nnz;
1698         }
1699       }
1700     }
1701     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1702     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1703 
1704     /* Create the submatrices */
1705     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1706     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1707 
1708     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1709     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1710     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1711     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1712     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
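    /* Because lens[] holds exact per-row nonzero counts, the MatSetValues_SeqAIJ()
       calls during assembly below should never trigger additional mallocs. */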
1713 
1714     /* create struct Mat_SubMat and attach it to submat */
1715     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1716     subc = (Mat_SeqAIJ*)submat->data;
1717     subc->submatis1 = smatis1;
1718 
1719     smatis1->nrqs        = nrqs;
1720     smatis1->nrqr        = nrqr;
1721     smatis1->rbuf1       = rbuf1;
1722     smatis1->rbuf2       = rbuf2;
1723     smatis1->rbuf3       = rbuf3;
1724     smatis1->sbuf2       = sbuf2;
1725     smatis1->req_source2 = req_source2;
1726 
1727     smatis1->sbuf1       = sbuf1;
1728     smatis1->ptr         = ptr;
1729     smatis1->tmp         = tmp;
1730     smatis1->ctr         = ctr;
1731 
1732     smatis1->pa           = pa;
1733     smatis1->req_size     = req_size;
1734     smatis1->req_source1  = req_source1;
1735 
1736     smatis1->allcolumns  = allcolumns;
1737     smatis1->r2pmap      = r2pmap;
1738     smatis1->rmap        = rmap;
1739     smatis1->cmap        = cmap;
1740 #if defined(PETSC_USE_CTABLE)
1741     smatis1->rmap_loc    = rmap_loc;
1742     smatis1->cmap_loc    = cmap_loc;
1743 #endif
1744 
1745     smatis1->destroy     = submat->ops->destroy;
1746     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1747     submat->factortype   = C->factortype;
1748 
1749     ierr = PetscLogStagePop();CHKERRQ(ierr);
1750     ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr);
1751   } else { /* scall == MAT_REUSE_MATRIX */
1752     submat = submats[0];
1753     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1754 
1755     subc    = (Mat_SeqAIJ*)submat->data;
1756     smatis1 = subc->submatis1;
1757     nrqs        = smatis1->nrqs;
1758     nrqr        = smatis1->nrqr;
1759     rbuf1       = smatis1->rbuf1;
1760     rbuf2       = smatis1->rbuf2;
1761     rbuf3       = smatis1->rbuf3;
1762     req_source2 = smatis1->req_source2;
1763 
1764     sbuf1     = smatis1->sbuf1;
1765     sbuf2     = smatis1->sbuf2;
1766     ptr       = smatis1->ptr;
1767     tmp       = smatis1->tmp;
1768     ctr       = smatis1->ctr;
1769 
1770     pa         = smatis1->pa;
1771     req_size   = smatis1->req_size;
1772     req_source1 = smatis1->req_source1;
1773 
1774     allcolumns = smatis1->allcolumns;
1775     r2pmap     = smatis1->r2pmap;
1776     rmap       = smatis1->rmap;
1777     cmap       = smatis1->cmap;
1778 #if defined(PETSC_USE_CTABLE)
1779     rmap_loc   = smatis1->rmap_loc;
1780     cmap_loc   = smatis1->cmap_loc;
1781 #endif
1782   }
1783 
1784   /* Post receives for matrix values */
1785   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1786   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1787   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1788   for (i=0; i<nrqs; ++i) {
1789     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1790     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1791   }
1792 
1793   /* Allocate sending buffers for a->a, and send them off */
1794   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1795   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1796   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1797   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1798 
1799   for (i=0; i<nrqr; i++) {
1800     rbuf1_i   = rbuf1[i];
1801     sbuf_aa_i = sbuf_aa[i];
1802     ct1       = 2*rbuf1_i[0]+1;
1803     ct2       = 0;
1804     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1805 
1806     kmax = rbuf1_i[2];
1807     for (k=0; k<kmax; k++,ct1++) {
1808       row = rbuf1_i[ct1] - rstart;
1809       nzA = ai[row+1] - ai[row];
1810       nzB = bi[row+1] - bi[row];
1811       ncols  = nzA + nzB;
1812       cworkB = bj + bi[row];
1813       vworkA = a_a + ai[row];
1814       vworkB = b_a + bi[row];
1815 
1816       /* load the column values for this row into vals */
1817       vals = sbuf_aa_i + ct2;
1818 
1819       lwrite = 0;
1820       for (l=0; l<nzB; l++) {
1821         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1822       }
1823       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1824       for (l=0; l<nzB; l++) {
1825         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1826       }
1827 
1828       ct2 += ncols;
1829     }
1830     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1831   }
1832 
1833   if (scall == MAT_INITIAL_MATRIX) {
1834    ierr = PetscLogStagePop();CHKERRQ(ierr);
1835    ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr);
1836   }
1837 
1838   /* Assemble submat */
1839   if (!allcolumns) {
1840     ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr);
1841   }
1842   /* First assemble the local rows */
1843   for (j=0; j<nrow; j++) {
1844     row  = irow[j];
1845     /*
1846     proc = 0;
1847     while (row >= C->rmap->range[proc+1]) proc++;
1848      */
1849     proc = r2pmap[j];
1850     if (proc == rank) {
1851       Crow = row;
1852 #if defined(PETSC_USE_CTABLE)
1853       row = rmap_loc[Crow - rstart]; /* row index of submat */
1854 #else
1855       row = rmap[row];
1856 #endif
1857       ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1858 
1859       if (!allcolumns) {
1860 #if defined(PETSC_USE_CTABLE)
1861         i = 0;
1862         k = 0;
1863         while (k < ncols && cols[k] < cstart) { /* bound k first to avoid reading past cols[] */
1864           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1865           if (tcol) {
1866             subcols[i]   = --tcol;
1867             subvals[i++] = vals[k];
1868           }
1869           k++;
1870         }
1871         while (k < ncols && cols[k] < cend) {
1872           tcol = cmap_loc[cols[k]-cstart];
1873           if (tcol) {
1874             subcols[i]   = --tcol;
1875             subvals[i++] = vals[k];
1876           }
1877           k++;
1878         }
1879         while (k<ncols) {
1880           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1881           if (tcol) {
1882             subcols[i]   = --tcol;
1883             subvals[i++] = vals[k];
1884           }
1885           k++;
1886         }
1887 #else
1888         i = 0;
1889         for (k=0; k<ncols; k++) {
1890           tcol = cmap[cols[k]];
1891           if (tcol) {
1892             subcols[i]   = --tcol;
1893             subvals[i++] = vals[k];
1894           }
1895         }
1896 #endif
1897         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1898       } else { /* allcolumns */
1899         ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
1900       }
1901       ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1902     }
1903   }
1904 
1905   if (scall == MAT_INITIAL_MATRIX) {
1906    ierr = PetscLogStagePop();CHKERRQ(ierr);
1907    ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr);
1908   }
1909 
1910   /* Now assemble the off-proc rows */
1911   for (i=0; i<nrqs; i++) { /* for each requested message */
1912     /* recv values from other processes */
1913     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1914     proc    = pa[idex];
1915     sbuf1_i = sbuf1[proc];
1916     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1917     ct1     = 2 + 1;
1918     ct2     = 0; /* count of received C->j */
1919     ct3     = 0; /* count of received C->j that will be inserted into submat */
1920     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1921     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1922     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1923 
1924     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1925     max1 = sbuf1_i[2];             /* num of rows */
1926     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1927       row = sbuf1_i[ct1]; /* row index of submat */
1928       if (!allcolumns) {
1929         idex = 0;
1930         if (scall == MAT_INITIAL_MATRIX) {
1931           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1932           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1933 #if defined(PETSC_USE_CTABLE)
1934             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1935               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1936             } else {
1937               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1938             }
1939 #else
1940             tcol = cmap[rbuf3_i[ct2]];
1941 #endif
1942             if (tcol) {
1943               subcols[idex]   = --tcol;
1944               subvals[idex++] = rbuf4_i[ct2];
1945 
1946               /* We receive the column indices of an entire row of C, but only a subset needs to be
1947                inserted into submat; for reuse, we replace received C->j with the positions to insert into submat */
1948               rbuf3_i[ct3++] = ct2;
1949             }
1950           }
1951           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1952 
1953         } else { /* scall == MAT_REUSE_MATRIX */
1954           submat = submats[0];
1955           subc   = (Mat_SeqAIJ*)submat->data;
1956 
1957           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1958           for (l=0; l<nnz; l++) {
1959             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1960             subvals[idex++] = rbuf4_i[ct2];
1961           }
1962 
1963           bj = subc->j + subc->i[row];
1964           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1965         }
1966       } else { /* allcolumns */
1967         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1968         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1969         ct2 += nnz;
1970       }
1971     }
1972   }
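  /* Note on reuse: in the MAT_INITIAL_MATRIX pass above, rbuf3_i[] was overwritten
     (rbuf3_i[ct3++] = ct2) with the positions of the received entries that survive
     the column filter, so the MAT_REUSE_MATRIX pass gathers values directly from
     rbuf4_i[] without repeating any cmap lookups. */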
1973 
1974   /* sends of a->a are done */
1975   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1976   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1977 
1978   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1979   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1980   submats[0] = submat;
1981 
1982   /* Restore the indices */
1983   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1984   if (!allcolumns) {
1985     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1986   }
1987 
1988   /* Destroy allocated memory */
1989   for (i=0; i<nrqs; ++i) {
1990     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1991   }
1992   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1993   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1994   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1995 
1996   if (!allcolumns) {
1997     ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr);
1998   }
1999 
2000   if (scall == MAT_INITIAL_MATRIX) {
2001     ierr = PetscLogStagePop();CHKERRQ(ierr);
2002 
2003     ierr = PetscFree(lens);CHKERRQ(ierr);
2004     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2005     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2006   }
2007   PetscFunctionReturn(0);
2008 }
2009 
2010 #undef __FUNCT__
2011 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
2012 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2013 {
2014   PetscErrorCode ierr;
2015   PetscInt       ncol;
2016   PetscBool      colflag,allcolumns=PETSC_FALSE;
2017 
2018   PetscFunctionBegin;
2019   /* Allocate memory to hold all the submatrices */
2020   if (scall != MAT_REUSE_MATRIX) {
2021     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2022   }
2023 
2024   /* Check for special case: each processor gets entire matrix columns */
2025   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2026   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2027   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2028 
2029   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2030   PetscFunctionReturn(0);
2031 }
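/*
   Illustrative usage sketch (not part of the build): how a caller might reach this
   routine through the public MatGetSubMatrices() interface with one IS per process;
   m, n, and first are placeholder values.

     IS  isrow,iscol;
     Mat *submat;
     ierr = ISCreateStride(PETSC_COMM_SELF,m,first,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iscol);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ... use submat[0], then refresh values after C changes ...
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
*/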
2032 
2033 #undef __FUNCT__
2034 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
2035 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2036 {
2037   PetscErrorCode ierr;
2038   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2039   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2040 
2041   PetscFunctionBegin;
2042   /* Check for special case: each process requests a single IS */
2043   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2044     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2045     PetscFunctionReturn(0);
2046   }
2047 
  /* Check for special case: each process gets the entire matrix */
2048   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2049     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2050     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2051     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2052     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2053     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2054       wantallmatrix = PETSC_TRUE;
2055 
2056       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2057     }
2058   }
2059   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2060   if (twantallmatrix) {
2061     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2062     PetscFunctionReturn(0);
2063   }
2064 
2065   /* Allocate memory to hold all the submatrices */
2066   if (scall != MAT_REUSE_MATRIX) {
2067     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2068   }
2069 
2070   /* Check for special case: each processor gets entire matrix columns */
2071   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2072   for (i=0; i<ismax; i++) {
2073     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2074     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2075     if (colflag && ncol == C->cmap->N) {
2076       allcolumns[i] = PETSC_TRUE;
2077     } else {
2078       allcolumns[i] = PETSC_FALSE;
2079     }
2080   }
2081 
2082   /* Determine the number of stages needed to extract all the submatrices */
2083   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2084 
2085   /*
2086      Each stage will extract nmax submatrices.
2087      nmax is determined by the matrix column dimension.
2088      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2089   */
2090   if (!nmax) nmax = 1;
2091   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
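  /* Worked example (hypothetical sizes, 4-byte PetscInt): for C->cmap->N = 1000000,
     nmax = 20000000/(1000000*4) = 5; with ismax = 12 this yields
     nstages_local = 12/5 + 1 = 3 stages, extracting 5, 5, and 2 submatrices. */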
2092 
2093   /* Make sure every processor loops through the nstages */
2094   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2095 
2096   for (i=0,pos=0; i<nstages; i++) {
2097     if (pos+nmax <= ismax) max_no = nmax;
2098     else if (pos == ismax) max_no = 0;
2099     else                   max_no = ismax-pos;
2100     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2101     pos += max_no;
2102   }
2103 
2104   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2105   PetscFunctionReturn(0);
2106 }
2107 
2108 /* -------------------------------------------------------------------------*/
2109 #undef __FUNCT__
2110 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
2111 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2112 {
2113   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2114   Mat            A  = c->A;
2115   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2116   const PetscInt **icol,**irow;
2117   PetscInt       *nrow,*ncol,start;
2118   PetscErrorCode ierr;
2119   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2120   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2121   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2122   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2123   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2124 #if defined(PETSC_USE_CTABLE)
2125   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2126 #else
2127   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2128 #endif
2129   const PetscInt *irow_i;
2130   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2131   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2132   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2133   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2134   MPI_Status     *r_status3,*r_status4,*s_status4;
2135   MPI_Comm       comm;
2136   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2137   PetscMPIInt    *onodes1,*olengths1;
2138   PetscMPIInt    idex,idex2,end;
2139 
2140   PetscFunctionBegin;
2141   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2142   tag0 = ((PetscObject)C)->tag;
2143   size = c->size;
2144   rank = c->rank;
2145 
2146   /* Get some new tags to keep the communication clean */
2147   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2148   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2149   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2150 
2151   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2152 
2153   for (i=0; i<ismax; i++) {
2154     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2155     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2156     if (allcolumns[i]) {
2157       icol[i] = NULL;
2158       ncol[i] = C->cmap->N;
2159     } else {
2160       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2161       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2162     }
2163   }
2164 
2165   /* evaluate communication - message to whom, length of each message, and buffer space
2166      required. Based on this, buffers are allocated and data is copied into them */
2167   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2168   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2169   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2170   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2171   for (i=0; i<ismax; i++) {
2172     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2173     jmax   = nrow[i];
2174     irow_i = irow[i];
2175     for (j=0; j<jmax; j++) {
2176       l   = 0;
2177       row = irow_i[j];
2178       while (row >= C->rmap->range[l+1]) l++;
2179       proc = l;
2180       w4[proc]++;
2181     }
2182     for (j=0; j<size; j++) {
2183       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2184     }
2185   }
2186 
2187   nrqs     = 0;              /* no of outgoing messages */
2188   msz      = 0;              /* total mesg length (for all procs) */
2189   w1[rank] = 0;              /* no mesg sent to self */
2190   w3[rank] = 0;
2191   for (i=0; i<size; i++) {
2192     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2193   }
2194   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2195   for (i=0,j=0; i<size; i++) {
2196     if (w1[i]) { pa[j] = i; j++; }
2197   }
2198 
2199   /* Each message would have a header = 1 + 2*(no of IS) + data */
2200   for (i=0; i<nrqs; i++) {
2201     j      = pa[i];
2202     w1[j] += w2[j] + 2* w3[j];
2203     msz   += w1[j];
2204   }
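  /* Layout sketch (illustrative): a message to proc p carrying 2 rows of IS 0 and
     3 rows of IS 4 is
       [ 2, 0, 2, 4, 3, <2 rows of IS 0>, <3 rows of IS 4> ]
     i.e. the number of IS blocks, then (is_no, nrows) pairs, then the row data. */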
2205   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2206 
2207   /* Determine the number of messages to expect, their lengths, and their sources (from-ids) */
2208   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2209   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2210 
2211   /* Now post the Irecvs corresponding to these messages */
2212   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2213 
2214   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2215   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2216 
2217   /* Allocate Memory for outgoing messages */
2218   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2219   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2220   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2221 
2222   {
2223     PetscInt *iptr = tmp;
2224     k    = 0;
2225     for (i=0; i<nrqs; i++) {
2226       j        = pa[i];
2227       iptr    += k;
2228       sbuf1[j] = iptr;
2229       k        = w1[j];
2230     }
2231   }
2232 
2233   /* Form the outgoing messages */
2234   /* Initialize the header space */
2235   for (i=0; i<nrqs; i++) {
2236     j           = pa[i];
2237     sbuf1[j][0] = 0;
2238     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2239     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2240   }
2241 
2242   /* Parse the isrow and copy data into outbuf */
2243   for (i=0; i<ismax; i++) {
2244     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2245     irow_i = irow[i];
2246     jmax   = nrow[i];
2247     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2248       l   = 0;
2249       row = irow_i[j];
2250       while (row >= C->rmap->range[l+1]) l++;
2251       proc = l;
2252       if (proc != rank) { /* copy to the outgoing buf */
2253         ctr[proc]++;
2254         *ptr[proc] = row;
2255         ptr[proc]++;
2256       }
2257     }
2258     /* Update the headers for the current IS */
2259     for (j=0; j<size; j++) { /* Could optimize this loop too */
2260       if ((ctr_j = ctr[j])) {
2261         sbuf1_j        = sbuf1[j];
2262         k              = ++sbuf1_j[0];
2263         sbuf1_j[2*k]   = ctr_j;
2264         sbuf1_j[2*k-1] = i;
2265       }
2266     }
2267   }
2268 
2269   /* Now post the sends */
2270   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2271   for (i=0; i<nrqs; ++i) {
2272     j    = pa[i];
2273     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2274   }
2275 
2276   /* Post Receives to capture the buffer size */
2277   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2278   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2279   rbuf2[0] = tmp + msz;
2280   for (i=1; i<nrqs; ++i) {
2281     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2282   }
2283   for (i=0; i<nrqs; ++i) {
2284     j    = pa[i];
2285     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2286   }
2287 
2288   /* Send to other procs the buf size they should allocate */
2289 
2290 
2291   /* Receive messages */
2292   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2293   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2294   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2295   {
2296     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2297     PetscInt   *sbuf2_i;
2298 
2299     for (i=0; i<nrqr; ++i) {
2300       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2301 
2302       req_size[idex] = 0;
2303       rbuf1_i        = rbuf1[idex];
2304       start          = 2*rbuf1_i[0] + 1;
2305       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2306       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2307       sbuf2_i        = sbuf2[idex];
2308       for (j=start; j<end; j++) {
2309         id              = rbuf1_i[j] - rstart;
2310         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2311         sbuf2_i[j]      = ncols;
2312         req_size[idex] += ncols;
2313       }
2314       req_source[idex] = r_status1[i].MPI_SOURCE;
2315       /* form the header */
2316       sbuf2_i[0] = req_size[idex];
2317       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2318 
2319       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2323   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2324 
2325   /* recv buffer sizes */
2326   /* Receive messages */
2327 
2328   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2329   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2330   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2331   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2332   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2333 
2334   for (i=0; i<nrqs; ++i) {
2335     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2336     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2337     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2338     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2339     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2340   }
2341   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2342   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2343 
2344   /* Wait on sends1 and sends2 */
2345   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2346   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2347 
2348   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2349   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2350   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2351   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2352   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2353   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2354 
2355   /* Now allocate buffers for a->j, and send them off */
2356   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2357   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2358   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2359   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2360 
2361   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2362   {
2363     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2364     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2365     PetscInt cend = C->cmap->rend;
2366     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2367 
2368     for (i=0; i<nrqr; i++) {
2369       rbuf1_i   = rbuf1[i];
2370       sbuf_aj_i = sbuf_aj[i];
2371       ct1       = 2*rbuf1_i[0] + 1;
2372       ct2       = 0;
2373       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2374         kmax = rbuf1[i][2*j];
2375         for (k=0; k<kmax; k++,ct1++) {
2376           row    = rbuf1_i[ct1] - rstart;
2377           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2378           ncols  = nzA + nzB;
2379           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2380 
2381           /* load the column indices for this row into cols */
2382           cols = sbuf_aj_i + ct2;
2383 
2384           lwrite = 0;
2385           for (l=0; l<nzB; l++) {
2386             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2387           }
2388           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2389           for (l=0; l<nzB; l++) {
2390             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2391           }
2392 
2393           ct2 += ncols;
2394         }
2395       }
2396       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2397     }
2398   }
2399   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2400   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2401 
2402   /* Allocate buffers for a->a, and send them off */
2403   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2404   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2405   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2406   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2407 
2408   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2409   {
2410     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2411     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2412     PetscInt    cend   = C->cmap->rend;
2413     PetscInt    *b_j   = b->j;
2414     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2415 
2416     for (i=0; i<nrqr; i++) {
2417       rbuf1_i   = rbuf1[i];
2418       sbuf_aa_i = sbuf_aa[i];
2419       ct1       = 2*rbuf1_i[0]+1;
2420       ct2       = 0;
2421       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2422         kmax = rbuf1_i[2*j];
2423         for (k=0; k<kmax; k++,ct1++) {
2424           row    = rbuf1_i[ct1] - rstart;
2425           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2426           ncols  = nzA + nzB;
2427           cworkB = b_j + b_i[row];
2428           vworkA = a_a + a_i[row];
2429           vworkB = b_a + b_i[row];
2430 
2431           /* load the column values for this row into vals */
2432           vals = sbuf_aa_i+ct2;
2433 
2434           lwrite = 0;
2435           for (l=0; l<nzB; l++) {
2436             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2437           }
2438           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2439           for (l=0; l<nzB; l++) {
2440             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2441           }
2442 
2443           ct2 += ncols;
2444         }
2445       }
2446       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2447     }
2448   }
2449   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2450   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2451   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2452   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2453 
2454   /* Form the matrix */
2455   /* create col map: global col of C -> local col of submatrices */
2456   {
2457     const PetscInt *icol_i;
2458 #if defined(PETSC_USE_CTABLE)
2459     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2460     for (i=0; i<ismax; i++) {
2461       if (!allcolumns[i]) {
2462         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2463 
2464         jmax   = ncol[i];
2465         icol_i = icol[i];
2466         cmap_i = cmap[i];
2467         for (j=0; j<jmax; j++) {
2468           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2469         }
2470       } else {
2471         cmap[i] = NULL;
2472       }
2473     }
2474 #else
2475     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2476     for (i=0; i<ismax; i++) {
2477       if (!allcolumns[i]) {
2478         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2479         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2480         jmax   = ncol[i];
2481         icol_i = icol[i];
2482         cmap_i = cmap[i];
2483         for (j=0; j<jmax; j++) {
2484           cmap_i[icol_i[j]] = j+1;
2485         }
2486       } else {
2487         cmap[i] = NULL;
2488       }
2489     }
2490 #endif
2491   }
2492 
2493   /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2494   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2495   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2496   if (ismax) {
2497     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2498     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2499   }
2500   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2501 
2502   /* Update lens from local data */
2503   for (i=0; i<ismax; i++) {
2504     jmax = nrow[i];
2505     if (!allcolumns[i]) cmap_i = cmap[i];
2506     irow_i = irow[i];
2507     lens_i = lens[i];
2508     for (j=0; j<jmax; j++) {
2509       l   = 0;
2510       row = irow_i[j];
2511       while (row >= C->rmap->range[l+1]) l++;
2512       proc = l;
2513       if (proc == rank) {
2514         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2515         if (!allcolumns[i]) {
2516           for (k=0; k<ncols; k++) {
2517 #if defined(PETSC_USE_CTABLE)
2518             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2519 #else
2520             tcol = cmap_i[cols[k]];
2521 #endif
2522             if (tcol) lens_i[j]++;
2523           }
2524         } else { /* allcolumns */
2525           lens_i[j] = ncols;
2526         }
2527         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2528       }
2529     }
2530   }
2531 
2532   /* Create row map: global row of C -> local row of submatrices */
2533 #if defined(PETSC_USE_CTABLE)
2534   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2535   for (i=0; i<ismax; i++) {
2536     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2537     irow_i = irow[i];
2538     jmax   = nrow[i];
2539     for (j=0; j<jmax; j++) {
2540       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2541     }
2542   }
2543 #else
2544   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2545   if (ismax) {
2546     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2547     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2548   }
2549   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2550   for (i=0; i<ismax; i++) {
2551     rmap_i = rmap[i];
2552     irow_i = irow[i];
2553     jmax   = nrow[i];
2554     for (j=0; j<jmax; j++) {
2555       rmap_i[irow_i[j]] = j;
2556     }
2557   }
2558 #endif
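  /* The plain-array rmap costs ismax*C->rmap->N integers per process; the
     PETSC_USE_CTABLE variant avoids this by hashing only the requested rows, at the
     price of a table lookup per row. */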
2559 
2560   /* Update lens from off-proc data */
2561   {
2562     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2563 
2564     for (tmp2=0; tmp2<nrqs; tmp2++) {
2565       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2566       idex    = pa[idex2];
2567       sbuf1_i = sbuf1[idex];
2568       jmax    = sbuf1_i[0];
2569       ct1     = 2*jmax+1;
2570       ct2     = 0;
2571       rbuf2_i = rbuf2[idex2];
2572       rbuf3_i = rbuf3[idex2];
2573       for (j=1; j<=jmax; j++) {
2574         is_no  = sbuf1_i[2*j-1];
2575         max1   = sbuf1_i[2*j];
2576         lens_i = lens[is_no];
2577         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2578         rmap_i = rmap[is_no];
2579         for (k=0; k<max1; k++,ct1++) {
2580 #if defined(PETSC_USE_CTABLE)
2581           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2582           row--;
2583           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2584 #else
2585           row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2586 #endif
2587           max2 = rbuf2_i[ct1];
2588           for (l=0; l<max2; l++,ct2++) {
2589             if (!allcolumns[is_no]) {
2590 #if defined(PETSC_USE_CTABLE)
2591               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2592 #else
2593               tcol = cmap_i[rbuf3_i[ct2]];
2594 #endif
2595               if (tcol) lens_i[row]++;
2596             } else { /* allcolumns */
2597               lens_i[row]++; /* lens_i[row] += max2 ? */
2598             }
2599           }
2600         }
2601       }
2602     }
2603   }
2604   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2605   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2606   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2607   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2608   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2609 
2610   /* Create the submatrices */
2611   if (scall == MAT_REUSE_MATRIX) {
2612     PetscBool flag;
2613 
2614     /*
2615         Assumes new rows are the same length as the old rows, hence bug!
2616     */
2617     for (i=0; i<ismax; i++) {
2618       mat = (Mat_SeqAIJ*)(submats[i]->data);
2619       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2620       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2621       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2622       /* Initialize matrix as if empty */
2623       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2624 
2625       submats[i]->factortype = C->factortype;
2626     }
2627   } else {
2628     for (i=0; i<ismax; i++) {
2629       PetscInt rbs,cbs;
2630       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2631       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2632 
2633       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2634       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2635 
2636       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2637       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2638       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2639     }
2640   }
2641 
2642   /* Assemble the matrices */
2643   /* First assemble the local rows */
2644   {
2645     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2646     PetscScalar *imat_a;
2647 
2648     for (i=0; i<ismax; i++) {
2649       mat       = (Mat_SeqAIJ*)submats[i]->data;
2650       imat_ilen = mat->ilen;
2651       imat_j    = mat->j;
2652       imat_i    = mat->i;
2653       imat_a    = mat->a;
2654 
2655       if (!allcolumns[i]) cmap_i = cmap[i];
2656       rmap_i = rmap[i];
2657       irow_i = irow[i];
2658       jmax   = nrow[i];
2659       for (j=0; j<jmax; j++) {
2660         l   = 0;
2661         row = irow_i[j];
2662         while (row >= C->rmap->range[l+1]) l++;
2663         proc = l;
2664         if (proc == rank) {
2665           old_row = row;
2666 #if defined(PETSC_USE_CTABLE)
2667           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2668           row--;
2669 #else
2670           row = rmap_i[row];
2671 #endif
2672           ilen_row = imat_ilen[row];
2673           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2674           mat_i    = imat_i[row];
2675           mat_a    = imat_a + mat_i;
2676           mat_j    = imat_j + mat_i;
2677           if (!allcolumns[i]) {
2678             for (k=0; k<ncols; k++) {
2679 #if defined(PETSC_USE_CTABLE)
2680               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2681 #else
2682               tcol = cmap_i[cols[k]];
2683 #endif
2684               if (tcol) {
2685                 *mat_j++ = tcol - 1;
2686                 *mat_a++ = vals[k];
2687                 ilen_row++;
2688               }
2689             }
2690           } else { /* allcolumns */
2691             for (k=0; k<ncols; k++) {
2692               *mat_j++ = cols[k];  /* global col index! */
2693               *mat_a++ = vals[k];
2694               ilen_row++;
2695             }
2696           }
2697           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2698 
2699           imat_ilen[row] = ilen_row;
2700         }
2701       }
2702     }
2703   }
2704 
2705   /* Now assemble the off-proc rows */
2706   {
2707     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2708     PetscInt    *imat_j,*imat_i;
2709     PetscScalar *imat_a,*rbuf4_i;
2710 
2711     for (tmp2=0; tmp2<nrqs; tmp2++) {
2712       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2713       idex    = pa[idex2];
2714       sbuf1_i = sbuf1[idex];
2715       jmax    = sbuf1_i[0];
2716       ct1     = 2*jmax + 1;
2717       ct2     = 0;
2718       rbuf2_i = rbuf2[idex2];
2719       rbuf3_i = rbuf3[idex2];
2720       rbuf4_i = rbuf4[idex2];
2721       for (j=1; j<=jmax; j++) {
2722         is_no     = sbuf1_i[2*j-1];
2723         rmap_i    = rmap[is_no];
2724         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2725         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2726         imat_ilen = mat->ilen;
2727         imat_j    = mat->j;
2728         imat_i    = mat->i;
2729         imat_a    = mat->a;
2730         max1      = sbuf1_i[2*j];
2731         for (k=0; k<max1; k++,ct1++) {
2732           row = sbuf1_i[ct1];
2733 #if defined(PETSC_USE_CTABLE)
2734           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2735           row--;
2736 #else
2737           row = rmap_i[row];
2738 #endif
2739           ilen  = imat_ilen[row];
2740           mat_i = imat_i[row];
2741           mat_a = imat_a + mat_i;
2742           mat_j = imat_j + mat_i;
2743           max2  = rbuf2_i[ct1];
2744           if (!allcolumns[is_no]) {
2745             for (l=0; l<max2; l++,ct2++) {
2746 
2747 #if defined(PETSC_USE_CTABLE)
2748               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2749 #else
2750               tcol = cmap_i[rbuf3_i[ct2]];
2751 #endif
2752               if (tcol) {
2753                 *mat_j++ = tcol - 1;
2754                 *mat_a++ = rbuf4_i[ct2];
2755                 ilen++;
2756               }
2757             }
2758           } else { /* allcolumns */
2759             for (l=0; l<max2; l++,ct2++) {
2760               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2761               *mat_a++ = rbuf4_i[ct2];
2762               ilen++;
2763             }
2764           }
2765           imat_ilen[row] = ilen;
2766         }
2767       }
2768     }
2769   }
2770 
2771   /* sort the rows */
2772   {
2773     PetscInt    *imat_ilen,*imat_j,*imat_i;
2774     PetscScalar *imat_a;
2775 
2776     for (i=0; i<ismax; i++) {
2777       mat       = (Mat_SeqAIJ*)submats[i]->data;
2778       imat_j    = mat->j;
2779       imat_i    = mat->i;
2780       imat_a    = mat->a;
2781       imat_ilen = mat->ilen;
2782 
2783       if (allcolumns[i]) continue;
2784       jmax = nrow[i];
2785       for (j=0; j<jmax; j++) {
2786         PetscInt ilen;
2787 
2788         mat_i = imat_i[j];
2789         mat_a = imat_a + mat_i;
2790         mat_j = imat_j + mat_i;
2791         ilen  = imat_ilen[j];
2792         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2793       }
2794     }
2795   }
2796 
2797   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2798   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2799   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2800   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2801   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2802 
2803   /* Restore the indices */
2804   for (i=0; i<ismax; i++) {
2805     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2806     if (!allcolumns[i]) {
2807       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2808     }
2809   }
2810 
2811   /* Destroy allocated memory */
2812   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2813   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2814   ierr = PetscFree(pa);CHKERRQ(ierr);
2815 
2816   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2817   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2818   for (i=0; i<nrqr; ++i) {
2819     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2820   }
2821   for (i=0; i<nrqs; ++i) {
2822     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2823     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2824   }
2825 
2826   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2827   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2828   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2829   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2830   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2831   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2832   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2833 
2834 #if defined(PETSC_USE_CTABLE)
2835   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2836 #else
2837   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2838 #endif
2839   ierr = PetscFree(rmap);CHKERRQ(ierr);
2840 
2841   for (i=0; i<ismax; i++) {
2842     if (!allcolumns[i]) {
2843 #if defined(PETSC_USE_CTABLE)
2844       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2845 #else
2846       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2847 #endif
2848     }
2849   }
2850   ierr = PetscFree(cmap);CHKERRQ(ierr);
2851   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2852   ierr = PetscFree(lens);CHKERRQ(ierr);
2853 
2854   for (i=0; i<ismax; i++) {
2855     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2856     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2857   }
2858   PetscFunctionReturn(0);
2859 }
2860 
2861 /*
2862  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2863  Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2864  of [0,m), the range of dcolemb lies in [0,n), and the range of ocolemb lies in [0,N-n).
2865  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2866  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2867  state, and needs to be "assembled" later by compressing B's column space.
2868 
2869  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2870  Following this call, C->A & C->B have been created, even if empty.
2871  */
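
/*
 A worked example of the "disassembled" column mapping performed below (the numbers are
 illustrative only): suppose N = 8 global columns and this rank owns [rstart,rend) = [3,5),
 so n = 2 and shift = rend-rstart = 2. B's compact off-diag column space is [0,N-n) = [0,6);
 a column col >= rstart is displaced by shift, so local columns {0,1,2,3,4,5} map to global
 columns {0,1,2,5,6,7}, skipping the locally owned block [3,5).
 */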
2872 #undef __FUNCT__
2873 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
2874 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2875 {
2876   /* If making this function public, change the error returned in this function away from _PLIB. */
2877   PetscErrorCode ierr;
2878   Mat_MPIAIJ     *aij;
2879   Mat_SeqAIJ     *Baij;
2880   PetscBool      seqaij,Bdisassembled;
2881   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2882   PetscScalar    v;
2883   const PetscInt *rowindices,*colindices;
2884 
2885   PetscFunctionBegin;
2886   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2887   if (A) {
2888     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2889     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2890     if (rowemb) {
2891       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2892       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2893     } else {
2894       if (C->rmap->n != A->rmap->n) {
2895 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2896       }
2897     }
2898     if (dcolemb) {
2899       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2900       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2901     } else {
2902       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2903     }
2904   }
2905   if (B) {
2906     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2907     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2908     if (rowemb) {
2909       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2910           if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2911     } else {
2912       if (C->rmap->n != B->rmap->n) {
2913 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2914       }
2915     }
2916     if (ocolemb) {
2917       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2918       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2919     } else {
2920       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2921     }
2922   }
2923 
2924   aij    = (Mat_MPIAIJ*)(C->data);
2925   if (!aij->A) {
2926     /* Mimic parts of MatMPIAIJSetPreallocation() */
2927     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2928     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2929     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2930     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2931     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2932   }
2933   if (A) {
2934     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2935   } else {
2936     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2937   }
2938   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2939     /*
2940       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2941       need to "disassemble" B -- convert it to using C's global indices.
2942       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2943 
2944       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2945       we call MatZeroEntries() on aij->B first, so there may be a bunch of explicitly stored zeros that, perhaps, could be compacted out.
2946 
2947       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2948       At least avoid calling MatSetValues() and the implied searches?
2949     */
2950 
2951     if (pattern == DIFFERENT_NONZERO_PATTERN) { /* B is non-NULL here, per the enclosing check */
2952 #if defined(PETSC_USE_CTABLE)
2953       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2954 #else
2955       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2956       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2957       if (aij->B) {
2958         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2959       }
2960 #endif
2961       ngcol = 0;
2962       if (aij->lvec) {
2963 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2964       }
2965       if (aij->garray) {
2966 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2967 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2968       }
2969       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2970       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2971     }
2972     if (aij->B && pattern == DIFFERENT_NONZERO_PATTERN) {
2973       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2974     }
2975     if (aij->B && pattern == SUBSET_NONZERO_PATTERN) {
2976       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2977     }
2978   }
2979   Bdisassembled = PETSC_FALSE;
2980   if (!aij->B) {
2981     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2982     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2983     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2984     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2985     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2986     Bdisassembled = PETSC_TRUE;
2987   }
2988   if (B) {
2989     Baij = (Mat_SeqAIJ*)(B->data);
2990     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2991       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2992       for (i=0; i<B->rmap->n; i++) {
2993 	nz[i] = Baij->i[i+1] - Baij->i[i];
2994       }
2995       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2996       ierr = PetscFree(nz);CHKERRQ(ierr);
2997     }
2998 
2999     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3000     shift = rend-rstart;
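    /* shift equals n, the width of the locally owned diagonal block: in C's global numbering, off-diag columns at or past rstart are displaced by n */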
3001     count = 0;
3002     rowindices = NULL;
3003     colindices = NULL;
3004     if (rowemb) {
3005       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3006     }
3007     if (ocolemb) {
3008       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3009     }
3010     for (i=0; i<B->rmap->n; i++) {
3011       PetscInt row;
3012       row = i;
3013       if (rowindices) row = rowindices[i];
3014       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3015 	col  = Baij->j[count];
3016 	if (colindices) col = colindices[col];
3017 	if (Bdisassembled && col>=rstart) col += shift;
3018 	v    = Baij->a[count];
3019 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3020 	++count;
3021       }
3022     }
3023     /* No assembly for aij->B is necessary. */
3024     /* FIXME: set aij->B's nonzerostate correctly. */
3025   } else {
3026     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3027   }
3028   C->preallocated  = PETSC_TRUE;
3029   C->was_assembled = PETSC_FALSE;
3030   C->assembled     = PETSC_FALSE;
3031    /*
3032       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3033       Furthermore, its nonzerostate will need to be based on the nonzerostates of aij->A and aij->B.
3034    */
3035   PetscFunctionReturn(0);
3036 }
3037 
3038 #undef __FUNCT__
3039 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
3040 /*
3041   B uses local indices: its column indices lie in [0,N-n) and must be interpreted using garray.
3042  */
3043 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3044 {
3045   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3046 
3047   PetscFunctionBegin;
3048   PetscValidPointer(A,2);
3049   PetscValidPointer(B,3);
3050   /* FIXME: make sure C is assembled */
3051   *A = aij->A;
3052   *B = aij->B;
3053   /* Note that we don't incref *A and *B, so be careful! */
3054   PetscFunctionReturn(0);
3055 }
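
/*
 A minimal sketch, with illustrative variable names, of interpreting B's compact column
 indices through garray after the call above (assumes C is an assembled MATMPIAIJ):

     Mat            A,B;
     Mat_SeqAIJ     *b;
     const PetscInt *garray;
     PetscInt       row,k,gcol;

     ierr   = MatGetSeqMats_MPIAIJ(C,&A,&B);CHKERRQ(ierr);
     garray = ((Mat_MPIAIJ*)C->data)->garray;
     b      = (Mat_SeqAIJ*)B->data;
     for (row=0; row<B->rmap->n; row++) {
       for (k=b->i[row]; k<b->i[row+1]; k++) {
         gcol = garray[b->j[k]];    <-- B's local column b->j[k] is global column gcol of C
       }
     }
 */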
3056 
3057 /*
3058   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3059   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3060 */
3061 #undef __FUNCT__
3062 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
3063 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3064                                             PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3065                                             PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3066                                             PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3067                                             PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3068 {
3069   PetscErrorCode ierr;
3070   PetscMPIInt    isize,flag;
3071   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3072   Mat            *A,*B;
3073   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3074 
3075   PetscFunctionBegin;
3076   if (!ismax) PetscFunctionReturn(0);
3077 
3078   for (i = 0, cismax = 0; i < ismax; ++i) {
3079     PetscMPIInt isize;
3080     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3081     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3082     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3083     if (isize > 1) ++cismax;
3084   }
3085   /*
3086      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3087      ispar is the maximum of cismax over C's comm: it is nonzero iff at least one rank has a parallel IS.
3088   */
3089   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3090   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3091     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3092     PetscFunctionReturn(0);
3093   }
3094 
3095   /* From here on at least one rank has a parallel IS, i.e., ispar != 0. */
3096   /*
3097     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3098     These are used to extract the off-diag portion of the resulting parallel matrix.
3099     The row IS for the off-diag portion is the same as for the diag portion,
3100     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3101   */
3102   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3103   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3104   for (i = 0, ii = 0; i < ismax; ++i) {
3105     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3106     if (isize > 1) {
3107       /*
3108 	 TODO: This is the part that's ***NOT SCALABLE***.
3109 	 To fix this we need to extract just the indices of C's nonzero columns
3110 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3111 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3112 	 to be done without serializing on the IS list, so, most likely, it is best
3113 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3114       */
3115       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3116       /* Now we have to
3117 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3118 	     were sorted on each rank, concatenated they might no longer be sorted;
3119 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3120 	     indices in the nondecreasing order to the original index positions.
3121 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3122       */
3123       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3124       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3125       ++ii;
3126     }
3127   }
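  /*
     For illustration (values hypothetical): if ciscol[ii] locally holds {12,5,9}, then
     ISSortPermutation() yields the permutation {1,2,0} -- the original positions of the
     entries taken in nondecreasing order -- and ISSort() reorders the IS itself to {5,9,12};
     for an IS that is already strictly increasing, the permutation comes back NULL.
  */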
3128   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3129   for (i = 0, ii = 0; i < ismax; ++i) {
3130     PetscInt       j,issize;
3131     const PetscInt *indices;
3132 
3133     /*
3134        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3135      */
3136     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3137     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3138     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3139     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3140     for (j = 1; j < issize; ++j) {
3141       if (indices[j] == indices[j-1]) {
3142 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3143       }
3144     }
3145     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3146 
3147 
3148     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3149     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3150     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3151     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3152     for (j = 1; j < issize; ++j) {
3153       if (indices[j-1] == indices[j]) {
3154 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155       }
3156     }
3157     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3158     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3159     if (isize > 1) {
3160       cisrow[ii] = isrow[i];
3161       ++ii;
3162     }
3163   }
3164   /*
3165     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3166     array of sequential matrices underlying the resulting parallel matrices.
3167     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3168     contain duplicates.
3169 
3170     There are as many diag matrices as there are original index sets. There are only as many parallel
3171     and off-diag matrices as there are parallel (comm size > 1) index sets.
3172 
3173     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3174     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3175       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3176       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3177       with A[i] and B[ii] extracted from the corresponding MPI submat.
3178     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3179       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3180       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3181       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used
3182       to permute their values into the A[i] and B[ii] sitting inside the corresponding submat.
3183     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3184       A[i], B[ii], AA[i] or BB[ii] matrices.
3185   */
3186   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3187   if (scall == MAT_INITIAL_MATRIX) {
3188     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3189     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3190   } else {
3191     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3192     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3193     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
3194     for (i = 0, ii = 0; i < ismax; ++i) {
3195       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3196       if (isize > 1) {
3197 	Mat AA,BB;
3198 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3199 	if (!isrow_p[i] && !iscol_p[i]) {
3200 	  A[i] = AA;
3201 	} else {
3202 	  /* TODO: extract A[i] composed on AA. */
3203 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3204 	}
3205 	if (!isrow_p[i] && !ciscol_p[ii]) {
3206 	  B[ii] = BB;
3207 	} else {
3208 	  /* TODO: extract B[ii] composed on BB. */
3209 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3210 	}
3211 	++ii;
3212       } else {
3213 	if (!isrow_p[i] && !iscol_p[i]) {
3214 	  A[i] = (*submat)[i];
3215 	} else {
3216 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3217 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3218 	}
3219       }
3220     }
3221   }
3222   /* Now obtain the sequential A and B submatrices separately. */
3223   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3224   /* TODO: find a better fix; for now, patch up B[i]'s local column size below before reusing it.
3225    * The local column size of B[i] can differ from the size of ciscol[i]:
3226    * B[i]'s column size is only finalized during MatAssembly, while
3227    * ciscol[i] is computed as the complement of iscol[i].
3228    * It would be better to keep only the indices of C's nonzero columns
3229    * when computing the complement ciscol[i].
3230    */
3231   if (scall == MAT_REUSE_MATRIX) {
3232     for (i=0; i<cismax; i++) {
3233       ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3234       B[i]->cmap->n = ciscol_localsize;
3235     }
3236   }
3237   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3238   /*
3239     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3240     matrices A & B have been extracted directly into the parallel matrices containing them, or
3241     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3242     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3243     to have the same sparsity pattern.
3244     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3245     must be constructed for C. This is done by setseqmat(s).
3246   */
3247   for (i = 0, ii = 0; i < ismax; ++i) {
3248     /*
3249        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3250        That way we can avoid sorting and computing permutations when reusing.
3251        To this end:
3252         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3253 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3254 	  be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3255     */
3256     MatStructure pattern;
3257     if (scall == MAT_INITIAL_MATRIX) {
3258       pattern = DIFFERENT_NONZERO_PATTERN;
3259     } else {
3260       pattern = SAME_NONZERO_PATTERN;
3261     }
3262     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3263     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3264     if (isize > 1) {
3265       if (scall == MAT_INITIAL_MATRIX) {
3266 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3267 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3268 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3269 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3270 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3271       }
3272       /*
3273 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3274       */
3275       {
3276 	Mat AA,BB;
3277 	AA = NULL;
3278 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3279 	BB = NULL;
3280 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3281 	if (AA || BB) {
3282 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3283 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3284 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3285 	}
3286 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3287 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3288 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3289 	}
3290 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3291 	  /* TODO: Compose BB for future use, if ((isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX). */
3292 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3293 	}
3294       }
3295       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3296       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3297       ++ii;
3298     } else { /* if (isize == 1) */
3299       if (scall == MAT_INITIAL_MATRIX) {
3300 	if (isrow_p[i] || iscol_p[i]) {
3301 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3302 	} else (*submat)[i] = A[i];
3303       }
3304       if (isrow_p[i] || iscol_p[i]) {
3305 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3306 	/* Otherwise A is extracted straight into (*submat)[i]. */
3307 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3308 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3309       }
3310     }
3311     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3312     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3313   }
3314   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3315   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3316   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3317   ierr = PetscFree(A);CHKERRQ(ierr);
3318   ierr = PetscFree(B);CHKERRQ(ierr);
3319   PetscFunctionReturn(0);
3320 }
3321 
3322 
3323 
3324 #undef __FUNCT__
3325 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
3326 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3327 {
3328   PetscErrorCode ierr;
3329 
3330   PetscFunctionBegin;
3331   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3332   PetscFunctionReturn(0);
3333 }
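
/*
 A minimal usage sketch (hypothetical caller, error checking abbreviated), assuming the
 dispatch above is reached through the public MatGetSubMatricesMPI() interface of this
 PETSc version:

     Mat C;             MPIAIJ matrix, created and assembled elsewhere
     IS  isrow,iscol;   index sets that may live on a subcommunicator of C's comm
     Mat *submat;

     ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ... use submat[0]; it is parallel iff isrow's communicator has size > 1 ...
     ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
 */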
3334