xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision c10200c1442b553b7ad65c70101560db4fa22e78)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3    and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
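
/*
   Usage sketch (illustrative): a caller that wants to grow its index sets by two
   levels of overlap would typically do

      IS is[2];
      ...                                        build is[0], is[1]
      ierr = MatIncreaseOverlap(C,2,is,2);CHKERRQ(ierr);

   For a MATMPIAIJ matrix that call dispatches to MatIncreaseOverlap_MPIAIJ() above,
   which simply applies the one-level expansion ov times.
*/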
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
56 {
57   PetscErrorCode   ierr;
58   MPI_Comm         comm;
59   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
60   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
61   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
62   const PetscInt  *indices_i,**indices;
63   PetscLayout      rmap;
64   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
65   PetscSF          sf;
66   PetscSFNode     *remote;
67 
68   PetscFunctionBegin;
69   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
70   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
71   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
72   /* get row map to determine where rows should be going */
73   ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
74   /* retrieve IS data and put it all together so that we
75    * can optimize communication
76    */
77   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
78   for (i=0,tlength=0; i<nidx; i++){
79     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
80     tlength += length[i];
81     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
82   }
83   /* find these rows on remote processors */
84   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
85   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
86   nrrows = 0;
87   for (i=0; i<nidx; i++){
88     length_i     = length[i];
89     indices_i    = indices[i];
90     for (j=0; j<length_i; j++){
91       owner = -1;
92       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
93       /* remote processors */
94       if (owner != rank){
95         tosizes_temp[owner]++; /* number of rows to owner */
96         rrow_ranks[nrrows]  = owner; /* processor */
97         rrow_isids[nrrows]   = i; /* is id */
98         remoterows[nrrows++] = indices_i[j]; /* row */
99       }
100     }
101     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
102   }
103   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
104   /* test whether we need to exchange messages;
105    * generally speaking, no data exchange is needed
106    * when the overlap is 1
107    */
108   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
109   /* there are no messages to exchange;
110    * this usually corresponds to overlap 1
111    */
112   if (!reducednrrows){
113     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
114     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
115     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118   nto = 0;
119   /* send sizes and ranks for building a two-sided communication */
120   for (i=0; i<size; i++){
121    if (tosizes_temp[i]){
122      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
123      tosizes_temp[i] = nto; /* a map from processor to index */
124      toranks[nto++]  = i; /* processor */
125    }
126   }
127   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
128   for (i=0; i<nto; i++){
129     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
130     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
131   }
132   /* send information to other processors */
133   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
134   nrecvrows = 0;
135   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
136   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
137   nrecvrows = 0;
138   for (i=0; i<nfrom; i++){
139     for (j=0; j<fromsizes[2*i]; j++){
140       remote[nrecvrows].rank    = fromranks[i];
141       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
142     }
143   }
144   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
145   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
146   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
147   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
148   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
149   /* message pairs <IS id, row> */
150   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
151   for (i=0; i<nrrows; i++){
152     owner = rrow_ranks[i]; /* processor */
153     j     = tosizes_temp[owner]; /* index */
154     todata[toffsets[j]++] = rrow_isids[i];
155     todata[toffsets[j]++] = remoterows[i];
156   }
157   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
158   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
159   ierr = PetscFree(toffsets);CHKERRQ(ierr);
160   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
162   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
163   /* send rows belonging to remote processors so that we can get the overlapping data back */
164   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
165   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
166   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
167   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
168   ierr = PetscFree(fromranks);CHKERRQ(ierr);
169   nrecvrows = 0;
170   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
171   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
172   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
173   nrecvrows = 0;
174   for (i=0; i<nto; i++){
175     for (j=0; j<tosizes[2*i]; j++){
176       remote[nrecvrows].rank    = toranks[i];
177       remote[nrecvrows++].index = tosizes[2*i+1]+j;
178     }
179   }
180   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
181   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
182   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
183   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
184   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
185   /* overlap communication and computation */
186   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
187   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
188   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
189   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
190   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
191   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
192   ierr = PetscFree(toranks);CHKERRQ(ierr);
193   ierr = PetscFree(tosizes);CHKERRQ(ierr);
194   ierr = PetscFree(todata);CHKERRQ(ierr);
195   PetscFunctionReturn(0);
196 }
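
/*
   Illustrative example (arbitrary values): the first exchange above ships pairs
   <IS id, row>.  If this rank needs remote row 42 for is[0] and remote rows 7 and 9
   for is[2], all owned by rank p, the buffer sent to rank p is

      { 0, 42,  2, 7,  2, 9 }

   The reply assembled by MatIncreaseOverlap_MPIAIJ_Send_Scalable() uses the triple
   format <IS id, number of rows, rows> noted in that routine.
*/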
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
201 {
202   PetscInt         *isz,isz_i,i,j,is_id, data_size;
203   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
204   const PetscInt   *indices_i_temp;
205   PetscErrorCode    ierr;
206 
207   PetscFunctionBegin;
208   max_lsize = 0;
209   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
212     max_lsize = lsize>max_lsize ? lsize:max_lsize;
213     isz[i]    = lsize;
214   }
215   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
216   for (i=0; i<nidx; i++){
217     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
218     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
219     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
220     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
221   }
222   /* retrieve information to get row id and its overlap */
223   for (i=0; i<nrecvs; ){
224     is_id      = recvdata[i++];
225     data_size  = recvdata[i++];
226     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
227     isz_i      = isz[is_id];
228     for (j=0; j< data_size; j++){
229       col = recvdata[i++];
230       indices_i[isz_i++] = col;
231     }
232     isz[is_id] = isz_i;
233   }
234   /* remove duplicate entries */
235   for (i=0; i<nidx; i++){
236     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
237     isz_i      = isz[i];
238     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
239     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
240   }
241   ierr = PetscFree(isz);CHKERRQ(ierr);
242   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
249 {
250   PetscLayout       rmap,cmap;
251   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
252   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
253   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
254   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
255   Mat               amat,bmat;
256   PetscMPIInt       rank;
257   PetscBool         done;
258   MPI_Comm          comm;
259   PetscErrorCode    ierr;
260 
261   PetscFunctionBegin;
262   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
263   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
264   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
265   /* Even if the matrix is symmetric, we treat it as nonsymmetric here */
266   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
267   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
268   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
269   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
270   /* total number of nonzero values is used to estimate the memory usage in the next step */
271   tnz  = ai[an]+bi[bn];
272   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
273   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
274   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
275   /* to find the longest message */
276   max_fszs = 0;
277   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
278   /* is there a better way to estimate the number of nonzeros in the matrix? */
279   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
280   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
281   rows_pos  = 0;
282   totalrows = 0;
283   for (i=0; i<nfrom; i++){
284     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
285     /* group data together */
286     for (j=0; j<fromsizes[2*i]; j+=2){
287       is_id                       = fromrows[rows_pos++];/* no of is */
288       rows_i                      = rows_data[is_id];
289       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
290     }
291     /* estimate the space needed so as to avoid multiple allocations */
292     for (j=0; j<nidx; j++){
293       indvc_ij = 0;
294       rows_i   = rows_data[j];
295       for (l=0; l<rows_pos_i[j]; l++){
296         row    = rows_i[l]-rstart;
297         start  = ai[row];
298         end    = ai[row+1];
299         for (k=start; k<end; k++){ /* Amat */
300           col = aj[k] + cstart;
301           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
302         }
303         start = bi[row];
304         end   = bi[row+1];
305         for (k=start; k<end; k++) { /* Bmat */
306           col = gcols[bj[k]];
307           indices_tmp[indvc_ij++] = col;
308         }
309       }
310       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
311       indv_counts[i*nidx+j] = indvc_ij;
312       totalrows            += indvc_ij;
313     }
314   }
315   /* message triples <IS id, number of rows, rows> */
316   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
317   totalrows = 0;
318   rows_pos  = 0;
319   /* repeat the grouping pass above, this time filling the send buffer */
320   for (i=0;i<nfrom;i++){
321     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
322     for (j=0; j<fromsizes[2*i]; j+=2){
323       is_id                       = fromrows[rows_pos++];
324       rows_i                      = rows_data[is_id];
325       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
326     }
327     /* add data  */
328     for (j=0; j<nidx; j++){
329       if (!indv_counts[i*nidx+j]) continue;
330       indvc_ij = 0;
331       sbdata[totalrows++] = j;
332       sbdata[totalrows++] = indv_counts[i*nidx+j];
333       sbsizes[2*i]       += 2;
334       rows_i              = rows_data[j];
335       for (l=0; l<rows_pos_i[j]; l++){
336         row   = rows_i[l]-rstart;
337         start = ai[row];
338         end   = ai[row+1];
339         for (k=start; k<end; k++){ /* Amat */
340           col = aj[k] + cstart;
341           indices_tmp[indvc_ij++] = col;
342         }
343         start = bi[row];
344         end   = bi[row+1];
345         for (k=start; k<end; k++) { /* Bmat */
346           col = gcols[bj[k]];
347           indices_tmp[indvc_ij++] = col;
348         }
349       }
350       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
351       sbsizes[2*i]  += indvc_ij;
352       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
353       totalrows += indvc_ij;
354     }
355   }
356   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
357   for (i=0; i<nfrom; i++){
358     offsets[i+1]   = offsets[i] + sbsizes[2*i];
359     sbsizes[2*i+1] = offsets[i];
360   }
361   ierr = PetscFree(offsets);CHKERRQ(ierr);
362   if (sbrowsizes) *sbrowsizes = sbsizes;
363   if (sbrows) *sbrows = sbdata;
364   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
365   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
366   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
367   PetscFunctionReturn(0);
368 }
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
373 {
374   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
375   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
376   PetscInt          lsize,lsize_tmp,owner;
377   PetscMPIInt       rank;
378   Mat               amat,bmat;
379   PetscBool         done;
380   PetscLayout       cmap,rmap;
381   MPI_Comm          comm;
382   PetscErrorCode    ierr;
383 
384   PetscFunctionBegin;
385   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
386   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
387   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
388   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
389   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
390   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
391   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
392   /* is this a safe way to compute the number of nonzero values? */
393   tnz  = ai[an]+bi[bn];
394   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
395   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
397   /* this estimates memory better than the old implementation,
398    * which used the global size of the matrix
399    */
400   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
401   for (i=0; i<nidx; i++) {
402     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
403     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
404     lsize_tmp = 0;
405     for (j=0; j<lsize; j++) {
406       owner = -1;
407       row   = indices[j];
408       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
409       if (owner != rank) continue;
410       /* local number */
411       row  -= rstart;
412       start = ai[row];
413       end   = ai[row+1];
414       for (k=start; k<end; k++) { /* Amat */
415         col = aj[k] + cstart;
416         indices_temp[lsize_tmp++] = col;
417       }
418       start = bi[row];
419       end   = bi[row+1];
420       for (k=start; k<end; k++) { /* Bmat */
421         col = gcols[bj[k]];
422         indices_temp[lsize_tmp++] = col;
423       }
424     }
425    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
426    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
427    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
428    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
429   }
430   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
431   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 
437 /*
438   Sample message format:
439   If a processor A wants processor B to process some elements corresponding
440   to index sets is[1] and is[5], the message has the form
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5]
450   mesg [n]  data(is[1])
451   -----------
452   mesg[n+1]
453   mesg[m]  data(is[5])
454   -----------
455 
456   Notes:
457   nrqs - no of requests sent (or to be sent out)
458   nrqr - no of requests received (which have to be or which have been processed)
459 */
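/*
   Worked example of the format above (illustrative, arbitrary values): a request
   for 3 rows of is[1] (rows 10,11,12) and 2 rows of is[5] (rows 40,41) is sent as

      { 2,  1, 3,  5, 2,  10, 11, 12,  40, 41 }

   i.e. the header (number of index sets, then for each IS its id and its length)
   followed by the row indices of each IS in order.
*/
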
460 #undef __FUNCT__
461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
463 {
464   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
465   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
466   const PetscInt **idx,*idx_i;
467   PetscInt       *n,**data,len;
468 #if defined(PETSC_USE_CTABLE)
469   PetscTable     *table_data,table_data_i;
470   PetscInt       *tdata,tcount,tcount_max;
471 #else
472   PetscInt       *data_i,*d_p;
473 #endif
474   PetscErrorCode ierr;
475   PetscMPIInt    size,rank,tag1,tag2;
476   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
477   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
478   PetscBT        *table;
479   MPI_Comm       comm;
480   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
481   MPI_Status     *s_status,*recv_status;
482   char           *t_p;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
486   size = c->size;
487   rank = c->rank;
488   M    = C->rmap->N;
489 
490   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
491   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
492 
493   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
494 
495   for (i=0; i<imax; i++) {
496     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
497     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
498   }
499 
500   /* evaluate communication - mesg to who, length of mesg, and buffer space
501      required. Based on this, buffers are allocated, and data copied into them  */
502   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
503   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
504   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
505   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
506   for (i=0; i<imax; i++) {
507     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
508     idx_i = idx[i];
509     len   = n[i];
510     for (j=0; j<len; j++) {
511       row = idx_i[j];
512       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
513       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
514       w4[proc]++;
515     }
516     for (j=0; j<size; j++) {
517       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
518     }
519   }
520 
521   nrqs     = 0;              /* no of outgoing messages */
522   msz      = 0;              /* total mesg length (for all procs) */
523   w1[rank] = 0;              /* no mesg sent to itself */
524   w3[rank] = 0;
525   for (i=0; i<size; i++) {
526     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
527   }
528   /* pa - is list of processors to communicate with */
529   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
530   for (i=0,j=0; i<size; i++) {
531     if (w1[i]) {pa[j] = i; j++;}
532   }
533 
534   /* Each message would have a header = 1 + 2*(no of IS) + data */
535   for (i=0; i<nrqs; i++) {
536     j      = pa[i];
537     w1[j] += w2[j] + 2*w3[j];
538     msz   += w1[j];
539   }
540 
541   /* Determine the number of messages to expect, their lengths, and their from-ids */
542   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
543   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
544 
545   /* Now post the Irecvs corresponding to these messages */
546   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
547 
548   /* Allocate Memory for outgoing messages */
549   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
550   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
551   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
552 
553   {
554     PetscInt *iptr = tmp,ict  = 0;
555     for (i=0; i<nrqs; i++) {
556       j         = pa[i];
557       iptr     +=  ict;
558       outdat[j] = iptr;
559       ict       = w1[j];
560     }
561   }
562 
563   /* Form the outgoing messages */
564   /* plug in the headers */
565   for (i=0; i<nrqs; i++) {
566     j            = pa[i];
567     outdat[j][0] = 0;
568     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
569     ptr[j]       = outdat[j] + 2*w3[j] + 1;
570   }
571 
572   /* Memory for doing local proc's work */
573   {
574 #if defined(PETSC_USE_CTABLE)
575     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
578     }
579     ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582     }
583 #else
584     PetscInt Mimax = 0,M_BPB_imax = 0;
585     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
586     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
587     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
589     for (i=0; i<imax; i++) {
590       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
591       data[i]  = d_p + M*i;
592     }
593 #endif
594   }
595 
596   /* Parse the IS and update local tables and the outgoing buf with the data */
597   {
598     PetscInt n_i,isz_i,*outdat_j,ctr_j;
599     PetscBT  table_i;
600 
601     for (i=0; i<imax; i++) {
602       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
603       n_i     = n[i];
604       table_i = table[i];
605       idx_i   = idx[i];
606 #if defined(PETSC_USE_CTABLE)
607       table_data_i = table_data[i];
608 #else
609       data_i  = data[i];
610 #endif
611       isz_i   = isz[i];
612       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
613         row  = idx_i[j];
614         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
615         if (proc != rank) { /* copy to the outgoing buffer */
616           ctr[proc]++;
617           *ptr[proc] = row;
618           ptr[proc]++;
619         } else if (!PetscBTLookupSet(table_i,row)) {
620 #if defined(PETSC_USE_CTABLE)
621           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
622 #else
623           data_i[isz_i] = row; /* Update the local table */
624 #endif
625           isz_i++;
626         }
627       }
628       /* Update the headers for the current IS */
629       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
630         if ((ctr_j = ctr[j])) {
631           outdat_j        = outdat[j];
632           k               = ++outdat_j[0];
633           outdat_j[2*k]   = ctr_j;
634           outdat_j[2*k-1] = i;
635         }
636       }
637       isz[i] = isz_i;
638     }
639   }
640 
641   /*  Now  post the sends */
642   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
643   for (i=0; i<nrqs; ++i) {
644     j    = pa[i];
645     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
646   }
647 
648   /* No longer need the original indices */
649   for (i=0; i<imax; ++i) {
650     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
651   }
652   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
653 
654   for (i=0; i<imax; ++i) {
655     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
656   }
657 
658   /* Do Local work */
659 #if defined(PETSC_USE_CTABLE)
660   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
661 #else
662   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
663 #endif
664 
665   /* Receive messages */
666   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
667   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
668 
669   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
670   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
671 
672   /* Phase 1 sends are complete - deallocate buffers */
673   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
674   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
675 
676   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
677   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
678   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
679   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
680   ierr = PetscFree(rbuf);CHKERRQ(ierr);
681 
682 
683   /* Send the data back */
684   /* Do a global reduction to know the buffer space req for incoming messages */
685   {
686     PetscMPIInt *rw1;
687 
688     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
689 
690     for (i=0; i<nrqr; ++i) {
691       proc = recv_status[i].MPI_SOURCE;
692 
693       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
694       rw1[proc] = isz1[i];
695     }
696     ierr = PetscFree(onodes1);CHKERRQ(ierr);
697     ierr = PetscFree(olengths1);CHKERRQ(ierr);
698 
699     /* Determine the number of messages to expect, their lengths, and their from-ids */
700     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
701     ierr = PetscFree(rw1);CHKERRQ(ierr);
702   }
703   /* Now post the Irecvs corresponding to these messages */
704   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
705 
706   /* Now  post the sends */
707   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
708   for (i=0; i<nrqr; ++i) {
709     j    = recv_status[i].MPI_SOURCE;
710     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
711   }
712 
713   /* receive work done on other processors */
714   {
715     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
716     PetscMPIInt idex;
717     PetscBT     table_i;
718     MPI_Status  *status2;
719 
720     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
721     for (i=0; i<nrqs; ++i) {
722       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
723       /* Process the message */
724       rbuf2_i = rbuf2[idex];
725       ct1     = 2*rbuf2_i[0]+1;
726       jmax    = rbuf2[idex][0];
727       for (j=1; j<=jmax; j++) {
728         max     = rbuf2_i[2*j];
729         is_no   = rbuf2_i[2*j-1];
730         isz_i   = isz[is_no];
731         table_i = table[is_no];
732 #if defined(PETSC_USE_CTABLE)
733         table_data_i = table_data[is_no];
734 #else
735         data_i  = data[is_no];
736 #endif
737         for (k=0; k<max; k++,ct1++) {
738           row = rbuf2_i[ct1];
739           if (!PetscBTLookupSet(table_i,row)) {
740 #if defined(PETSC_USE_CTABLE)
741             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
742 #else
743             data_i[isz_i] = row;
744 #endif
745             isz_i++;
746           }
747         }
748         isz[is_no] = isz_i;
749       }
750     }
751 
752     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
753     ierr = PetscFree(status2);CHKERRQ(ierr);
754   }
755 
756 #if defined(PETSC_USE_CTABLE)
757   tcount_max = 0;
758   for (i=0; i<imax; ++i) {
759     table_data_i = table_data[i];
760     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
761     if (tcount_max < tcount) tcount_max = tcount;
762   }
763   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
764 #endif
765 
766   for (i=0; i<imax; ++i) {
767 #if defined(PETSC_USE_CTABLE)
768     PetscTablePosition tpos;
769     table_data_i = table_data[i];
770 
771     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
772     while (tpos) {
773       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
774       tdata[--j] = --k;
775     }
776     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
777 #else
778     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
779 #endif
780   }
781 
782   ierr = PetscFree(onodes2);CHKERRQ(ierr);
783   ierr = PetscFree(olengths2);CHKERRQ(ierr);
784 
785   ierr = PetscFree(pa);CHKERRQ(ierr);
786   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
787   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
790   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
791   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
792   ierr = PetscFree(s_status);CHKERRQ(ierr);
793   ierr = PetscFree(recv_status);CHKERRQ(ierr);
794   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
795   ierr = PetscFree(xdata);CHKERRQ(ierr);
796   ierr = PetscFree(isz1);CHKERRQ(ierr);
797 #if defined(PETSC_USE_CTABLE)
798   for (i=0; i<imax; i++) {
799     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
800   }
801   ierr = PetscFree(table_data);CHKERRQ(ierr);
802   ierr = PetscFree(tdata);CHKERRQ(ierr);
803   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
804 #else
805   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
806 #endif
807   PetscFunctionReturn(0);
808 }
809 
810 #undef __FUNCT__
811 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
812 /*
813    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
814        the work on the local processor.
815 
816      Inputs:
817       C      - the MATMPIAIJ matrix;
818       imax - total no of index sets processed at a time;
819       table  - an array of char - size = m bits.
820 
821      Output:
822       isz    - array containing the count of the solution elements corresponding
823                to each index set;
824       data or table_data  - pointer to the solutions
825 */
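/*
   Illustrative example (arbitrary values): if data[0] = {3,7} (two locally owned
   global rows) and isz[0] = 2 on entry, then on exit data[0] additionally lists
   every global column index appearing in rows 3 and 7 of the diagonal (A) and
   off-diagonal (B) blocks, isz[0] holds the new count, and the corresponding bits
   of table[0] are set so that duplicates are skipped.  With PETSC_USE_CTABLE the
   same information is kept in table_data[0] instead of data[0].
*/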
826 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
827 {
828   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
829   Mat        A  = c->A,B = c->B;
830   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
831   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
832   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
833   PetscBT    table_i;
834 #if defined(PETSC_USE_CTABLE)
835   PetscTable         table_data_i;
836   PetscErrorCode     ierr;
837   PetscTablePosition tpos;
838   PetscInt           tcount,*tdata;
839 #else
840   PetscInt           *data_i;
841 #endif
842 
843   PetscFunctionBegin;
844   rstart = C->rmap->rstart;
845   cstart = C->cmap->rstart;
846   ai     = a->i;
847   aj     = a->j;
848   bi     = b->i;
849   bj     = b->j;
850   garray = c->garray;
851 
852   for (i=0; i<imax; i++) {
853 #if defined(PETSC_USE_CTABLE)
854     /* copy existing entries of table_data_i into tdata[] */
855     table_data_i = table_data[i];
856     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
857     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
858 
859     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
860     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
861     while (tpos) {
862       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
863       tdata[--j] = --row;
864       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
865     }
866 #else
867     data_i  = data[i];
868 #endif
869     table_i = table[i];
870     isz_i   = isz[i];
871     max     = isz[i];
872 
873     for (j=0; j<max; j++) {
874 #if defined(PETSC_USE_CTABLE)
875       row   = tdata[j] - rstart;
876 #else
877       row   = data_i[j] - rstart;
878 #endif
879       start = ai[row];
880       end   = ai[row+1];
881       for (k=start; k<end; k++) { /* Amat */
882         val = aj[k] + cstart;
883         if (!PetscBTLookupSet(table_i,val)) {
884 #if defined(PETSC_USE_CTABLE)
885           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
886 #else
887           data_i[isz_i] = val;
888 #endif
889           isz_i++;
890         }
891       }
892       start = bi[row];
893       end   = bi[row+1];
894       for (k=start; k<end; k++) { /* Bmat */
895         val = garray[bj[k]];
896         if (!PetscBTLookupSet(table_i,val)) {
897 #if defined(PETSC_USE_CTABLE)
898           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
899 #else
900           data_i[isz_i] = val;
901 #endif
902           isz_i++;
903         }
904       }
905     }
906     isz[i] = isz_i;
907 
908 #if defined(PETSC_USE_CTABLE)
909     ierr = PetscFree(tdata);CHKERRQ(ierr);
910 #endif
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 #undef __FUNCT__
916 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
917 /*
918       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
919          and return the output
920 
921          Input:
922            C    - the matrix
923            nrqr - no of messages being processed.
924            rbuf - an array of pointers to the received requests
925 
926          Output:
927            xdata - array of messages to be sent back
928            isz1  - size of each message
929 
930   For better efficiency perhaps we should malloc each xdata[i] separately;
931 then if a realloc is required we need only copy the data for that one row
932 rather than all previous rows, as it is now where a single large chunk of
933 memory is used.
934 
935 */
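/*
   Illustrative example (arbitrary values): each reply xdata[i] mirrors the request
   header.  If the request covered is[1] and is[5] and the expansion produced 4 and
   3 column indices for them respectively, then xdata[i] is laid out as

      { 2,  1, 4,  5, 3,  c0, c1, c2, c3,  d0, d1, d2 }

   and isz1[i] = 12 is the total length of that reply.
*/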
936 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
937 {
938   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
939   Mat            A  = c->A,B = c->B;
940   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
941   PetscErrorCode ierr;
942   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
943   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
944   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
945   PetscInt       *rbuf_i,kmax,rbuf_0;
946   PetscBT        xtable;
947 
948   PetscFunctionBegin;
949   m      = C->rmap->N;
950   rstart = C->rmap->rstart;
951   cstart = C->cmap->rstart;
952   ai     = a->i;
953   aj     = a->j;
954   bi     = b->i;
955   bj     = b->j;
956   garray = c->garray;
957 
958 
959   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
960     rbuf_i =  rbuf[i];
961     rbuf_0 =  rbuf_i[0];
962     ct    += rbuf_0;
963     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
964   }
965 
966   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
967   else max1 = 1;
968   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
969   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
970   ++no_malloc;
971   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
972   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
973 
974   ct3 = 0;
975   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
976     rbuf_i =  rbuf[i];
977     rbuf_0 =  rbuf_i[0];
978     ct1    =  2*rbuf_0+1;
979     ct2    =  ct1;
980     ct3   += ct1;
981     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
982       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
983       oct2 = ct2;
984       kmax = rbuf_i[2*j];
985       for (k=0; k<kmax; k++,ct1++) {
986         row = rbuf_i[ct1];
987         if (!PetscBTLookupSet(xtable,row)) {
988           if (!(ct3 < mem_estimate)) {
989             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
990             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
991             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
992             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
993             xdata[0]     = tmp;
994             mem_estimate = new_estimate; ++no_malloc;
995             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
996           }
997           xdata[i][ct2++] = row;
998           ct3++;
999         }
1000       }
1001       for (k=oct2,max2=ct2; k<max2; k++) {
1002         row   = xdata[i][k] - rstart;
1003         start = ai[row];
1004         end   = ai[row+1];
1005         for (l=start; l<end; l++) {
1006           val = aj[l] + cstart;
1007           if (!PetscBTLookupSet(xtable,val)) {
1008             if (!(ct3 < mem_estimate)) {
1009               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1010               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1011               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1012               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1013               xdata[0]     = tmp;
1014               mem_estimate = new_estimate; ++no_malloc;
1015               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1016             }
1017             xdata[i][ct2++] = val;
1018             ct3++;
1019           }
1020         }
1021         start = bi[row];
1022         end   = bi[row+1];
1023         for (l=start; l<end; l++) {
1024           val = garray[bj[l]];
1025           if (!PetscBTLookupSet(xtable,val)) {
1026             if (!(ct3 < mem_estimate)) {
1027               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1028               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1029               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1030               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1031               xdata[0]     = tmp;
1032               mem_estimate = new_estimate; ++no_malloc;
1033               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1034             }
1035             xdata[i][ct2++] = val;
1036             ct3++;
1037           }
1038         }
1039       }
1040       /* Update the header*/
1041       xdata[i][2*j]   = ct2 - oct2; /* number of entries returned for this IS */
1042       xdata[i][2*j-1] = rbuf_i[2*j-1];
1043     }
1044     xdata[i][0] = rbuf_0;
1045     xdata[i+1]  = xdata[i] + ct2;
1046     isz1[i]     = ct2; /* size of each message */
1047   }
1048   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1049   ierr = PetscInfo3(C,"Allocated %D PetscInts, required %D PetscInts, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 /* -------------------------------------------------------------------------*/
1053 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1054 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1055 /*
1056     Every processor gets the entire matrix
1057 */
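/*
   Illustrative call pattern (a sketch, not a prescription): a first call with
   scall = MAT_INITIAL_MATRIX allocates *Bin and gathers the column indices of every
   row onto every process; a later call with scall = MAT_REUSE_MATRIX and
   flag = MAT_GET_VALUES refreshes only the numerical values of the gathered matrix.
*/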
1058 #undef __FUNCT__
1059 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
1060 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1061 {
1062   Mat            B;
1063   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1064   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1065   PetscErrorCode ierr;
1066   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1067   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1068   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1069   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1070 
1071   PetscFunctionBegin;
1072   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1073   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1074 
1075   if (scall == MAT_INITIAL_MATRIX) {
1076     /* ----------------------------------------------------------------
1077          Tell every processor the number of nonzeros per row
1078     */
1079     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1080     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1081       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1082     }
1083     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1084     for (i=0; i<size; i++) {
1085       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1086       displs[i]     = A->rmap->range[i];
1087     }
1088 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1089     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1090 #else
1091     sendcount = A->rmap->rend - A->rmap->rstart;
1092     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1093 #endif
1094     /* ---------------------------------------------------------------
1095          Create the sequential matrix of the same type as the local block diagonal
1096     */
1097     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1098     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1099     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1100     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1101     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1102     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1103     **Bin = B;
1104     b     = (Mat_SeqAIJ*)B->data;
1105 
1106     /*--------------------------------------------------------------------
1107        Copy my part of matrix column indices over
1108     */
1109     sendcount  = ad->nz + bd->nz;
1110     jsendbuf   = b->j + b->i[rstarts[rank]];
1111     a_jsendbuf = ad->j;
1112     b_jsendbuf = bd->j;
1113     n          = A->rmap->rend - A->rmap->rstart;
1114     cnt        = 0;
1115     for (i=0; i<n; i++) {
1116 
1117       /* put in lower diagonal portion */
1118       m = bd->i[i+1] - bd->i[i];
1119       while (m > 0) {
1120         /* is it above diagonal (in bd (compressed) numbering) */
1121         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1122         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1123         m--;
1124       }
1125 
1126       /* put in diagonal portion */
1127       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1128         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1129       }
1130 
1131       /* put in upper diagonal portion */
1132       while (m-- > 0) {
1133         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134       }
1135     }
1136     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1137 
1138     /*--------------------------------------------------------------------
1139        Gather all column indices to all processors
1140     */
1141     for (i=0; i<size; i++) {
1142       recvcounts[i] = 0;
1143       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1144         recvcounts[i] += lens[j];
1145       }
1146     }
1147     displs[0] = 0;
1148     for (i=1; i<size; i++) {
1149       displs[i] = displs[i-1] + recvcounts[i-1];
1150     }
1151 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1152     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1153 #else
1154     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1155 #endif
1156     /*--------------------------------------------------------------------
1157         Assemble the matrix into usable form (note numerical values not yet set)
1158     */
1159     /* set the b->ilen (length of each row) values */
1160     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1161     /* set the b->i indices */
1162     b->i[0] = 0;
1163     for (i=1; i<=A->rmap->N; i++) {
1164       b->i[i] = b->i[i-1] + lens[i-1];
1165     }
1166     ierr = PetscFree(lens);CHKERRQ(ierr);
1167     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1168     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1169 
1170   } else {
1171     B = **Bin;
1172     b = (Mat_SeqAIJ*)B->data;
1173   }
1174 
1175   /*--------------------------------------------------------------------
1176        Copy my part of matrix numerical values into the values location
1177   */
1178   if (flag == MAT_GET_VALUES) {
1179     sendcount = ad->nz + bd->nz;
1180     sendbuf   = b->a + b->i[rstarts[rank]];
1181     a_sendbuf = ad->a;
1182     b_sendbuf = bd->a;
1183     b_sendj   = bd->j;
1184     n         = A->rmap->rend - A->rmap->rstart;
1185     cnt       = 0;
1186     for (i=0; i<n; i++) {
1187 
1188       /* put in lower diagonal portion */
1189       m = bd->i[i+1] - bd->i[i];
1190       while (m > 0) {
1191         /* is it above diagonal (in bd (compressed) numbering) */
1192         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1193         sendbuf[cnt++] = *b_sendbuf++;
1194         m--;
1195         b_sendj++;
1196       }
1197 
1198       /* put in diagonal portion */
1199       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1200         sendbuf[cnt++] = *a_sendbuf++;
1201       }
1202 
1203       /* put in upper diagonal portion */
1204       while (m-- > 0) {
1205         sendbuf[cnt++] = *b_sendbuf++;
1206         b_sendj++;
1207       }
1208     }
1209     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1210 
1211     /* -----------------------------------------------------------------
1212        Gather all numerical values to all processors
1213     */
1214     if (!recvcounts) {
1215       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1216     }
1217     for (i=0; i<size; i++) {
1218       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1219     }
1220     displs[0] = 0;
1221     for (i=1; i<size; i++) {
1222       displs[i] = displs[i-1] + recvcounts[i-1];
1223     }
1224     recvbuf = b->a;
1225 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1226     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1227 #else
1228     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1229 #endif
1230   }  /* end of (flag == MAT_GET_VALUES) */
1231   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1232 
1233   if (A->symmetric) {
1234     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1235   } else if (A->hermitian) {
1236     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1237   } else if (A->structurally_symmetric) {
1238     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1239   }
1240   PetscFunctionReturn(0);
1241 }
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
1244 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1245 {
1246   PetscErrorCode ierr;
1247   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1248   Mat_SubMat     *submatj = c->submatis1;
1249   PetscInt       i;
1250 
1251   PetscFunctionBegin;
1252   ierr = submatj->destroy(C);CHKERRQ(ierr);
1253 
1254   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1255 
1256   for (i=0; i<submatj->nrqr; ++i) {
1257     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1258   }
1259   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1260 
1261   ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1262   ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1263 
1264   for (i=0; i<submatj->nrqs; ++i) {
1265     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1266   }
1267   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1268   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1269 
1270 #if defined(PETSC_USE_CTABLE)
1271   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1272   ierr = PetscFree2(submatj->cmap_loc,submatj->rmap_loc);CHKERRQ(ierr);
1273 #else
1274   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1275 #endif
1276 
1277   if (!submatj->allcolumns) {
1278 #if defined(PETSC_USE_CTABLE)
1279     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1280 #else
1281     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1282 #endif
1283   }
1284 
1285   ierr = PetscFree(submatj);CHKERRQ(ierr);
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 #undef __FUNCT__
1290 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
1291 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1292 {
1293   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1294   Mat            submat,A = c->A,B = c->B;
1295   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1296   PetscInt       *ai = a->i,*bi = b->i,nzA,nzB;
1297   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1298   const PetscInt *icol,*irow;
1299   PetscInt       nrow,ncol,start;
1300   PetscErrorCode ierr;
1301   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
1302   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1303   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1304   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1305   PetscInt       *lens,ncols,*cols,Crow;
1306 #if defined(PETSC_USE_CTABLE)
1307   PetscTable     cmap,rmap;
1308   PetscInt       *cmap_loc,*rmap_loc;
1309 #else
1310   PetscInt       *cmap,*rmap;
1311 #endif
1312   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1313   PetscInt       *cworkB,lwrite,*bj = b->j,*subcols;
1314   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1315   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1316   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1317   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1318   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1319   MPI_Comm       comm;
1320   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1321   PetscMPIInt    *onodes1,*olengths1,idex,end;
1322   Mat_SubMat     *smatis1;
1323 #if defined(PETSC_USE_LOG)
1324   PetscLogStage stage[5];
1325 #endif
1326 
1327   PetscFunctionBegin;
1328   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1329 
1330   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1331   size = c->size;
1332   rank = c->rank;
1333 
1334   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1335   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1336   if (allcolumns) {
1337     icol = NULL;
1338     ncol = C->cmap->N;
1339   } else {
1340     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1341     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1342   }
1343 
1344   if (scall == MAT_INITIAL_MATRIX) {
1345     PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp;
1346 
1347     ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr);
1348     ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr);
1349     ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr);
1350     ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr);
1351     ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr);
1352 
1353     ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr);
1354 
1355     /* Get some new tags to keep the communication clean */
1356     tag1 = ((PetscObject)C)->tag;
1357     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1358     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1359 
1360     /* evaluate communication - mesg to who, length of mesg, and buffer space
1361      required. Based on this, buffers are allocated, and data copied into them */
1362     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
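    /* w1[p] = number of rows requested from process p (later grown to include the header size),
       w2[p] = 1 if a message goes to p, w3[p] = number of index sets with rows owned by p (at most 1 here),
       w4[p] = per-IS scratch counter of rows owned by p */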
1363 
1364     for (j=0; j<nrow; j++) {
1365       proc = 0;
1366       row  = irow[j];
1367       while (row >= C->rmap->range[proc+1]) proc++;
1368       w4[proc]++;
1369     }
1370     for (j=0; j<size; j++) {
1371       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1372     }
1373 
1374     nrqs     = 0;              /* no of outgoing messages */
1375     msz      = 0;              /* total mesg length (for all procs) */
1376     w1[rank] = 0;              /* no mesg sent to self */
1377     w3[rank] = 0;
1378     for (i=0; i<size; i++) {
1379       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1380     }
1381 
1382     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1383     for (i=0,j=0; i<size; i++) {
1384       if (w1[i]) { pa[j] = i; j++; }
1385     }
1386 
1387     /* Each message would have a header = 1 + 2*(no of IS) + data */
1388     for (i=0; i<nrqs; i++) {
1389       proc      = pa[i];
1390       w1[proc] += w2[proc] + 2* w3[proc];
1391       msz      += w1[proc];
1392     }
1393     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1394 
1395     /* Determine nrqr, the number of messages to expect, their lengths, and the ranks they come from (from-ids) */
1396     /* if w2[i]=1, a message of length w1[i] will be sent to proc[i]; */
1397     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1398 
1399     /* nrqs: nsend; nrqr: nrecv; w1: send message length; onodes1: recv id; olengths1: recv message length */
1400     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1401 
1402     /* Now post the Irecvs corresponding to these messages */
1403     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1404 
1405     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1406     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1407 
1408     /* Allocate Memory for outgoing messages */
1409     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1410     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1411     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1412 
1413     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1414     iptr = tmp;
1415     k    = 0;
1416     for (i=0; i<nrqs; i++) {
1417       proc     = pa[i];
1418       iptr    += k;
1419       sbuf1[proc] = iptr;
1420       k           = w1[proc];
1421     }
1422 
1423     /* Form the outgoing messages */
1424     /* Initialize the header space */
1425     for (i=0; i<nrqs; i++) {
1426       proc           = pa[i];
1427       sbuf1[proc][0] = 0;
1428       ierr           = PetscMemzero(sbuf1[proc]+1,2*w3[proc]*sizeof(PetscInt));CHKERRQ(ierr);
1429       ptr[proc]      = sbuf1[proc] + 2*w3[proc] + 1;
1430     }
1431 
1432     /* Parse the isrow and copy data into outbuf */
1433     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1434     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1435       proc  = 0;
1436       row = irow[j];
1437       while (row >= C->rmap->range[proc+1]) proc++;
1438       if (proc != rank) { /* copy to the outgoing buf*/
1439         ctr[proc]++;
1440         *ptr[proc] = row;
1441         ptr[proc]++;
1442       }
1443     }
1444 
1445     /* Update the headers for the current IS */
1446     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1447       if ((ctr_j = ctr[j])) {
1448         sbuf1_j        = sbuf1[j];
1449         k              = ++sbuf1_j[0];
1450         sbuf1_j[2*k]   = ctr_j;
1451         sbuf1_j[2*k-1] = 0;
1452       }
1453     }
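    /* Each outgoing message sbuf1[proc] now has the layout
       [ #IS-blocks | (is#,count) pairs (a single pair with is# = 0 here) | requested global row indices ],
       with the row indices starting at offset 2*w3[proc]+1 */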
1454 
1455     /* Now post the sends */
1456     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1457     for (i=0; i<nrqs; ++i) {
1458       proc = pa[i];
1459       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1460     }
1461 
1462     /* Post Receives to capture the buffer size */
1463     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1464     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1465 
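    /* rbuf2 reuses the upper half of the tmp workspace (allocated with size 2*msz above); each reply
       has the same length as the request that produced it, so msz entries suffice */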
1466     rbuf2[0] = tmp + msz;
1467     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1468 
1469     for (i=0; i<nrqs; ++i) {
1470       proc = pa[i];
1471       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1472     }
1473 
1474     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1475 
1476     /* Send to other procs the buf size they should allocate */
1477     /* Receive messages*/
1478     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1479     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1480 
1481     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1482     for (i=0; i<nrqr; ++i) {
1483       req_size[i] = 0;
1484       rbuf1_i        = rbuf1[i];
1485       start          = 2*rbuf1_i[0] + 1;
1486       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1487       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1488       sbuf2_i        = sbuf2[i];
1489       for (j=start; j<end; j++) {
1490         k            = rbuf1_i[j] - rstart;
1491         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1492         sbuf2_i[j]   = ncols;
1493         req_size[i] += ncols;
1494       }
1495       req_source1[i] = r_status1[i].MPI_SOURCE;
1496 
1497       /* form the header */
1498       sbuf2_i[0] = req_size[i];
1499       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1500 
1501       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1502     }
1503 
1504     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1505     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1506 
1507     /* rbuf2 is received; post receives for the column indices a->j */
1508     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1509 
1510     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1511     for (i=0; i<nrqs; ++i) {
1512       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1513       req_source2[i] = r_status2[i].MPI_SOURCE;
1514       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1515     }
1516 
1517     /* Wait on sends1 and sends2 */
1518     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1519     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1520     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1521     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1522 
1523     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1524     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1525 
1526     ierr = PetscLogStagePop();CHKERRQ(ierr);
1527     ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr);
1528 
1529     /* Now allocate sending buffers for a->j, and send them off */
1530     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1531     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1532     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1533     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1534 
1535     for (i=0; i<nrqr; i++) { /* for each requested message */
1536       rbuf1_i   = rbuf1[i];
1537       sbuf_aj_i = sbuf_aj[i];
1538       ct1       = 2*rbuf1_i[0] + 1;
1539       ct2       = 0;
1540       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1541 
1542       kmax = rbuf1[i][2];
1543       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1544         row    = rbuf1_i[ct1] - rstart;
1545         nzA    = ai[row+1] - ai[row];     nzB = bi[row+1] - bi[row];
1546         ncols  = nzA + nzB;
1547         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1548 
1549         /* load the column indices for this row into cols*/
1550         cols = sbuf_aj_i + ct2;
1551 
1552         lwrite = 0;
1553         for (l=0; l<nzB; l++) {
1554           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1555         }
1556         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1557         for (l=0; l<nzB; l++) {
1558           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1559         }
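        /* cols[] now holds this row's global column indices in ascending order: off-diagonal entries
           below cstart, then the diagonal block, then entries at or above cend (bmap = garray is sorted) */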
1560 
1561         ct2 += ncols;
1562       }
1563       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1564     }
1565 
1566     /* create column map (cmap): global col of C -> local col of submat */
1567 #if defined(PETSC_USE_CTABLE)
1568     if (!allcolumns) {
1569       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1570       ierr = PetscCalloc2(C->cmap->n,&cmap_loc,C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1571       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1572         if (icol[j] >= cstart && icol[j] <cend) {
1573           cmap_loc[icol[j] - cstart] = j+1;
1574         } else { /* use PetscTable for non-local col indices */
1575           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1576         }
1577       }
1578     } else {
1579       cmap = NULL;
1580     }
1581 #else
1582     if (!allcolumns) {
1583       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1584       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1585     } else {
1586       cmap = NULL;
1587     }
1588 #endif
1589 
1590     /* Create lens for MatSeqAIJSetPreallocation() */
1591     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1592 
1593     /* Compute lens from local part of C */
1594     for (j=0; j<nrow; j++) {
1595       proc = 0;
1596       row  = irow[j];
1597       while (row >= C->rmap->range[proc+1]) proc++;
1598       if (proc == rank) {
1599         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1600         if (!allcolumns) {
1601           for (k=0; k<ncols; k++) {
1602 #if defined(PETSC_USE_CTABLE)
1603             if (cols[k] >= cstart && cols[k] <cend) {
1604               tcol = cmap_loc[cols[k] - cstart];
1605             } else {
1606               ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1607             }
1608 #else
1609             tcol = cmap[cols[k]];
1610 #endif
1611             if (tcol) lens[j]++;
1612           }
1613         } else { /* allcolumns */
1614           lens[j] = ncols;
1615         }
1616         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1617       }
1618     }
1619 
1620     /* Create row map (rmap): global row of C -> local row of submat */
1621 #if defined(PETSC_USE_CTABLE)
1622     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1623     for (j=0; j<nrow; j++) {
1624       proc = 0;
1625       row  = irow[j];
1626       while (row >= C->rmap->range[proc+1]) proc++;
1627       if (proc == rank) { /* a local row */
1628         rmap_loc[row - rstart] = j;
1629       } else {
1630         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1631       }
1632     }
1633 #else
1634     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1635     for (j=0; j<nrow; j++) {
1636       rmap[irow[j]] = j;
1637     }
1638 #endif
1639 
1640     /* Update lens from offproc data */
1641     /* recv a->j is done */
1642     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1643     for (i=0; i<nrqs; i++) {
1644       proc    = pa[i];
1645       sbuf1_i = sbuf1[proc];
1646       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1647       ct1     = 2 + 1;
1648       ct2     = 0;
1649       rbuf2_i = rbuf2[i]; /* received length of C->j */
1650       rbuf3_i = rbuf3[i]; /* received C->j */
1651 
1652       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1653       max1   = sbuf1_i[2];
1654       for (k=0; k<max1; k++,ct1++) {
1655 #if defined(PETSC_USE_CTABLE)
1656         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1657         row--;
1658         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1659 #else
1660         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1661 #endif
1662         /* Now, store row index of submat in sbuf1_i[ct1] */
1663         sbuf1_i[ct1] = row;
1664 
1665         nnz = rbuf2_i[ct1];
1666         if (!allcolumns) {
1667           for (l=0; l<nnz; l++,ct2++) {
1668 #if defined(PETSC_USE_CTABLE)
1669             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1670               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1671             } else {
1672               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1673             }
1674 #else
1675             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1676 #endif
1677             if (tcol) {
1678               lens[row]++;
1679 
1680               /* We receive all column indices of a row of C, but only a subset of them needs to be inserted into submat.
1681                For reuse, we replace the received C->j with the index that should be inserted into submat */
1682               /* rbuf3_i[ct3++] = ct2; ??? */
1683             }
1684           }
1685         } else { /* allcolumns */
1686           lens[row] += nnz;
1687         }
1688       }
1689     }
1690     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1691     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1692 
1693     /* Create the submatrices */
1694     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1695     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1696 
1697     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1698     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1699     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1700     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1701     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
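    /* lens[] holds the exact per-row nonzero counts of submat, so the insertions below should trigger no additional mallocs */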
1702 
1703     /* create struct Mat_SubMat and attach it to submat */
1704     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1705     subc = (Mat_SeqAIJ*)submat->data;
1706     subc->submatis1 = smatis1;
1707 
1708     smatis1->nrqs        = nrqs;
1709     smatis1->nrqr        = nrqr;
1710     smatis1->rbuf1       = rbuf1;
1711     smatis1->rbuf2       = rbuf2;
1712     smatis1->rbuf3       = rbuf3;
1713     smatis1->sbuf2       = sbuf2;
1714     smatis1->req_source2 = req_source2;
1715 
1716     smatis1->sbuf1       = sbuf1;
1717     smatis1->ptr         = ptr;
1718     smatis1->tmp         = tmp;
1719     smatis1->ctr         = ctr;
1720 
1721     smatis1->pa           = pa;
1722     smatis1->req_size     = req_size;
1723     smatis1->req_source1  = req_source1;
1724 
1725     smatis1->allcolumns  = allcolumns;
1726     smatis1->rmap        = rmap;
1727     smatis1->cmap        = cmap;
1728 #if defined(PETSC_USE_CTABLE)
1729     smatis1->rmap_loc    = rmap_loc;
1730     smatis1->cmap_loc    = cmap_loc;
1731 #endif
1732 
1733     smatis1->destroy     = submat->ops->destroy;
1734     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1735     submat->factortype   = C->factortype;
1736 
1737     ierr = PetscLogStagePop();CHKERRQ(ierr);
1738     ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr);
1739   } else { /* scall == MAT_REUSE_MATRIX */
1740     submat = submats[0];
1741     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1742 
1743     subc    = (Mat_SeqAIJ*)submat->data;
1744     smatis1 = subc->submatis1;
1745     nrqs        = smatis1->nrqs;
1746     nrqr        = smatis1->nrqr;
1747     rbuf1       = smatis1->rbuf1;
1748     rbuf2       = smatis1->rbuf2;
1749     rbuf3       = smatis1->rbuf3;
1750     req_source2 = smatis1->req_source2;
1751 
1752     sbuf1     = smatis1->sbuf1;
1753     sbuf2     = smatis1->sbuf2;
1754     ptr       = smatis1->ptr;
1755     tmp       = smatis1->tmp;
1756     ctr       = smatis1->ctr;
1757 
1758     pa         = smatis1->pa;
1759     req_size   = smatis1->req_size;
1760     req_source1 = smatis1->req_source1;
1761 
1762     allcolumns = smatis1->allcolumns;
1763     rmap       = smatis1->rmap;
1764     cmap       = smatis1->cmap;
1765 #if defined(PETSC_USE_CTABLE)
1766     rmap_loc   = smatis1->rmap_loc;
1767     cmap_loc   = smatis1->cmap_loc;
1768 #endif
1769   }
1770 
1771   /* Post recv matrix values */
1772   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1773   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1774   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
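  /* rbuf2[i][0] is the reply header from the i-th responding process: the total number of nonzeros it will
     send back, which also sizes the value buffer rbuf4[i] */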
1775   for (i=0; i<nrqs; ++i) {
1776     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1777     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1778   }
1779 
1780   /* Allocate sending buffers for a->a, and send them off */
1781   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1782   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1783   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1784   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1785 
1786   for (i=0; i<nrqr; i++) {
1787     rbuf1_i   = rbuf1[i];
1788     sbuf_aa_i = sbuf_aa[i];
1789     ct1       = 2*rbuf1_i[0]+1;
1790     ct2       = 0;
1791     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1792 
1793     kmax = rbuf1_i[2];
1794     for (k=0; k<kmax; k++,ct1++) {
1795       row = rbuf1_i[ct1] - rstart;
1796       nzA = ai[row+1] - ai[row];
1797       nzB = bi[row+1] - bi[row];
1798       ncols  = nzA + nzB;
1799       cworkB = bj + bi[row];
1800       vworkA = a_a + ai[row];
1801       vworkB = b_a + bi[row];
1802 
1803       /* load the column values for this row into vals*/
1804       vals = sbuf_aa_i + ct2;
1805 
1806       lwrite = 0;
1807       for (l=0; l<nzB; l++) {
1808         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1809       }
1810       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1811       for (l=0; l<nzB; l++) {
1812         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1813       }
1814 
1815       ct2 += ncols;
1816     }
1817     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1818   }
1819 
1820   if (scall == MAT_INITIAL_MATRIX) {
1821    ierr = PetscLogStagePop();CHKERRQ(ierr);
1822    ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr);
1823   }
1824 
1825   /* Assemble submat */
1826   if (!allcolumns) {
1827     ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr);
1828   }
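  /* subcols/subvals are scratch arrays (length ncol) used to collect the subset of each row that belongs
     to submat before a single MatSetValues_SeqAIJ() call */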
1829   /* First assemble the local rows */
1830   for (j=0; j<nrow; j++) {
1831     proc = 0;
1832     row  = irow[j];
1833     while (row >= C->rmap->range[proc+1]) proc++;
1834     if (proc == rank) {
1835       Crow = row;
1836 #if defined(PETSC_USE_CTABLE)
1837       row = rmap_loc[Crow - rstart]; /* row index of submat */
1838 #else
1839       row = rmap[row];
1840 #endif
1841       ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1842 
1843       if (!allcolumns) {
1844 #if defined(PETSC_USE_CTABLE)
1845         i = 0;
1846         k = 0;
1847         while (k < ncols && cols[k] < cstart) {
1848           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1849           if (tcol) {
1850             subcols[i]   = --tcol;
1851             subvals[i++] = vals[k];
1852           }
1853           k++;
1854         }
1855         while (k < ncols && cols[k] < cend) {
1856           tcol = cmap_loc[cols[k]-cstart];
1857           if (tcol) {
1858             subcols[i]   = --tcol;
1859             subvals[i++] = vals[k];
1860           }
1861           k++;
1862         }
1863         while (k<ncols) {
1864           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1865           if (tcol) {
1866             subcols[i]   = --tcol;
1867             subvals[i++] = vals[k];
1868           }
1869           k++;
1870         }
1871 #else
1872         i = 0;
1873         for (k=0; k<ncols; k++) {
1874           tcol = cmap[cols[k]];
1875           if (tcol) {
1876             subcols[i]   = --tcol;
1877             subvals[i++] = vals[k];
1878           }
1879         }
1880 #endif
1881         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1882       } else { /* allcolumns */
1883         ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
1884       }
1885       ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1886     }
1887   }
1888 
1889   if (scall == MAT_INITIAL_MATRIX) {
1890    ierr = PetscLogStagePop();CHKERRQ(ierr);
1891    ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr);
1892   }
1893 
1894   /* Now assemble the off-proc rows */
1895   for (i=0; i<nrqs; i++) { /* for each requested message */
1896     /* recv values from other processes */
1897     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1898     proc    = pa[idex];
1899     sbuf1_i = sbuf1[proc];
1900     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1901     ct1     = 2 + 1;
1902     ct2     = 0; /* count of received C->j */
1903     ct3     = 0; /* count of received C->j that will be inserted into submat */
1904     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1905     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1906     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1907 
1908     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1909     max1 = sbuf1_i[2];             /* num of rows */
1910     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1911       row = sbuf1_i[ct1]; /* row index of submat */
1912       if (!allcolumns) {
1913         idex = 0;
1914         if (scall == MAT_INITIAL_MATRIX) {
1915           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1916           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1917 #if defined(PETSC_USE_CTABLE)
1918             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1919               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1920             } else {
1921               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1922             }
1923 #else
1924             tcol = cmap[rbuf3_i[ct2]];
1925 #endif
1926             if (tcol) {
1927               subcols[idex]   = --tcol;
1928               subvals[idex++] = rbuf4_i[ct2];
1929 
1930               /* We receive all column indices of a row of C, but only a subset of them needs to be inserted into submat.
1931                For reuse, we replace the received C->j with the index that should be inserted into submat */
1932               rbuf3_i[ct3++] = ct2;
1933             }
1934           }
1935           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1936 
1937         } else { /* scall == MAT_REUSE_MATRIX */
1938           submat = submats[0];
1939           subc   = (Mat_SeqAIJ*)submat->data;
1940 
1941           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1942           for (l=0; l<nnz; l++) {
1943             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1944             subvals[idex++] = rbuf4_i[ct2];
1945           }
1946 
1947           bj = subc->j + subc->i[row];
1948           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1949         }
1950       } else { /* allcolumns */
1951         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1952         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1953         ct2 += nnz;
1954       }
1955     }
1956   }
1957 
1958   /* sends of a->a are done */
1959   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1960   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1961 
1962   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1963   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1964   submats[0] = submat;
1965 
1966   /* Restore the indices */
1967   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1968   if (!allcolumns) {
1969     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1970   }
1971 
1972   /* Destroy allocated memory */
1973   for (i=0; i<nrqs; ++i) {
1974     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1975   }
1976   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1977   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1978   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1979 
1980   if (!allcolumns) {
1981     ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr);
1982   }
1983 
1984   if (scall == MAT_INITIAL_MATRIX) {
1985     ierr = PetscLogStagePop();CHKERRQ(ierr);
1986 
1987     ierr = PetscFree(lens);CHKERRQ(ierr);
1988     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1989     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1990   }
1991   PetscFunctionReturn(0);
1992 }
1993 
1994 #undef __FUNCT__
1995 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
1996 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1997 {
1998   PetscErrorCode ierr;
1999   PetscInt       ncol;
2000   PetscBool      colflag,allcolumns=PETSC_FALSE;
2001 
2002   PetscFunctionBegin;
2003   /* Allocate memory to hold all the submatrices */
2004   if (scall != MAT_REUSE_MATRIX) {
2005     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2006   }
2007 
2008   /* Check for special case: each processor gets entire matrix columns */
2009   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2010   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2011   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2012 
2013   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2014   PetscFunctionReturn(0);
2015 }
2016 
2017 #undef __FUNCT__
2018 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
2019 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2020 {
2021   PetscErrorCode ierr;
2022   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2023   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2024 
2025   PetscFunctionBegin;
2026   /* Check for special case: each processor gets entire matrix */
2027   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2028     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2029     PetscFunctionReturn(0);
2030   }
2031 
2032   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2033     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2034     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2035     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2036     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2037     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2038       wantallmatrix = PETSC_TRUE;
2039 
2040       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2041     }
2042   }
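  /* MPI_MIN over PetscBool acts as a logical AND: the fast whole-matrix path is taken only if every process requested the entire matrix */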
2043   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2044   if (twantallmatrix) {
2045     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2046     PetscFunctionReturn(0);
2047   }
2048 
2049   /* Allocate memory to hold all the submatrices */
2050   if (scall != MAT_REUSE_MATRIX) {
2051     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2052   }
2053 
2054   /* Check for special case: each processor gets entire matrix columns */
2055   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2056   for (i=0; i<ismax; i++) {
2057     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2058     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2059     if (colflag && ncol == C->cmap->N) {
2060       allcolumns[i] = PETSC_TRUE;
2061     } else {
2062       allcolumns[i] = PETSC_FALSE;
2063     }
2064   }
2065 
2066   /* Determine the number of stages through which submatrices are done */
2067   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2068 
2069   /*
2070      Each stage will extract nmax submatrices.
2071      nmax is determined by the matrix column dimension.
2072      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2073   */
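  /* For example, with 8-byte PetscInt and C->cmap->N = 1,000,000 this gives nmax = 20000000/(1000000*8) = 2 submatrices per stage */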
2074   if (!nmax) nmax = 1;
2075   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2076 
2077   /* Make sure every processor loops through the nstages */
2078   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2079 
2080   for (i=0,pos=0; i<nstages; i++) {
2081     if (pos+nmax <= ismax) max_no = nmax;
2082     else if (pos == ismax) max_no = 0;
2083     else                   max_no = ismax-pos;
2084     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2085     pos += max_no;
2086   }
2087 
2088   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2089   PetscFunctionReturn(0);
2090 }
2091 
2092 /* -------------------------------------------------------------------------*/
2093 #undef __FUNCT__
2094 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
2095 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2096 {
2097   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2098   Mat            A  = c->A;
2099   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2100   const PetscInt **icol,**irow;
2101   PetscInt       *nrow,*ncol,start;
2102   PetscErrorCode ierr;
2103   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2104   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2105   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2106   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2107   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2108 #if defined(PETSC_USE_CTABLE)
2109   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2110 #else
2111   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2112 #endif
2113   const PetscInt *irow_i;
2114   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2115   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2116   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2117   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2118   MPI_Status     *r_status3,*r_status4,*s_status4;
2119   MPI_Comm       comm;
2120   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2121   PetscMPIInt    *onodes1,*olengths1;
2122   PetscMPIInt    idex,idex2,end;
2123 
2124   PetscFunctionBegin;
2125   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2126   tag0 = ((PetscObject)C)->tag;
2127   size = c->size;
2128   rank = c->rank;
2129 
2130   /* Get some new tags to keep the communication clean */
2131   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2132   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2133   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2134 
2135   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2136 
2137   for (i=0; i<ismax; i++) {
2138     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2139     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2140     if (allcolumns[i]) {
2141       icol[i] = NULL;
2142       ncol[i] = C->cmap->N;
2143     } else {
2144       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2145       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2146     }
2147   }
2148 
2149   /* evaluate communication - mesg to who, length of mesg, and buffer space
2150      required. Based on this, buffers are allocated, and data copied into them*/
2151   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2152   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2153   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2154   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2155   for (i=0; i<ismax; i++) {
2156     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2157     jmax   = nrow[i];
2158     irow_i = irow[i];
2159     for (j=0; j<jmax; j++) {
2160       l   = 0;
2161       row = irow_i[j];
2162       while (row >= C->rmap->range[l+1]) l++;
2163       proc = l;
2164       w4[proc]++;
2165     }
2166     for (j=0; j<size; j++) {
2167       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2168     }
2169   }
2170 
2171   nrqs     = 0;              /* no of outgoing messages */
2172   msz      = 0;              /* total mesg length (for all procs) */
2173   w1[rank] = 0;              /* no mesg sent to self */
2174   w3[rank] = 0;
2175   for (i=0; i<size; i++) {
2176     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2177   }
2178   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2179   for (i=0,j=0; i<size; i++) {
2180     if (w1[i]) { pa[j] = i; j++; }
2181   }
2182 
2183   /* Each message would have a header = 1 + 2*(no of IS) + data */
2184   for (i=0; i<nrqs; i++) {
2185     j      = pa[i];
2186     w1[j] += w2[j] + 2* w3[j];
2187     msz   += w1[j];
2188   }
2189   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2190 
2191   /* Determine the number of messages to expect, their lengths, and the ranks they come from (from-ids) */
2192   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2193   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2194 
2195   /* Now post the Irecvs corresponding to these messages */
2196   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2197 
2198   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2199   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2200 
2201   /* Allocate Memory for outgoing messages */
2202   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2203   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2204   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2205 
2206   {
2207     PetscInt *iptr = tmp;
2208     k    = 0;
2209     for (i=0; i<nrqs; i++) {
2210       j        = pa[i];
2211       iptr    += k;
2212       sbuf1[j] = iptr;
2213       k        = w1[j];
2214     }
2215   }
2216 
2217   /* Form the outgoing messages */
2218   /* Initialize the header space */
2219   for (i=0; i<nrqs; i++) {
2220     j           = pa[i];
2221     sbuf1[j][0] = 0;
2222     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2223     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2224   }
2225 
2226   /* Parse the isrow and copy data into outbuf */
2227   for (i=0; i<ismax; i++) {
2228     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2229     irow_i = irow[i];
2230     jmax   = nrow[i];
2231     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2232       l   = 0;
2233       row = irow_i[j];
2234       while (row >= C->rmap->range[l+1]) l++;
2235       proc = l;
2236       if (proc != rank) { /* copy to the outgoing buf*/
2237         ctr[proc]++;
2238         *ptr[proc] = row;
2239         ptr[proc]++;
2240       }
2241     }
2242     /* Update the headers for the current IS */
2243     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2244       if ((ctr_j = ctr[j])) {
2245         sbuf1_j        = sbuf1[j];
2246         k              = ++sbuf1_j[0];
2247         sbuf1_j[2*k]   = ctr_j;
2248         sbuf1_j[2*k-1] = i;
2249       }
2250     }
2251   }
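  /* Each sbuf1[j] now has the layout [ #IS-blocks | (is#,count) pairs | row indices grouped by IS ],
     with the row data starting at offset 2*w3[j]+1 */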
2252 
2253   /*  Now  post the sends */
2254   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2255   for (i=0; i<nrqs; ++i) {
2256     j    = pa[i];
2257     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2258   }
2259 
2260   /* Post Receives to capture the buffer size */
2261   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2262   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2263   rbuf2[0] = tmp + msz;
2264   for (i=1; i<nrqs; ++i) {
2265     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2266   }
2267   for (i=0; i<nrqs; ++i) {
2268     j    = pa[i];
2269     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2270   }
2271 
2272   /* Send to other procs the buf size they should allocate */
2273 
2274 
2275   /* Receive messages*/
2276   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2277   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2278   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2279   {
2280     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2281     PetscInt   *sbuf2_i;
2282 
2283     for (i=0; i<nrqr; ++i) {
2284       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2285 
2286       req_size[idex] = 0;
2287       rbuf1_i        = rbuf1[idex];
2288       start          = 2*rbuf1_i[0] + 1;
2289       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2290       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2291       sbuf2_i        = sbuf2[idex];
2292       for (j=start; j<end; j++) {
2293         id              = rbuf1_i[j] - rstart;
2294         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2295         sbuf2_i[j]      = ncols;
2296         req_size[idex] += ncols;
2297       }
2298       req_source[idex] = r_status1[i].MPI_SOURCE;
2299       /* form the header */
2300       sbuf2_i[0] = req_size[idex];
2301       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2302 
2303       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2304     }
2305   }
2306   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2307   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2308 
2309   /* Receive the buffer sizes (rbuf2), then post receives for the column indices (rbuf3) and values (rbuf4) */
2311 
2312   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2313   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2314   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2315   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2316   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2317 
2318   for (i=0; i<nrqs; ++i) {
2319     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2320     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2321     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2322     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2323     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2324   }
2325   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2326   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2327 
2328   /* Wait on sends1 and sends2 */
2329   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2330   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2331 
2332   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2333   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2334   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2335   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2336   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2337   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2338 
2339   /* Now allocate buffers for a->j, and send them off */
2340   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2341   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2342   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2343   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2344 
2345   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2346   {
2347     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2348     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2349     PetscInt cend = C->cmap->rend;
2350     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2351 
2352     for (i=0; i<nrqr; i++) {
2353       rbuf1_i   = rbuf1[i];
2354       sbuf_aj_i = sbuf_aj[i];
2355       ct1       = 2*rbuf1_i[0] + 1;
2356       ct2       = 0;
2357       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2358         kmax = rbuf1[i][2*j];
2359         for (k=0; k<kmax; k++,ct1++) {
2360           row    = rbuf1_i[ct1] - rstart;
2361           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2362           ncols  = nzA + nzB;
2363           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2364 
2365           /* load the column indices for this row into cols*/
2366           cols = sbuf_aj_i + ct2;
2367 
2368           lwrite = 0;
2369           for (l=0; l<nzB; l++) {
2370             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2371           }
2372           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2373           for (l=0; l<nzB; l++) {
2374             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2375           }
2376 
2377           ct2 += ncols;
2378         }
2379       }
2380       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2381     }
2382   }
2383   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2384   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2385 
2386   /* Allocate buffers for a->a, and send them off */
2387   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2388   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2389   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2390   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2391 
2392   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2393   {
2394     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2395     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2396     PetscInt    cend   = C->cmap->rend;
2397     PetscInt    *b_j   = b->j;
2398     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2399 
2400     for (i=0; i<nrqr; i++) {
2401       rbuf1_i   = rbuf1[i];
2402       sbuf_aa_i = sbuf_aa[i];
2403       ct1       = 2*rbuf1_i[0]+1;
2404       ct2       = 0;
2405       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2406         kmax = rbuf1_i[2*j];
2407         for (k=0; k<kmax; k++,ct1++) {
2408           row    = rbuf1_i[ct1] - rstart;
2409           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2410           ncols  = nzA + nzB;
2411           cworkB = b_j + b_i[row];
2412           vworkA = a_a + a_i[row];
2413           vworkB = b_a + b_i[row];
2414 
2415           /* load the column values for this row into vals*/
2416           vals = sbuf_aa_i+ct2;
2417 
2418           lwrite = 0;
2419           for (l=0; l<nzB; l++) {
2420             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2421           }
2422           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2423           for (l=0; l<nzB; l++) {
2424             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2425           }
2426 
2427           ct2 += ncols;
2428         }
2429       }
2430       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2431     }
2432   }
2433   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2434   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2435   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2436   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2437 
2438   /* Form the matrix */
2439   /* create col map: global col of C -> local col of submatrices */
2440   {
2441     const PetscInt *icol_i;
2442 #if defined(PETSC_USE_CTABLE)
2443     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2444     for (i=0; i<ismax; i++) {
2445       if (!allcolumns[i]) {
2446         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2447 
2448         jmax   = ncol[i];
2449         icol_i = icol[i];
2450         cmap_i = cmap[i];
2451         for (j=0; j<jmax; j++) {
2452           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2453         }
2454       } else {
2455         cmap[i] = NULL;
2456       }
2457     }
2458 #else
2459     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2460     for (i=0; i<ismax; i++) {
2461       if (!allcolumns[i]) {
2462         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2463         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2464         jmax   = ncol[i];
2465         icol_i = icol[i];
2466         cmap_i = cmap[i];
2467         for (j=0; j<jmax; j++) {
2468           cmap_i[icol_i[j]] = j+1;
2469         }
2470       } else {
2471         cmap[i] = NULL;
2472       }
2473     }
2474 #endif
2475   }
2476 
2477   /* Create lens which is required for MatSeqAIJSetPreallocation() */
2478   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2479   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2480   if (ismax) {
2481     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2482     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2483   }
2484   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2485 
2486   /* Update lens from local data */
2487   for (i=0; i<ismax; i++) {
2488     jmax = nrow[i];
2489     if (!allcolumns[i]) cmap_i = cmap[i];
2490     irow_i = irow[i];
2491     lens_i = lens[i];
2492     for (j=0; j<jmax; j++) {
2493       l   = 0;
2494       row = irow_i[j];
2495       while (row >= C->rmap->range[l+1]) l++;
2496       proc = l;
2497       if (proc == rank) {
2498         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2499         if (!allcolumns[i]) {
2500           for (k=0; k<ncols; k++) {
2501 #if defined(PETSC_USE_CTABLE)
2502             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2503 #else
2504             tcol = cmap_i[cols[k]];
2505 #endif
2506             if (tcol) lens_i[j]++;
2507           }
2508         } else { /* allcolumns */
2509           lens_i[j] = ncols;
2510         }
2511         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2512       }
2513     }
2514   }
2515 
2516   /* Create row map: global row of C -> local row of submatrices */
2517 #if defined(PETSC_USE_CTABLE)
2518   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2519   for (i=0; i<ismax; i++) {
2520     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2521     irow_i = irow[i];
2522     jmax   = nrow[i];
2523     for (j=0; j<jmax; j++) {
2524       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2525     }
2526   }
2527 #else
2528   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2529   if (ismax) {
2530     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2531     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2532   }
2533   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2534   for (i=0; i<ismax; i++) {
2535     rmap_i = rmap[i];
2536     irow_i = irow[i];
2537     jmax   = nrow[i];
2538     for (j=0; j<jmax; j++) {
2539       rmap_i[irow_i[j]] = j;
2540     }
2541   }
2542 #endif
2543 
2544   /* Update lens from offproc data */
2545   {
2546     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2547 
2548     for (tmp2=0; tmp2<nrqs; tmp2++) {
2549       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2550       idex    = pa[idex2];
2551       sbuf1_i = sbuf1[idex];
2552       jmax    = sbuf1_i[0];
2553       ct1     = 2*jmax+1;
2554       ct2     = 0;
2555       rbuf2_i = rbuf2[idex2];
2556       rbuf3_i = rbuf3[idex2];
2557       for (j=1; j<=jmax; j++) {
2558         is_no  = sbuf1_i[2*j-1];
2559         max1   = sbuf1_i[2*j];
2560         lens_i = lens[is_no];
2561         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2562         rmap_i = rmap[is_no];
2563         for (k=0; k<max1; k++,ct1++) {
2564 #if defined(PETSC_USE_CTABLE)
2565           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2566           row--;
2567           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2568 #else
2569           row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2570 #endif
2571           max2 = rbuf2_i[ct1];
2572           for (l=0; l<max2; l++,ct2++) {
2573             if (!allcolumns[is_no]) {
2574 #if defined(PETSC_USE_CTABLE)
2575               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2576 #else
2577               tcol = cmap_i[rbuf3_i[ct2]];
2578 #endif
2579               if (tcol) lens_i[row]++;
2580             } else { /* allcolumns */
2581               lens_i[row]++; /* lens_i[row] += max2 ? */
2582             }
2583           }
2584         }
2585       }
2586     }
2587   }
2588   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2589   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2590   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2591   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2592   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2593 
2594   /* Create the submatrices */
2595   if (scall == MAT_REUSE_MATRIX) {
2596     PetscBool flag;
2597 
2598     /*
2599         Assumes the new rows are the same length as the old rows, hence a bug!
2600     */
2601     for (i=0; i<ismax; i++) {
2602       mat = (Mat_SeqAIJ*)(submats[i]->data);
2603       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2604       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2605       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2606       /* Initialize matrix as if empty */
2607       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2608 
2609       submats[i]->factortype = C->factortype;
2610     }
2611   } else {
2612     for (i=0; i<ismax; i++) {
2613       PetscInt rbs,cbs;
2614       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2615       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2616 
2617       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2618       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2619 
2620       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2621       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2622       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2623     }
2624   }
2625 
2626   /* Assemble the matrices */
2627   /* First assemble the local rows */
2628   {
2629     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2630     PetscScalar *imat_a;
2631 
2632     for (i=0; i<ismax; i++) {
2633       mat       = (Mat_SeqAIJ*)submats[i]->data;
2634       imat_ilen = mat->ilen;
2635       imat_j    = mat->j;
2636       imat_i    = mat->i;
2637       imat_a    = mat->a;
2638 
2639       if (!allcolumns[i]) cmap_i = cmap[i];
2640       rmap_i = rmap[i];
2641       irow_i = irow[i];
2642       jmax   = nrow[i];
2643       for (j=0; j<jmax; j++) {
2644         l   = 0;
2645         row = irow_i[j];
2646         while (row >= C->rmap->range[l+1]) l++;
2647         proc = l;
2648         if (proc == rank) {
2649           old_row = row;
2650 #if defined(PETSC_USE_CTABLE)
2651           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2652           row--;
2653 #else
2654           row = rmap_i[row];
2655 #endif
2656           ilen_row = imat_ilen[row];
2657           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2658           mat_i    = imat_i[row];
2659           mat_a    = imat_a + mat_i;
2660           mat_j    = imat_j + mat_i;
2661           if (!allcolumns[i]) {
2662             for (k=0; k<ncols; k++) {
2663 #if defined(PETSC_USE_CTABLE)
2664               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2665 #else
2666               tcol = cmap_i[cols[k]];
2667 #endif
2668               if (tcol) {
2669                 *mat_j++ = tcol - 1;
2670                 *mat_a++ = vals[k];
2671                 ilen_row++;
2672               }
2673             }
2674           } else { /* allcolumns */
2675             for (k=0; k<ncols; k++) {
2676               *mat_j++ = cols[k];  /* global col index! */
2677               *mat_a++ = vals[k];
2678               ilen_row++;
2679             }
2680           }
2681           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2682 
2683           imat_ilen[row] = ilen_row;
2684         }
2685       }
2686     }
2687   }
2688 
2689   /*  Now assemble the off-proc rows */
2690   {
2691     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2692     PetscInt    *imat_j,*imat_i;
2693     PetscScalar *imat_a,*rbuf4_i;
2694 
2695     for (tmp2=0; tmp2<nrqs; tmp2++) {
2696       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2697       idex    = pa[idex2];
2698       sbuf1_i = sbuf1[idex];
2699       jmax    = sbuf1_i[0];
2700       ct1     = 2*jmax + 1;
2701       ct2     = 0;
2702       rbuf2_i = rbuf2[idex2];
2703       rbuf3_i = rbuf3[idex2];
2704       rbuf4_i = rbuf4[idex2];
2705       for (j=1; j<=jmax; j++) {
2706         is_no     = sbuf1_i[2*j-1];
2707         rmap_i    = rmap[is_no];
2708         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2709         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2710         imat_ilen = mat->ilen;
2711         imat_j    = mat->j;
2712         imat_i    = mat->i;
2713         imat_a    = mat->a;
2714         max1      = sbuf1_i[2*j];
2715         for (k=0; k<max1; k++,ct1++) {
2716           row = sbuf1_i[ct1];
2717 #if defined(PETSC_USE_CTABLE)
2718           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2719           row--;
2720 #else
2721           row = rmap_i[row];
2722 #endif
2723           ilen  = imat_ilen[row];
2724           mat_i = imat_i[row];
2725           mat_a = imat_a + mat_i;
2726           mat_j = imat_j + mat_i;
2727           max2  = rbuf2_i[ct1];
2728           if (!allcolumns[is_no]) {
2729             for (l=0; l<max2; l++,ct2++) {
2730 
2731 #if defined(PETSC_USE_CTABLE)
2732               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2733 #else
2734               tcol = cmap_i[rbuf3_i[ct2]];
2735 #endif
2736               if (tcol) {
2737                 *mat_j++ = tcol - 1;
2738                 *mat_a++ = rbuf4_i[ct2];
2739                 ilen++;
2740               }
2741             }
2742           } else { /* allcolumns */
2743             for (l=0; l<max2; l++,ct2++) {
2744               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2745               *mat_a++ = rbuf4_i[ct2];
2746               ilen++;
2747             }
2748           }
2749           imat_ilen[row] = ilen;
2750         }
2751       }
2752     }
2753   }
2754 
2755   /* sort the rows */
2756   {
2757     PetscInt    *imat_ilen,*imat_j,*imat_i;
2758     PetscScalar *imat_a;
2759 
2760     for (i=0; i<ismax; i++) {
2761       mat       = (Mat_SeqAIJ*)submats[i]->data;
2762       imat_j    = mat->j;
2763       imat_i    = mat->i;
2764       imat_a    = mat->a;
2765       imat_ilen = mat->ilen;
2766 
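      /* rows of an allcolumns submatrix already store ascending global column indices (each row is filled
         from a single owning process in column order), so only the mapped-column case needs sorting */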
2767       if (allcolumns[i]) continue;
2768       jmax = nrow[i];
2769       for (j=0; j<jmax; j++) {
2770         PetscInt ilen;
2771 
2772         mat_i = imat_i[j];
2773         mat_a = imat_a + mat_i;
2774         mat_j = imat_j + mat_i;
2775         ilen  = imat_ilen[j];
2776         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2777       }
2778     }
2779   }
2780 
2781   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2782   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2783   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2784   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2785   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2786 
2787   /* Restore the indices */
2788   for (i=0; i<ismax; i++) {
2789     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2790     if (!allcolumns[i]) {
2791       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2792     }
2793   }
2794 
2795   /* Destroy allocated memory */
2796   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2797   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2798   ierr = PetscFree(pa);CHKERRQ(ierr);
2799 
2800   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2801   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2802   for (i=0; i<nrqr; ++i) {
2803     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2804   }
2805   for (i=0; i<nrqs; ++i) {
2806     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2807     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2808   }
2809 
2810   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2811   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2812   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2813   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2814   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2815   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2816   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2817 
2818 #if defined(PETSC_USE_CTABLE)
2819   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2820 #else
2821   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2822 #endif
2823   ierr = PetscFree(rmap);CHKERRQ(ierr);
2824 
2825   for (i=0; i<ismax; i++) {
2826     if (!allcolumns[i]) {
2827 #if defined(PETSC_USE_CTABLE)
2828       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2829 #else
2830       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2831 #endif
2832     }
2833   }
2834   ierr = PetscFree(cmap);CHKERRQ(ierr);
2835   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2836   ierr = PetscFree(lens);CHKERRQ(ierr);
2837 
2838   for (i=0; i<ismax; i++) {
2839     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2840     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2841   }
2842   PetscFunctionReturn(0);
2843 }
2844 
2845 /*
2846  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2847  Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2848  of [0,m), the range of dcolemb a subset of [0,n), and the range of ocolemb a subset of [0,N-n).
2849  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2850  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2851  state, and needs to be "assembled" later by compressing B's column space.
2852 
2853  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2854  Following this call, C->A & C->B have been created, even if empty.
2855  */
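/*
  Minimal usage sketch (hypothetical caller, not part of this file; the names Ad and Ao are
  assumptions): with NULL embeddings the sequential blocks must already match C's local sizes,
  i.e. Ad is C->rmap->n x C->cmap->n and Ao is C->rmap->n x (C->cmap->N - C->cmap->n).

    Mat C;        MATMPIAIJ, sizes set, not yet preallocated
    Mat Ad,Ao;    SEQAIJ diagonal and off-diagonal blocks built locally

    ierr = MatSetSeqMats_MPIAIJ(C,NULL,NULL,NULL,DIFFERENT_NONZERO_PATTERN,Ad,Ao);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);   as noted at the end of the routine, C still needs assembly
    ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/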
2856 #undef __FUNCT__
2857 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
2858 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2859 {
2860   /* If making this function public, change the error returned in this function away from _PLIB. */
2861   PetscErrorCode ierr;
2862   Mat_MPIAIJ     *aij;
2863   Mat_SeqAIJ     *Baij;
2864   PetscBool      seqaij,Bdisassembled;
2865   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2866   PetscScalar    v;
2867   const PetscInt *rowindices,*colindices;
2868 
2869   PetscFunctionBegin;
2870   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2871   if (A) {
2872     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2873     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2874     if (rowemb) {
2875       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2876       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2877     } else {
2878       if (C->rmap->n != A->rmap->n) {
2879 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2880       }
2881     }
2882     if (dcolemb) {
2883       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2884       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2885     } else {
2886       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2887     }
2888   }
2889   if (B) {
2890     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2891     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2892     if (rowemb) {
2893       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2894       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2895     } else {
2896       if (C->rmap->n != B->rmap->n) {
2897 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2898       }
2899     }
2900     if (ocolemb) {
2901       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2902       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2903     } else {
2904       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2905     }
2906   }
2907 
2908   aij    = (Mat_MPIAIJ*)(C->data);
2909   if (!aij->A) {
2910     /* Mimic parts of MatMPIAIJSetPreallocation() */
2911     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2912     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2913     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2914     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2915     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2916   }
2917   if (A) {
2918     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2919   } else {
2920     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2921   }
2922   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2923     /*
2924       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2925       need to "disassemble" B -- convert it to using C's global indices.
2926       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2927 
2928       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2929       we call MatZeroEntries() on it first, so there may be a bunch of explicit zeros that, perhaps, could be compacted out.
2930 
2931       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2932       At least avoid calling MatSetValues() and the implied searches?
2933     */
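    /*
      Illustrative example (the numbers are hypothetical): if C has N = 10 global columns and
      this rank owns columns [rstart,rend) = [4,7), then a compacted off-diagonal column index
      c in [0,N-n) = [0,7) is mapped below to global column c when c < rstart and to
      c + (rend-rstart) otherwise, e.g. c = 2 -> 2 and c = 5 -> 8, skipping the local block [4,7).
    */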
2934 
2935     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2936 #if defined(PETSC_USE_CTABLE)
2937       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2938 #else
2939       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2940       /* A bit of a HACK: ideally we should handle the aij->B case entirely in the code block below. */
2941       if (aij->B) {
2942         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2943       }
2944 #endif
2945       ngcol = 0;
2946       if (aij->lvec) {
2947 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2948       }
2949       if (aij->garray) {
2950 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2951 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2952       }
2953       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2954       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2955     }
2956     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2957       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2958     }
2959     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2960       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2961     }
2962   }
2963   Bdisassembled = PETSC_FALSE;
2964   if (!aij->B) {
2965     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2966     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2967     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2968     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2969     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2970     Bdisassembled = PETSC_TRUE;
2971   }
2972   if (B) {
2973     Baij = (Mat_SeqAIJ*)(B->data);
2974     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2975       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2976       for (i=0; i<B->rmap->n; i++) {
2977 	nz[i] = Baij->i[i+1] - Baij->i[i];
2978       }
2979       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2980       ierr = PetscFree(nz);CHKERRQ(ierr);
2981     }
2982 
2983     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2984     shift = rend-rstart;
2985     count = 0;
2986     rowindices = NULL;
2987     colindices = NULL;
2988     if (rowemb) {
2989       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2990     }
2991     if (ocolemb) {
2992       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2993     }
2994     for (i=0; i<B->rmap->n; i++) {
2995       PetscInt row;
2996       row = i;
2997       if (rowindices) row = rowindices[i];
2998       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2999 	col  = Baij->j[count];
3000 	if (colindices) col = colindices[col];
3001 	if (Bdisassembled && col>=rstart) col += shift;
3002 	v    = Baij->a[count];
3003 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3004 	++count;
3005       }
3006     }
3007     /* No assembly for aij->B is necessary. */
3008     /* FIXME: set aij->B's nonzerostate correctly. */
3009   } else {
3010     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3011   }
3012   C->preallocated  = PETSC_TRUE;
3013   C->was_assembled = PETSC_FALSE;
3014   C->assembled     = PETSC_FALSE;
3015    /*
3016       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3017       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3018    */
3019   PetscFunctionReturn(0);
3020 }
3021 
3022 #undef __FUNCT__
3023 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
3024 /*
3025   B uses local column indices in the range [0, N-n); they must be interpreted using garray.
3026  */
3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3028 {
3029   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3030 
3031   PetscFunctionBegin;
3032   PetscValidPointer(A,2);
3033   PetscValidPointer(B,3);
3034   /* FIXME: make sure C is assembled */
3035   *A = aij->A;
3036   *B = aij->B;
3037   /* Note that we don't incref *A and *B, so be careful! */
3038   PetscFunctionReturn(0);
3039 }
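/*
  Illustrative sketch (an assumption, not part of this file's API surface; assumes C is an
  assembled MATMPIAIJ): the off-diagonal block returned above uses compacted local column
  indices, which translate to C's global column numbering through the garray map of Mat_MPIAIJ.

    Mat         Ad,Ao;
    Mat_MPIAIJ  *a = (Mat_MPIAIJ*)C->data;
    Mat_SeqAIJ  *o;
    PetscInt    k,gcol;

    ierr = MatGetSeqMats_MPIAIJ(C,&Ad,&Ao);CHKERRQ(ierr);
    o    = (Mat_SeqAIJ*)Ao->data;
    for (k=0; k<o->i[Ao->rmap->n]; k++) {
      gcol = a->garray[o->j[k]];    global column index of the k-th off-diagonal nonzero
    }
*/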
3040 
3041 /*
3042   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3043   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3044 */
3045 #undef __FUNCT__
3046 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
3047 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3048                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3049 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3050 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3051 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3052 {
3053   PetscErrorCode ierr;
3054   PetscMPIInt    isize,flag;
3055   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3056   Mat            *A,*B;
3057   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3058 
3059   PetscFunctionBegin;
3060   if (!ismax) PetscFunctionReturn(0);
3061 
3062   for (i = 0, cismax = 0; i < ismax; ++i) {
3063     PetscMPIInt isize;
3064     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3065     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3066     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3067     if (isize > 1) ++cismax;
3068   }
3069   /*
3070      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3071      ispar is nonzero iff at least one rank of C's comm has a parallel IS (it is the maximum of cismax over the comm).
3072   */
3073   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3074   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3075     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3076     PetscFunctionReturn(0);
3077   }
3078 
3079   /* if (ispar) */
3080   /*
3081     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3082     These are used to extract the off-diag portion of the resulting parallel matrix.
3083     The row IS for the off-diag portion is the same as for the diag portion,
3084     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3085   */
3086   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3087   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3088   for (i = 0, ii = 0; i < ismax; ++i) {
3089     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3090     if (isize > 1) {
3091       /*
3092 	 TODO: This is the part that's ***NOT SCALABLE***.
3093 	 To fix this we need to extract just the indices of C's nonzero columns
3094 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3095 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3096 	 to be done without serializing on the IS list, so, most likely, it is best
3097 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3098       */
3099       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3100       /* Now we have to
3101 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3102 	     were sorted on each rank, concatenated they might no longer be sorted;
3103 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3104 	     indices in the nondecreasing order to the original index positions.
3105 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3106       */
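      /* Illustrative example: if ciscol[ii] = {7,2,5}, ISSortPermutation() below yields the
         permutation {1,2,0} (original positions of the sorted values 2,5,7), and ISSort()
         then turns ciscol[ii] into {2,5,7}; the permutation would be NULL had ciscol[ii]
         already been strictly increasing. */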
3107       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3108       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3109       ++ii;
3110     }
3111   }
3112   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3113   for (i = 0, ii = 0; i < ismax; ++i) {
3114     PetscInt       j,issize;
3115     const PetscInt *indices;
3116 
3117     /*
3118        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3119      */
3120     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3121     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3122     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3123     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3124     for (j = 1; j < issize; ++j) {
3125       if (indices[j] == indices[j-1]) {
3126 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3127       }
3128     }
3129     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3130 
3131 
3132     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3133     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3134     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3135     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3136     for (j = 1; j < issize; ++j) {
3137       if (indices[j-1] == indices[j]) {
3138 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3139       }
3140     }
3141     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3142     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3143     if (isize > 1) {
3144       cisrow[ii] = isrow[i];
3145       ++ii;
3146     }
3147   }
3148   /*
3149     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3150     array of sequential matrices underlying the resulting parallel matrices.
3151     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3152     contain duplicates.
3153 
3154     There are as many diag matrices as there are original index sets. There are only as many parallel
3155     and off-diag matrices as there are parallel (comm size > 1) index sets.
3156 
3157     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3158     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3159       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3160       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3161       with A[i] and B[ii] extracted from the corresponding MPI submat.
3162     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3163       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3164       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3165       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3166       values into A[i] and B[ii] sitting inside the corresponding submat.
3167     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3168       A[i], B[ii], AA[i] or BB[ii] matrices.
3169   */
3170   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3171   if (scall == MAT_INITIAL_MATRIX) {
3172     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3173     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3174   } else {
3175     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3176     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3177     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
3178     for (i = 0, ii = 0; i < ismax; ++i) {
3179       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3180       if (isize > 1) {
3181 	Mat AA,BB;
3182 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3183 	if (!isrow_p[i] && !iscol_p[i]) {
3184 	  A[i] = AA;
3185 	} else {
3186 	  /* TODO: extract A[i] composed on AA. */
3187 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3188 	}
3189 	if (!isrow_p[i] && !ciscol_p[ii]) {
3190 	  B[ii] = BB;
3191 	} else {
3192 	  /* TODO: extract B[ii] composed on BB. */
3193 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3194 	}
3195 	++ii;
3196       } else {
3197 	if (!isrow_p[i] && !iscol_p[i]) {
3198 	  A[i] = (*submat)[i];
3199 	} else {
3200 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3201 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3202 	}
3203       }
3204     }
3205   }
3206   /* Now obtain the sequential A and B submatrices separately. */
3207   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3208   /* TODO: no clean way to fix this yet.
3209    * The local column size of B[i] can differ from the size of ciscol[i]:
3210    * B[i]'s column size is only finalized during matrix assembly, while
3211    * ciscol[i] is computed as the complement of iscol[i].
3212    * It would be better to keep only the nonzero column indices when
3213    * computing the complement ciscol[i].
3214    */
3215   if (scall == MAT_REUSE_MATRIX) {
3216     for (i=0; i<cismax; i++) {
3217       ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3218       B[i]->cmap->n = ciscol_localsize;
3219     }
3220   }
3221   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3222   /*
3223     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3224     matrices A & B have been extracted directly into the parallel matrices containing them, or
3225     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3226     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3227     to have the same sparsity pattern.
3228     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3229     must be constructed for C. This is done by setseqmat(s).
3230   */
3231   for (i = 0, ii = 0; i < ismax; ++i) {
3232     /*
3233        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3234        That way we can avoid sorting and computing permutations when reusing.
3235        To this end:
3236         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3237 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3238 	  be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3239     */
3240     MatStructure pattern;
3241     if (scall == MAT_INITIAL_MATRIX) {
3242       pattern = DIFFERENT_NONZERO_PATTERN;
3243     } else {
3244       pattern = SAME_NONZERO_PATTERN;
3245     }
3246     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3247     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3248     if (isize > 1) {
3249       if (scall == MAT_INITIAL_MATRIX) {
3250 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3251 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3252 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3253 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3254 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3255       }
3256       /*
3257 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3258       */
3259       {
3260 	Mat AA,BB;
3261 	AA = NULL;
3262 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3263 	BB = NULL;
3264 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3265 	if (AA || BB) {
3266 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3267 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3268 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3269 	}
3270 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3271 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3272 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3273 	}
3274 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3275 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
3276 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3277 	}
3278       }
3279       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3280       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3281       ++ii;
3282     } else { /* if (isize == 1) */
3283       if (scall == MAT_INITIAL_MATRIX) {
3284 	if (isrow_p[i] || iscol_p[i]) {
3285 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3286 	} else (*submat)[i] = A[i];
3287       }
3288       if (isrow_p[i] || iscol_p[i]) {
3289 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3290 	/* Otherwise A is extracted straight into (*submat)[i]. */
3291 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3292 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3293       }
3294     }
3295     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3296     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3297   }
3298   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3299   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3300   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3301   ierr = PetscFree(A);CHKERRQ(ierr);
3302   ierr = PetscFree(B);CHKERRQ(ierr);
3303   PetscFunctionReturn(0);
3304 }
3305 
3306 
3307 
3308 #undef __FUNCT__
3309 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
3310 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3311 {
3312   PetscErrorCode ierr;
3313 
3314   PetscFunctionBegin;
3315   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3316   PetscFunctionReturn(0);
3317 }
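/*
  Usage sketch (hypothetical, for illustration only; in practice this routine is reached through
  the matrix operation table rather than called directly): extract a single parallel submatrix of
  C defined by stride index sets living on C's communicator. The values nloc and first below are
  assumptions standing in for each rank's piece of the index sets.

    IS  isrow,iscol;
    Mat *submat;

    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&isrow);CHKERRQ(ierr);
    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&iscol);CHKERRQ(ierr);
    ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
       ... use submat[0], then possibly refresh its values with the same index sets ...
    ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
    ierr = ISDestroy(&isrow);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol);CHKERRQ(ierr);
*/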
3318