xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision eaa5305cee10621aa4272e6dcd94f29e695dbdf2)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
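/*
   A minimal usage sketch (illustration only, not part of this file): A, nlocal
   and rstart are placeholders for an assembled MPIAIJ matrix, the number of
   locally selected rows and the first selected row.

      IS is[1];
      ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is[0]);CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);
      ierr = ISDestroy(&is[0]);CHKERRQ(ierr);

   The call grows is[0] by two levels of overlap and ends up in the MPIAIJ
   implementations defined below.
*/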
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
56 {
57   PetscErrorCode   ierr;
58   MPI_Comm         comm;
59   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
60   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
61   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
62   const PetscInt  *indices_i,**indices;
63   PetscLayout      rmap;
64   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
65   PetscSF          sf;
66   PetscSFNode     *remote;
67 
68   PetscFunctionBegin;
69   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
70   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
71   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
72   /* get row map to determine where rows should be going */
73   ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
74   /* retrieve the IS data and put it all together so that we
75    * can optimize communication
76    */
77   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
78   for (i=0,tlength=0; i<nidx; i++){
79     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
80     tlength += length[i];
81     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
82   }
83   /* find these rows on remote processors */
84   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
85   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
86   nrrows = 0;
87   for (i=0; i<nidx; i++){
88     length_i     = length[i];
89     indices_i    = indices[i];
90     for (j=0; j<length_i; j++){
91       owner = -1;
92       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
93       /* remote processors */
94       if (owner != rank){
95         tosizes_temp[owner]++; /* number of rows to owner */
96         rrow_ranks[nrrows]  = owner; /* processor */
97         rrow_isids[nrrows]   = i; /* is id */
98         remoterows[nrrows++] = indices_i[j]; /* row */
99       }
100     }
101     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
102   }
103   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
104   /* test if we need to exchange messages;
105    * generally speaking, no data needs to be exchanged
106    * when the overlap is 1
107    */
108   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
109   /* we do not have any messages to exchange;
110    * this usually corresponds to overlap 1
111    */
112   if (!reducednrrows){
113     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
114     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
115     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118   nto = 0;
119   /* send sizes and ranks for building a two-sided communication */
120   for (i=0; i<size; i++){
121     if (tosizes_temp[i]) {
122       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
123       tosizes_temp[i] = nto; /* a map from processor to index */
124       toranks[nto++]  = i; /* processor */
125     }
126   }
127   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
128   for (i=0; i<nto; i++){
129     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
130     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
131   }
132   /* send information to other processors */
133   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
134   nrecvrows = 0;
135   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
136   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
137   nrecvrows = 0;
138   for (i=0; i<nfrom; i++){
139     for (j=0; j<fromsizes[2*i]; j++){
140       remote[nrecvrows].rank    = fromranks[i];
141       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
142     }
143   }
144   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
145   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
146   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
147   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
148   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
149   /* message pair <no of is, row>  */
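      /* e.g., if this rank needs the overlap of rows 12 and 40 of is[2], which are
         owned elsewhere, the pairs packed into todata are 2,12 and 2,40
         (illustrative values only) */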
150   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
151   for (i=0; i<nrrows; i++){
152     owner = rrow_ranks[i]; /* processor */
153     j     = tosizes_temp[owner]; /* index */
154     todata[toffsets[j]++] = rrow_isids[i];
155     todata[toffsets[j]++] = remoterows[i];
156   }
157   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
158   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
159   ierr = PetscFree(toffsets);CHKERRQ(ierr);
160   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
162   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
163   /* send the rows belonging to remote processors so that we can get the overlapping data back */
164   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
165   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
166   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
167   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
168   ierr = PetscFree(fromranks);CHKERRQ(ierr);
169   nrecvrows = 0;
170   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
171   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
172   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
173   nrecvrows = 0;
174   for (i=0; i<nto; i++){
175     for (j=0; j<tosizes[2*i]; j++){
176       remote[nrecvrows].rank    = toranks[i];
177       remote[nrecvrows++].index = tosizes[2*i+1]+j;
178     }
179   }
180   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
181   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
182   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
183   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
184   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
185   /* overlap communication and computation */
186   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
187   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
188   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
189   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
190   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
191   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
192   ierr = PetscFree(toranks);CHKERRQ(ierr);
193   ierr = PetscFree(tosizes);CHKERRQ(ierr);
194   ierr = PetscFree(todata);CHKERRQ(ierr);
195   PetscFunctionReturn(0);
196 }
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
201 {
202   PetscInt         *isz,isz_i,i,j,is_id, data_size;
203   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
204   const PetscInt   *indices_i_temp;
205   PetscErrorCode    ierr;
206 
207   PetscFunctionBegin;
208   max_lsize = 0;
209   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
212     max_lsize = lsize>max_lsize ? lsize:max_lsize;
213     isz[i]    = lsize;
214   }
215   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
216   for (i=0; i<nidx; i++){
217     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
218     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
219     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
220     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
221   }
222   /* unpack the received data: IS id, number of entries, then the overlap rows */
223   for (i=0; i<nrecvs; ){
224     is_id      = recvdata[i++];
225     data_size  = recvdata[i++];
226     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
227     isz_i      = isz[is_id];
228     for (j=0; j< data_size; j++){
229       col = recvdata[i++];
230       indices_i[isz_i++] = col;
231     }
232     isz[is_id] = isz_i;
233   }
234   /* remove duplicate entries */
235   for (i=0; i<nidx; i++){
236     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
237     isz_i      = isz[i];
238     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
239     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
240   }
241   ierr = PetscFree(isz);CHKERRQ(ierr);
242   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
249 {
250   PetscLayout       rmap,cmap;
251   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
252   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
253   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
254   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
255   Mat               amat,bmat;
256   PetscMPIInt       rank;
257   PetscBool         done;
258   MPI_Comm          comm;
259   PetscErrorCode    ierr;
260 
261   PetscFunctionBegin;
262   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
263   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
264   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
265   /* Even if the mat is symmetric, we do not exploit the symmetry here */
266   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
267   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
268   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
269   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
270   /* total number of nonzero values is used to estimate the memory usage in the next step */
271   tnz  = ai[an]+bi[bn];
272   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
273   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
274   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
275   /* to find the longest message */
276   max_fszs = 0;
277   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
278   /* better way to estimate the number of nonzeros in the mat??? */
279   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
280   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
281   rows_pos  = 0;
282   totalrows = 0;
283   for (i=0; i<nfrom; i++){
284     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
285     /* group data together */
286     for (j=0; j<fromsizes[2*i]; j+=2){
287       is_id                       = fromrows[rows_pos++];/* no of is */
288       rows_i                      = rows_data[is_id];
289       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
290     }
291     /* estimate the space needed to avoid multiple allocations */
292     for (j=0; j<nidx; j++){
293       indvc_ij = 0;
294       rows_i   = rows_data[j];
295       for (l=0; l<rows_pos_i[j]; l++){
296         row    = rows_i[l]-rstart;
297         start  = ai[row];
298         end    = ai[row+1];
299         for (k=start; k<end; k++){ /* Amat */
300           col = aj[k] + cstart;
301           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
302         }
303         start = bi[row];
304         end   = bi[row+1];
305         for (k=start; k<end; k++) { /* Bmat */
306           col = gcols[bj[k]];
307           indices_tmp[indvc_ij++] = col;
308         }
309       }
310       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
311       indv_counts[i*nidx+j] = indvc_ij;
312       totalrows            += indvc_ij;
313     }
314   }
315   /* message triple <no of is, number of rows, rows> */
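      /* e.g., replying with 3 overlap rows {5,9,17} for is[0] and 2 rows {3,4} for
         is[1] packs sbdata as 0,3,5,9,17, 1,2,3,4 (illustrative values only) */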
316   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
317   totalrows = 0;
318   rows_pos  = 0;
319   /* repeat the grouping pass from above, this time filling the send buffer */
320   for (i=0;i<nfrom;i++){
321     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
322     for (j=0; j<fromsizes[2*i]; j+=2){
323       is_id                       = fromrows[rows_pos++];
324       rows_i                      = rows_data[is_id];
325       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
326     }
327     /* add data  */
328     for (j=0; j<nidx; j++){
329       if (!indv_counts[i*nidx+j]) continue;
330       indvc_ij = 0;
331       sbdata[totalrows++] = j;
332       sbdata[totalrows++] = indv_counts[i*nidx+j];
333       sbsizes[2*i]       += 2;
334       rows_i              = rows_data[j];
335       for (l=0; l<rows_pos_i[j]; l++){
336         row   = rows_i[l]-rstart;
337         start = ai[row];
338         end   = ai[row+1];
339         for (k=start; k<end; k++){ /* Amat */
340           col = aj[k] + cstart;
341           indices_tmp[indvc_ij++] = col;
342         }
343         start = bi[row];
344         end   = bi[row+1];
345         for (k=start; k<end; k++) { /* Bmat */
346           col = gcols[bj[k]];
347           indices_tmp[indvc_ij++] = col;
348         }
349       }
350       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
351       sbsizes[2*i]  += indvc_ij;
352       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
353       totalrows += indvc_ij;
354     }
355   }
356   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
357   for (i=0; i<nfrom; i++){
358     offsets[i+1]   = offsets[i] + sbsizes[2*i];
359     sbsizes[2*i+1] = offsets[i];
360   }
361   ierr = PetscFree(offsets);CHKERRQ(ierr);
362   if (sbrowsizes) *sbrowsizes = sbsizes;
363   if (sbrows) *sbrows = sbdata;
364   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
365   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
366   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
367   PetscFunctionReturn(0);
368 }
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
373 {
374   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
375   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
376   PetscInt          lsize,lsize_tmp,owner;
377   PetscMPIInt       rank;
378   Mat               amat,bmat;
379   PetscBool         done;
380   PetscLayout       cmap,rmap;
381   MPI_Comm          comm;
382   PetscErrorCode    ierr;
383 
384   PetscFunctionBegin;
385   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
386   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
387   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
388   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
389   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
390   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
391   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
392   /* is this a safe way to compute the number of nonzero values? */
393   tnz  = ai[an]+bi[bn];
394   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
395   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
397   /* this is a better way to estimate memory than the old implementation,
398    * which used the global size of the matrix
399    */
400   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
401   for (i=0; i<nidx; i++) {
402     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
403     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
404     lsize_tmp = 0;
405     for (j=0; j<lsize; j++) {
406       owner = -1;
407       row   = indices[j];
408       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
409       if (owner != rank) continue;
410       /* local number */
411       row  -= rstart;
412       start = ai[row];
413       end   = ai[row+1];
414       for (k=start; k<end; k++) { /* Amat */
415         col = aj[k] + cstart;
416         indices_temp[lsize_tmp++] = col;
417       }
418       start = bi[row];
419       end   = bi[row+1];
420       for (k=start; k<end; k++) { /* Bmat */
421         col = gcols[bj[k]];
422         indices_temp[lsize_tmp++] = col;
423       }
424     }
425     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
426     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
427     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
428     ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
429   }
430   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
431   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 
437 /*
438   Sample message format:
439   If a processor A wants processor B to process some elements corresponding
440   to index sets is[1],is[5]
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5]
450   mesg [n]  data(is[1])
451   -----------
452   mesg[n+1]
453   mesg[m]  data(is[5])
454   -----------
455 
456   Notes:
457   nrqs - no of requests sent (or to be sent out)
458   nrqr - no of requests received (which have to be or which have been processed)
459 */
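/*
  A concrete (made-up) instance of the layout above: if processor A asks B to
  process rows {10,11} for is[1] and rows {30,31,32} for is[5], the outgoing
  buffer is

     mesg = { 2,  1,2,  5,3,  10,11,  30,31,32 }

  i.e. a header of 1 + 2*(no of index sets) entries followed by the row data.
*/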
460 #undef __FUNCT__
461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
463 {
464   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
465   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
466   const PetscInt **idx,*idx_i;
467   PetscInt       *n,**data,len;
468 #if defined(PETSC_USE_CTABLE)
469   PetscTable     *table_data,table_data_i;
470   PetscInt       *tdata,tcount,tcount_max;
471 #else
472   PetscInt       *data_i,*d_p;
473 #endif
474   PetscErrorCode ierr;
475   PetscMPIInt    size,rank,tag1,tag2;
476   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
477   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
478   PetscBT        *table;
479   MPI_Comm       comm;
480   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
481   MPI_Status     *s_status,*recv_status;
482   char           *t_p;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
486   size = c->size;
487   rank = c->rank;
488   M    = C->rmap->N;
489 
490   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
491   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
492 
493   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
494 
495   for (i=0; i<imax; i++) {
496     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
497     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
498   }
499 
500   /* evaluate communication - mesg to who, length of mesg, and buffer space
501      required. Based on this, buffers are allocated, and data copied into them  */
502   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
503   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
504   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
505   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
506   for (i=0; i<imax; i++) {
507     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
508     idx_i = idx[i];
509     len   = n[i];
510     for (j=0; j<len; j++) {
511       row = idx_i[j];
512       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
513       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
514       w4[proc]++;
515     }
516     for (j=0; j<size; j++) {
517       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
518     }
519   }
520 
521   nrqs     = 0;              /* no of outgoing messages */
522   msz      = 0;              /* total mesg length (for all procs) */
523   w1[rank] = 0;              /* no mesg sent to itself */
524   w3[rank] = 0;
525   for (i=0; i<size; i++) {
526     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
527   }
528   /* pa - is list of processors to communicate with */
529   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
530   for (i=0,j=0; i<size; i++) {
531     if (w1[i]) {pa[j] = i; j++;}
532   }
533 
534   /* Each message would have a header = 1 + 2*(no of IS) + data */
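      /* e.g., two index sets sending a total of 5 rows to proc j give a message
         of 1 + 2*2 + 5 = 10 entries (illustrative count only) */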
535   for (i=0; i<nrqs; i++) {
536     j      = pa[i];
537     w1[j] += w2[j] + 2*w3[j];
538     msz   += w1[j];
539   }
540 
541   /* Determine the number of messages to expect, their lengths, and the from-ids */
542   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
543   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
544 
545   /* Now post the Irecvs corresponding to these messages */
546   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
547 
548   /* Allocate Memory for outgoing messages */
549   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
550   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
551   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
552 
553   {
554     PetscInt *iptr = tmp,ict  = 0;
555     for (i=0; i<nrqs; i++) {
556       j         = pa[i];
557       iptr     +=  ict;
558       outdat[j] = iptr;
559       ict       = w1[j];
560     }
561   }
562 
563   /* Form the outgoing messages */
564   /* plug in the headers */
565   for (i=0; i<nrqs; i++) {
566     j            = pa[i];
567     outdat[j][0] = 0;
568     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
569     ptr[j]       = outdat[j] + 2*w3[j] + 1;
570   }
571 
572   /* Memory for doing local proc's work */
573   {
574 #if defined(PETSC_USE_CTABLE)
575     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
578     }
579     ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582     }
583 #else
584     PetscInt Mimax = 0,M_BPB_imax = 0;
585     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
586     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
587     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
589     for (i=0; i<imax; i++) {
590       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
591       data[i]  = d_p + M*i;
592     }
593 #endif
594   }
595 
596   /* Parse the IS and update local tables and the outgoing buf with the data */
597   {
598     PetscInt n_i,isz_i,*outdat_j,ctr_j;
599     PetscBT  table_i;
600 
601     for (i=0; i<imax; i++) {
602       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
603       n_i     = n[i];
604       table_i = table[i];
605       idx_i   = idx[i];
606 #if defined(PETSC_USE_CTABLE)
607       table_data_i = table_data[i];
608 #else
609       data_i  = data[i];
610 #endif
611       isz_i   = isz[i];
612       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
613         row  = idx_i[j];
614         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
615         if (proc != rank) { /* copy to the outgoing buffer */
616           ctr[proc]++;
617           *ptr[proc] = row;
618           ptr[proc]++;
619         } else if (!PetscBTLookupSet(table_i,row)) {
620 #if defined(PETSC_USE_CTABLE)
621           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
622 #else
623           data_i[isz_i] = row; /* Update the local table */
624 #endif
625           isz_i++;
626         }
627       }
628       /* Update the headers for the current IS */
629       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
630         if ((ctr_j = ctr[j])) {
631           outdat_j        = outdat[j];
632           k               = ++outdat_j[0];
633           outdat_j[2*k]   = ctr_j;
634           outdat_j[2*k-1] = i;
635         }
636       }
637       isz[i] = isz_i;
638     }
639   }
640 
641   /*  Now  post the sends */
642   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
643   for (i=0; i<nrqs; ++i) {
644     j    = pa[i];
645     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
646   }
647 
648   /* No longer need the original indices */
649   for (i=0; i<imax; ++i) {
650     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
651   }
652   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
653 
654   for (i=0; i<imax; ++i) {
655     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
656   }
657 
658   /* Do Local work */
659 #if defined(PETSC_USE_CTABLE)
660   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
661 #else
662   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
663 #endif
664 
665   /* Receive messages */
666   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
667   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
668 
669   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
670   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
671 
672   /* Phase 1 sends are complete - deallocate buffers */
673   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
674   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
675 
676   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
677   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
678   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
679   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
680   ierr = PetscFree(rbuf);CHKERRQ(ierr);
681 
682 
683   /* Send the data back */
684   /* Do a global reduction to know the buffer space req for incoming messages */
685   {
686     PetscMPIInt *rw1;
687 
688     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
689 
690     for (i=0; i<nrqr; ++i) {
691       proc = recv_status[i].MPI_SOURCE;
692 
693       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
694       rw1[proc] = isz1[i];
695     }
696     ierr = PetscFree(onodes1);CHKERRQ(ierr);
697     ierr = PetscFree(olengths1);CHKERRQ(ierr);
698 
699     /* Determine the number of messages to expect, their lengths, and the from-ids */
700     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
701     ierr = PetscFree(rw1);CHKERRQ(ierr);
702   }
703   /* Now post the Irecvs corresponding to these messages */
704   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
705 
706   /* Now  post the sends */
707   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
708   for (i=0; i<nrqr; ++i) {
709     j    = recv_status[i].MPI_SOURCE;
710     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
711   }
712 
713   /* receive work done on other processors */
714   {
715     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
716     PetscMPIInt idex;
717     PetscBT     table_i;
718     MPI_Status  *status2;
719 
720     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
721     for (i=0; i<nrqs; ++i) {
722       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
723       /* Process the message */
724       rbuf2_i = rbuf2[idex];
725       ct1     = 2*rbuf2_i[0]+1;
726       jmax    = rbuf2[idex][0];
727       for (j=1; j<=jmax; j++) {
728         max     = rbuf2_i[2*j];
729         is_no   = rbuf2_i[2*j-1];
730         isz_i   = isz[is_no];
731         table_i = table[is_no];
732 #if defined(PETSC_USE_CTABLE)
733         table_data_i = table_data[is_no];
734 #else
735         data_i  = data[is_no];
736 #endif
737         for (k=0; k<max; k++,ct1++) {
738           row = rbuf2_i[ct1];
739           if (!PetscBTLookupSet(table_i,row)) {
740 #if defined(PETSC_USE_CTABLE)
741             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
742 #else
743             data_i[isz_i] = row;
744 #endif
745             isz_i++;
746           }
747         }
748         isz[is_no] = isz_i;
749       }
750     }
751 
752     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
753     ierr = PetscFree(status2);CHKERRQ(ierr);
754   }
755 
756 #if defined(PETSC_USE_CTABLE)
757   tcount_max = 0;
758   for (i=0; i<imax; ++i) {
759     table_data_i = table_data[i];
760     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
761     if (tcount_max < tcount) tcount_max = tcount;
762   }
763   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
764 #endif
765 
766   for (i=0; i<imax; ++i) {
767 #if defined(PETSC_USE_CTABLE)
768     PetscTablePosition tpos;
769     table_data_i = table_data[i];
770 
771     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
772     while (tpos) {
773       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
774       tdata[--j] = --k;
775     }
776     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
777 #else
778     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
779 #endif
780   }
781 
782   ierr = PetscFree(onodes2);CHKERRQ(ierr);
783   ierr = PetscFree(olengths2);CHKERRQ(ierr);
784 
785   ierr = PetscFree(pa);CHKERRQ(ierr);
786   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
787   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
790   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
791   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
792   ierr = PetscFree(s_status);CHKERRQ(ierr);
793   ierr = PetscFree(recv_status);CHKERRQ(ierr);
794   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
795   ierr = PetscFree(xdata);CHKERRQ(ierr);
796   ierr = PetscFree(isz1);CHKERRQ(ierr);
797 #if defined(PETSC_USE_CTABLE)
798   for (i=0; i<imax; i++) {
799     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
800   }
801   ierr = PetscFree(table_data);CHKERRQ(ierr);
802   ierr = PetscFree(tdata);CHKERRQ(ierr);
803   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
804 #else
805   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
806 #endif
807   PetscFunctionReturn(0);
808 }
809 
810 #undef __FUNCT__
811 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
812 /*
813    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
814        the work on the local processor.
815 
816      Inputs:
817       C      - MATMPIAIJ matrix;
818       imax - total no of index sets processed at a time;
819       table  - an array of char - size = m bits.
820 
821      Output:
822       isz    - array containing the count of the solution elements corresponding
823                to each index set;
824       data or table_data  - pointer to the solutions
825 */
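/*
   An illustrative step (row and column numbers invented for exposition): if row 7
   is owned locally and already belongs to is[0], row 7 of the diagonal block has
   global columns {7,8}, and the off-diagonal block contributes global columns
   {103,250} through garray, then after this routine is[0] also contains 8, 103
   and 250; 7 is skipped because it is already marked in table[0].
*/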
826 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
827 {
828   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
829   Mat        A  = c->A,B = c->B;
830   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
831   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
832   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
833   PetscBT    table_i;
834 #if defined(PETSC_USE_CTABLE)
835   PetscTable         table_data_i;
836   PetscErrorCode     ierr;
837   PetscTablePosition tpos;
838   PetscInt           tcount,*tdata;
839 #else
840   PetscInt           *data_i;
841 #endif
842 
843   PetscFunctionBegin;
844   rstart = C->rmap->rstart;
845   cstart = C->cmap->rstart;
846   ai     = a->i;
847   aj     = a->j;
848   bi     = b->i;
849   bj     = b->j;
850   garray = c->garray;
851 
852   for (i=0; i<imax; i++) {
853 #if defined(PETSC_USE_CTABLE)
854     /* copy existing entries of table_data_i into tdata[] */
855     table_data_i = table_data[i];
856     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
857     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %D != isz[%D] %D",tcount,i,isz[i]);
858 
859     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
860     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
861     while (tpos) {
862       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
863       tdata[--j] = --row;
864       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %D >= tcount %D",j,tcount);
865     }
866 #else
867     data_i  = data[i];
868 #endif
869     table_i = table[i];
870     isz_i   = isz[i];
871     max     = isz[i];
872 
873     for (j=0; j<max; j++) {
874 #if defined(PETSC_USE_CTABLE)
875       row   = tdata[j] - rstart;
876 #else
877       row   = data_i[j] - rstart;
878 #endif
879       start = ai[row];
880       end   = ai[row+1];
881       for (k=start; k<end; k++) { /* Amat */
882         val = aj[k] + cstart;
883         if (!PetscBTLookupSet(table_i,val)) {
884 #if defined(PETSC_USE_CTABLE)
885           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
886 #else
887           data_i[isz_i] = val;
888 #endif
889           isz_i++;
890         }
891       }
892       start = bi[row];
893       end   = bi[row+1];
894       for (k=start; k<end; k++) { /* Bmat */
895         val = garray[bj[k]];
896         if (!PetscBTLookupSet(table_i,val)) {
897 #if defined(PETSC_USE_CTABLE)
898           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
899 #else
900           data_i[isz_i] = val;
901 #endif
902           isz_i++;
903         }
904       }
905     }
906     isz[i] = isz_i;
907 
908 #if defined(PETSC_USE_CTABLE)
909     ierr = PetscFree(tdata);CHKERRQ(ierr);
910 #endif
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 #undef __FUNCT__
916 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
917 /*
918       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
919          and return the output
920 
921          Input:
922            C    - the matrix
923            nrqr - no of messages being processed.
924            rbuf - an array of pointers to the received requests
925 
926          Output:
927            xdata - array of messages to be sent back
928            isz1  - size of each message
929 
930   For better efficiency perhaps we should malloc separately each xdata[i],
931 then if a remalloc is required we need only copy the data for that one row
932 rather than all previous rows as it is now where a single large chunk of
933 memory is used.
934 
935 */
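/*
   The reply buffer xdata[i] mirrors the request layout (see the sample message
   format above MatIncreaseOverlap_MPIAIJ_Once()): xdata[i][0] is the number of
   index sets, followed by (index set id, count) pairs and then the expanded row
   lists.  For example (made-up values), a request concerning is[2] whose overlap
   turns out to be rows {3,4,9} is answered with xdata[i] = {1, 2,3, 3,4,9}.
*/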
936 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
937 {
938   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
939   Mat            A  = c->A,B = c->B;
940   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
941   PetscErrorCode ierr;
942   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
943   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
944   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
945   PetscInt       *rbuf_i,kmax,rbuf_0;
946   PetscBT        xtable;
947 
948   PetscFunctionBegin;
949   m      = C->rmap->N;
950   rstart = C->rmap->rstart;
951   cstart = C->cmap->rstart;
952   ai     = a->i;
953   aj     = a->j;
954   bi     = b->i;
955   bj     = b->j;
956   garray = c->garray;
957 
958 
959   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
960     rbuf_i =  rbuf[i];
961     rbuf_0 =  rbuf_i[0];
962     ct    += rbuf_0;
963     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
964   }
965 
966   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
967   else max1 = 1;
968   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
969   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
970   ++no_malloc;
971   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
972   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
973 
974   ct3 = 0;
975   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
976     rbuf_i =  rbuf[i];
977     rbuf_0 =  rbuf_i[0];
978     ct1    =  2*rbuf_0+1;
979     ct2    =  ct1;
980     ct3   += ct1;
981     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
982       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
983       oct2 = ct2;
984       kmax = rbuf_i[2*j];
985       for (k=0; k<kmax; k++,ct1++) {
986         row = rbuf_i[ct1];
987         if (!PetscBTLookupSet(xtable,row)) {
988           if (!(ct3 < mem_estimate)) {
989             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
990             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
991             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
992             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
993             xdata[0]     = tmp;
994             mem_estimate = new_estimate; ++no_malloc;
995             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
996           }
997           xdata[i][ct2++] = row;
998           ct3++;
999         }
1000       }
1001       for (k=oct2,max2=ct2; k<max2; k++) {
1002         row   = xdata[i][k] - rstart;
1003         start = ai[row];
1004         end   = ai[row+1];
1005         for (l=start; l<end; l++) {
1006           val = aj[l] + cstart;
1007           if (!PetscBTLookupSet(xtable,val)) {
1008             if (!(ct3 < mem_estimate)) {
1009               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1010               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1011               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1012               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1013               xdata[0]     = tmp;
1014               mem_estimate = new_estimate; ++no_malloc;
1015               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1016             }
1017             xdata[i][ct2++] = val;
1018             ct3++;
1019           }
1020         }
1021         start = bi[row];
1022         end   = bi[row+1];
1023         for (l=start; l<end; l++) {
1024           val = garray[bj[l]];
1025           if (!PetscBTLookupSet(xtable,val)) {
1026             if (!(ct3 < mem_estimate)) {
1027               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1028               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1029               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1030               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1031               xdata[0]     = tmp;
1032               mem_estimate = new_estimate; ++no_malloc;
1033               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1034             }
1035             xdata[i][ct2++] = val;
1036             ct3++;
1037           }
1038         }
1039       }
1040       /* Update the header*/
1041       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1042       xdata[i][2*j-1] = rbuf_i[2*j-1];
1043     }
1044     xdata[i][0] = rbuf_0;
1045     xdata[i+1]  = xdata[i] + ct2;
1046     isz1[i]     = ct2; /* size of each message */
1047   }
1048   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1049   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 /* -------------------------------------------------------------------------*/
1053 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1054 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1055 /*
1056     Every processor gets the entire matrix
1057 */
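/*
   A minimal calling sketch (hypothetical; within PETSc this routine is normally
   reached through the submatrix-gathering paths rather than called directly):

      Mat *Bseq;
      ierr = MatGetSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);CHKERRQ(ierr);

   Afterwards every rank holds the entire matrix in the sequential matrix Bseq[0];
   passing MAT_DO_NOT_GET_VALUES gathers only the nonzero structure.
*/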
1058 #undef __FUNCT__
1059 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
1060 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1061 {
1062   Mat            B;
1063   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1064   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1065   PetscErrorCode ierr;
1066   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1067   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1068   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1069   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1070 
1071   PetscFunctionBegin;
1072   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1073   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1074 
1075   if (scall == MAT_INITIAL_MATRIX) {
1076     /* ----------------------------------------------------------------
1077          Tell every processor the number of nonzeros per row
1078     */
1079     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1080     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1081       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1082     }
1083     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1084     for (i=0; i<size; i++) {
1085       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1086       displs[i]     = A->rmap->range[i];
1087     }
1088 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1089     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1090 #else
1091     sendcount = A->rmap->rend - A->rmap->rstart;
1092     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1093 #endif
1094     /* ---------------------------------------------------------------
1095          Create the sequential matrix of the same type as the local block diagonal
1096     */
1097     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1098     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1099     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1100     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1101     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1102     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1103     **Bin = B;
1104     b     = (Mat_SeqAIJ*)B->data;
1105 
1106     /*--------------------------------------------------------------------
1107        Copy my part of matrix column indices over
1108     */
1109     sendcount  = ad->nz + bd->nz;
1110     jsendbuf   = b->j + b->i[rstarts[rank]];
1111     a_jsendbuf = ad->j;
1112     b_jsendbuf = bd->j;
1113     n          = A->rmap->rend - A->rmap->rstart;
1114     cnt        = 0;
1115     for (i=0; i<n; i++) {
1116 
1117       /* put in lower diagonal portion */
1118       m = bd->i[i+1] - bd->i[i];
1119       while (m > 0) {
1120         /* is it above diagonal (in bd (compressed) numbering) */
1121         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1122         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1123         m--;
1124       }
1125 
1126       /* put in diagonal portion */
1127       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1128         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1129       }
1130 
1131       /* put in upper diagonal portion */
1132       while (m-- > 0) {
1133         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134       }
1135     }
1136     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1137 
1138     /*--------------------------------------------------------------------
1139        Gather all column indices to all processors
1140     */
1141     for (i=0; i<size; i++) {
1142       recvcounts[i] = 0;
1143       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1144         recvcounts[i] += lens[j];
1145       }
1146     }
1147     displs[0] = 0;
1148     for (i=1; i<size; i++) {
1149       displs[i] = displs[i-1] + recvcounts[i-1];
1150     }
1151 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1152     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1153 #else
1154     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1155 #endif
1156     /*--------------------------------------------------------------------
1157         Assemble the matrix into usable form (note numerical values not yet set)
1158     */
1159     /* set the b->ilen (length of each row) values */
1160     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1161     /* set the b->i indices */
1162     b->i[0] = 0;
1163     for (i=1; i<=A->rmap->N; i++) {
1164       b->i[i] = b->i[i-1] + lens[i-1];
1165     }
1166     ierr = PetscFree(lens);CHKERRQ(ierr);
1167     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1168     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1169 
1170   } else {
1171     B = **Bin;
1172     b = (Mat_SeqAIJ*)B->data;
1173   }
1174 
1175   /*--------------------------------------------------------------------
1176        Copy my part of matrix numerical values into the values location
1177   */
1178   if (flag == MAT_GET_VALUES) {
1179     sendcount = ad->nz + bd->nz;
1180     sendbuf   = b->a + b->i[rstarts[rank]];
1181     a_sendbuf = ad->a;
1182     b_sendbuf = bd->a;
1183     b_sendj   = bd->j;
1184     n         = A->rmap->rend - A->rmap->rstart;
1185     cnt       = 0;
1186     for (i=0; i<n; i++) {
1187 
1188       /* put in lower diagonal portion */
1189       m = bd->i[i+1] - bd->i[i];
1190       while (m > 0) {
1191         /* is it above diagonal (in bd (compressed) numbering) */
1192         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1193         sendbuf[cnt++] = *b_sendbuf++;
1194         m--;
1195         b_sendj++;
1196       }
1197 
1198       /* put in diagonal portion */
1199       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1200         sendbuf[cnt++] = *a_sendbuf++;
1201       }
1202 
1203       /* put in upper diagonal portion */
1204       while (m-- > 0) {
1205         sendbuf[cnt++] = *b_sendbuf++;
1206         b_sendj++;
1207       }
1208     }
1209     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1210 
1211     /* -----------------------------------------------------------------
1212        Gather all numerical values to all processors
1213     */
1214     if (!recvcounts) {
1215       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1216     }
1217     for (i=0; i<size; i++) {
1218       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1219     }
1220     displs[0] = 0;
1221     for (i=1; i<size; i++) {
1222       displs[i] = displs[i-1] + recvcounts[i-1];
1223     }
1224     recvbuf = b->a;
1225 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1226     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1227 #else
1228     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1229 #endif
1230   }  /* endof (flag == MAT_GET_VALUES) */
1231   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1232 
1233   if (A->symmetric) {
1234     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1235   } else if (A->hermitian) {
1236     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1237   } else if (A->structurally_symmetric) {
1238     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1239   }
1240   PetscFunctionReturn(0);
1241 }
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
1244 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1245 {
1246   PetscErrorCode ierr;
1247   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1248   Mat_SubMat     *submatj = c->submatis1;
1249   PetscInt       i;
1250 
1251   PetscFunctionBegin;
1252   ierr = submatj->destroy(C);CHKERRQ(ierr);
1253 
1254   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1255 
1256   for (i=0; i<submatj->nrqr; ++i) {
1257     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1258   }
1259   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1260 
1261   ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1262   ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1263 
1264   for (i=0; i<submatj->nrqs; ++i) {
1265     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1266   }
1267   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1268   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1269 
1270   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1271 
1272 #if defined(PETSC_USE_CTABLE)
1273   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1274   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1275   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1276 #else
1277   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1278 #endif
1279 
1280   if (!submatj->allcolumns) {
1281 #if defined(PETSC_USE_CTABLE)
1282     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1283 #else
1284     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1285 #endif
1286   }
1287 
1288   ierr = PetscFree(submatj);CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 #undef __FUNCT__
1293 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
1294 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1295 {
1296   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1297   Mat            submat,A = c->A,B = c->B;
1298   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1299   PetscInt       *ai = a->i,*bi = b->i,nzA,nzB;
1300   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1301   const PetscInt *icol,*irow;
1302   PetscInt       nrow,ncol,start;
1303   PetscErrorCode ierr;
1304   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1305   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1306   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1307   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1308   PetscInt       *lens,ncols,*cols,Crow;
1309 #if defined(PETSC_USE_CTABLE)
1310   PetscTable     cmap,rmap;
1311   PetscInt       *cmap_loc,*rmap_loc;
1312 #else
1313   PetscInt       *cmap,*rmap;
1314 #endif
1315   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1316   PetscInt       *cworkB,lwrite,*bj = b->j,*subcols,*row2proc;
1317   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1318   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1319   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1320   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1321   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1322   MPI_Comm       comm;
1323   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1324   PetscMPIInt    *onodes1,*olengths1,idex,end;
1325   Mat_SubMat     *smatis1;
1326 #if defined(PETSC_USE_LOG)
1327   PetscLogStage stage[5];
1328 #endif
1329   PetscBool      isrowsorted;
1330 
1331   PetscFunctionBegin;
1332   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1333 
1334   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1335   size = c->size;
1336   rank = c->rank;
1337 
1338   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1339   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1340 
1341   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1342   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1343   if (allcolumns) {
1344     icol = NULL;
1345     ncol = C->cmap->N;
1346   } else {
1347     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1348     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1349   }
1350 
1351   if (scall == MAT_INITIAL_MATRIX) {
1352     PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp;
1353 
1354     ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr);
1355     ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr);
1356     ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr);
1357     ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr);
1358     ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr);
1359 
1360     ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr);
1361 
1362     /* Get some new tags to keep the communication clean */
1363     tag1 = ((PetscObject)C)->tag;
1364     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1365     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1366 
1367     /* evaluate communication - mesg to who, length of mesg, and buffer space
1368      required. Based on this, buffers are allocated, and data copied into them */
1369     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1370     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1371 
1372     /* w1[proc] = num of rows owned by proc */
1373     proc = 0;
1374     for (j=0; j<nrow; j++) {
1375       row  = irow[j]; /* sorted! */
1376       while (row >= C->rmap->range[proc+1]) proc++;
1377       w1[proc]++;
1378       row2proc[j] = proc; /* map row index to proc */
1379     }
1380 
1381     nrqs     = 0;              /* num of outgoing messages */
1382     msz      = 0;              /* total mesg length (for all procs) */
1383     w1[rank] = 0;              /* num mesg sent to self */
1384     for (proc=0; proc<size; proc++) {
1385       if (w1[proc])  { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */
1386     }
1387 
1388     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1389     for (proc=0,j=0; proc<size; proc++) {
1390       if (w1[proc]) { pa[j++] = proc;}
1391     }
1392 
1393     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1394     for (i=0; i<nrqs; i++) {
1395       proc      = pa[i];
1396       w1[proc] += 3;
1397       msz      += w1[proc];
1398     }
1399     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1400 
1401     /* Determine nrqr, the number of messages to expect, their lengths, and the ranks they come from */
1402     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1403     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1404 
1405     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1406        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1407     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1408 
1409     /* Now post the Irecvs corresponding to these messages */
1410     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1411 
1412     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1413     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1414 
1415     /* Allocate Memory for outgoing messages */
1416     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1417     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1418     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1419 
1420     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1421     iptr = tmp;
1422     for (i=0; i<nrqs; i++) {
1423       proc     = pa[i];
1424       sbuf1[proc] = iptr;
1425       iptr += w1[proc];
1426     }
1427 
1428     /* Form the outgoing messages */
1429     /* Initialize the header space */
1430     for (i=0; i<nrqs; i++) {
1431       proc           = pa[i];
1432       ierr           = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1433       ptr[proc]      = sbuf1[proc] + 3;
1434     }
1435 
1436     /* Parse the isrow and copy data into outbuf */
1437     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1438     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1439       proc = row2proc[j];
1440       if (proc != rank) { /* copy to the outgoing buf*/
1441         *ptr[proc] = irow[j];
1442         ctr[proc]++; ptr[proc]++;
1443       }
1444     }
1445 
1446     /* Update the headers for the current IS */
1447     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1448       if ((ctr_j = ctr[j])) {
1449         sbuf1_j        = sbuf1[j];
1450         k              = ++sbuf1_j[0];
1451         sbuf1_j[2*k]   = ctr_j;
1452         sbuf1_j[2*k-1] = 0;
1453       }
1454     }
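       /* Each outgoing message now has the layout [nis, is_no, nrows, row indices...]; since this
          routine handles a single IS, nis = 1 and is_no = 0.  For example, requesting global rows
          {5,9} from a process produces the message [1, 0, 2, 5, 9]. */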
1455 
1456     /* Now post the sends */
1457     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1458     for (i=0; i<nrqs; ++i) {
1459       proc = pa[i];
1460       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1461     }
1462 
1463     /* Post Receives to capture the buffer size */
1464     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1465     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1466 
1467     rbuf2[0] = tmp + msz;
1468     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1469 
1470     for (i=0; i<nrqs; ++i) {
1471       proc = pa[i];
1472       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1473     }
1474 
1475     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1476 
1477     /* Send to other procs the buf size they should allocate */
1478     /* Receive messages*/
1479     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1480     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1481 
1482     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1483     for (i=0; i<nrqr; ++i) {
1484       req_size[i] = 0;
1485       rbuf1_i        = rbuf1[i];
1486       start          = 2*rbuf1_i[0] + 1;
1487       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1488       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1489       sbuf2_i        = sbuf2[i];
1490       for (j=start; j<end; j++) {
1491         k            = rbuf1_i[j] - rstart;
1492         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1493         sbuf2_i[j]   = ncols;
1494         req_size[i] += ncols;
1495       }
1496       req_source1[i] = r_status1[i].MPI_SOURCE;
1497 
1498       /* form the header */
1499       sbuf2_i[0] = req_size[i];
1500       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1501 
1502       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1503     }
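       /* Each reply sbuf2[i] echoes the request header and carries, in place of each requested row
          index, the number of nonzeros of that row; sbuf2_i[0] holds the total count req_size[i].
          This tells the requester how much space to allocate for the column-index and value messages. */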
1504 
1505     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1506     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1507 
1508     /* rbuf2 is received; post receives for the column indices a->j */
1509     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1510 
1511     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1512     for (i=0; i<nrqs; ++i) {
1513       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1514       req_source2[i] = r_status2[i].MPI_SOURCE;
1515       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1516     }
1517 
1518     /* Wait on sends1 and sends2 */
1519     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1520     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1521     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1522     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1523 
1524     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1525     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1526 
1527     ierr = PetscLogStagePop();CHKERRQ(ierr);
1528     ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr);
1529 
1530     /* Now allocate sending buffers for a->j, and send them off */
1531     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1532     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1533     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1534     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1535 
1536     for (i=0; i<nrqr; i++) { /* for each requested message */
1537       rbuf1_i   = rbuf1[i];
1538       sbuf_aj_i = sbuf_aj[i];
1539       ct1       = 2*rbuf1_i[0] + 1;
1540       ct2       = 0;
1541       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1542 
1543       kmax = rbuf1[i][2];
1544       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1545         row    = rbuf1_i[ct1] - rstart;
1546         nzA    = ai[row+1] - ai[row];     nzB = bi[row+1] - bi[row];
1547         ncols  = nzA + nzB;
1548         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1549 
1550         /* load the column indices for this row into cols*/
1551         cols = sbuf_aj_i + ct2;
1552 
1553         lwrite = 0;
1554         for (l=0; l<nzB; l++) {
1555           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1556         }
1557         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1558         for (l=0; l<nzB; l++) {
1559           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1560         }
1561 
1562         ct2 += ncols;
1563       }
1564       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1565     }
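       /* The three passes above emit each row's column indices in increasing global order:
          off-diagonal columns below cstart, the diagonal block (shifted by cstart to global
          numbering), then off-diagonal columns at or beyond cend.  Since garray (bmap) is sorted,
          each packed row is fully sorted. */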
1566 
1567     /* create column map (cmap): global col of C -> local col of submat */
1568 #if defined(PETSC_USE_CTABLE)
1569     if (!allcolumns) {
1570       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1571       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1572       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1573         if (icol[j] >= cstart && icol[j] <cend) {
1574           cmap_loc[icol[j] - cstart] = j+1;
1575         } else { /* use PetscTable for non-local col indices */
1576           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1577         }
1578       }
1579     } else {
1580       cmap     = NULL;
1581       cmap_loc = NULL;
1582     }
1583     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1584 #else
1585     if (!allcolumns) {
1586       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1587       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1588     } else {
1589       cmap = NULL;
1590     }
1591 #endif
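       /* cmap maps a global column of C to 1 + its column index in submat, with 0 meaning "column
          not selected".  In the PETSC_USE_CTABLE case the locally owned range [cstart,cend) is kept
          in the plain array cmap_loc and only off-process columns go into the hash table, which
          keeps the table small. */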
1592 
1593     /* Create lens for MatSeqAIJSetPreallocation() */
1594     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1595 
1596     /* Compute lens from local part of C */
1597     for (j=0; j<nrow; j++) {
1598       row  = irow[j];
1599       proc = row2proc[j];
1600       if (proc == rank) {
1601         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1602         if (!allcolumns) {
1603           for (k=0; k<ncols; k++) {
1604 #if defined(PETSC_USE_CTABLE)
1605             if (cols[k] >= cstart && cols[k] <cend) {
1606               tcol = cmap_loc[cols[k] - cstart];
1607             } else {
1608               ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1609             }
1610 #else
1611             tcol = cmap[cols[k]];
1612 #endif
1613             if (tcol) lens[j]++;
1614           }
1615         } else { /* allcolumns */
1616           lens[j] = ncols;
1617         }
1618         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1619       }
1620     }
1621 
1622     /* Create row map (rmap): global row of C -> local row of submat */
1623 #if defined(PETSC_USE_CTABLE)
1624     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1625     for (j=0; j<nrow; j++) {
1626       row  = irow[j];
1627       proc = row2proc[j];
1628       if (proc == rank) { /* a local row */
1629         rmap_loc[row - rstart] = j;
1630       } else {
1631         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1632       }
1633     }
1634 #else
1635     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1636     for (j=0; j<nrow; j++) {
1637       rmap[irow[j]] = j;
1638     }
1639 #endif
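       /* rmap plays the same role for rows: global row of C -> local row of submat.  Note the
          convention differs slightly: the hash table stores the submat row shifted by one (hence
          the row-- after PetscTableFind below), while rmap_loc and the plain-array rmap store it
          directly. */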
1640 
1641     /* Update lens from offproc data */
1642     /* recv a->j is done */
1643     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1644     for (i=0; i<nrqs; i++) {
1645       proc    = pa[i];
1646       sbuf1_i = sbuf1[proc];
1647       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1648       ct1     = 2 + 1;
1649       ct2     = 0;
1650       rbuf2_i = rbuf2[i]; /* received length of C->j */
1651       rbuf3_i = rbuf3[i]; /* received C->j */
1652 
1653       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1654       max1   = sbuf1_i[2];
1655       for (k=0; k<max1; k++,ct1++) {
1656 #if defined(PETSC_USE_CTABLE)
1657         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1658         row--;
1659         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1660 #else
1661         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1662 #endif
1663         /* Now, store row index of submat in sbuf1_i[ct1] */
1664         sbuf1_i[ct1] = row;
1665 
1666         nnz = rbuf2_i[ct1];
1667         if (!allcolumns) {
1668           for (l=0; l<nnz; l++,ct2++) {
1669 #if defined(PETSC_USE_CTABLE)
1670             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1671               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1672             } else {
1673               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1674             }
1675 #else
1676             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1677 #endif
1678             if (tcol) {
1679               lens[row]++;
1680 
1681               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1682                For reuse, we replace received C->j with index that should be inserted to submat */
1683               /* rbuf3_i[ct3++] = ct2; ??? */
1684             }
1685           }
1686         } else { /* allcolumns */
1687           lens[row] += nnz;
1688         }
1689       }
1690     }
1691     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1692     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1693 
1694     /* Create the submatrices */
1695     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1696     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1697 
1698     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1699     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1700     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1701     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1702     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1703 
1704     /* create struct Mat_SubMat and attach it to submat */
1705     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1706     subc = (Mat_SeqAIJ*)submat->data;
1707     subc->submatis1 = smatis1;
1708 
1709     smatis1->nrqs        = nrqs;
1710     smatis1->nrqr        = nrqr;
1711     smatis1->rbuf1       = rbuf1;
1712     smatis1->rbuf2       = rbuf2;
1713     smatis1->rbuf3       = rbuf3;
1714     smatis1->sbuf2       = sbuf2;
1715     smatis1->req_source2 = req_source2;
1716 
1717     smatis1->sbuf1       = sbuf1;
1718     smatis1->ptr         = ptr;
1719     smatis1->tmp         = tmp;
1720     smatis1->ctr         = ctr;
1721 
1722     smatis1->pa           = pa;
1723     smatis1->req_size     = req_size;
1724     smatis1->req_source1  = req_source1;
1725 
1726     smatis1->allcolumns  = allcolumns;
1727     smatis1->row2proc    = row2proc;
1728     smatis1->rmap        = rmap;
1729     smatis1->cmap        = cmap;
1730 #if defined(PETSC_USE_CTABLE)
1731     smatis1->rmap_loc    = rmap_loc;
1732     smatis1->cmap_loc    = cmap_loc;
1733 #endif
1734 
1735     smatis1->destroy     = submat->ops->destroy;
1736     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1737     submat->factortype   = C->factortype;
1738 
1739     ierr = PetscLogStagePop();CHKERRQ(ierr);
1740     ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr);
1741   } else { /* scall == MAT_REUSE_MATRIX */
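     /* All communication metadata from the MAT_INITIAL_MATRIX call (request lists, receive buffers,
        row/column maps) was stashed in the Mat_SubMat attached to the submatrix, so only the
        numerical values are exchanged below. */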
1742     submat = submats[0];
1743     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1744 
1745     subc    = (Mat_SeqAIJ*)submat->data;
1746     smatis1 = subc->submatis1;
1747     nrqs        = smatis1->nrqs;
1748     nrqr        = smatis1->nrqr;
1749     rbuf1       = smatis1->rbuf1;
1750     rbuf2       = smatis1->rbuf2;
1751     rbuf3       = smatis1->rbuf3;
1752     req_source2 = smatis1->req_source2;
1753 
1754     sbuf1     = smatis1->sbuf1;
1755     sbuf2     = smatis1->sbuf2;
1756     ptr       = smatis1->ptr;
1757     tmp       = smatis1->tmp;
1758     ctr       = smatis1->ctr;
1759 
1760     pa         = smatis1->pa;
1761     req_size   = smatis1->req_size;
1762     req_source1 = smatis1->req_source1;
1763 
1764     allcolumns = smatis1->allcolumns;
1765     row2proc   = smatis1->row2proc;
1766     rmap       = smatis1->rmap;
1767     cmap       = smatis1->cmap;
1768 #if defined(PETSC_USE_CTABLE)
1769     rmap_loc   = smatis1->rmap_loc;
1770     cmap_loc   = smatis1->cmap_loc;
1771 #endif
1772   }
1773 
1774   /* Post recv matrix values */
1775   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1776   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1777   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1778   for (i=0; i<nrqs; ++i) {
1779     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1780     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1781   }
1782 
1783   /* Allocate sending buffers for a->a, and send them off */
1784   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1785   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1786   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1787   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
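     /* Pack the numerical values in exactly the same column order used for sbuf_aj above
        (off-diagonal columns < cstart, then the diagonal block, then off-diagonal columns >= cend),
        so that on the receiving side rbuf3[i][k] and rbuf4[i][k] refer to the same matrix entry. */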
1788 
1789   for (i=0; i<nrqr; i++) {
1790     rbuf1_i   = rbuf1[i];
1791     sbuf_aa_i = sbuf_aa[i];
1792     ct1       = 2*rbuf1_i[0]+1;
1793     ct2       = 0;
1794     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1795 
1796     kmax = rbuf1_i[2];
1797     for (k=0; k<kmax; k++,ct1++) {
1798       row = rbuf1_i[ct1] - rstart;
1799       nzA = ai[row+1] - ai[row];
1800       nzB = bi[row+1] - bi[row];
1801       ncols  = nzA + nzB;
1802       cworkB = bj + bi[row];
1803       vworkA = a_a + ai[row];
1804       vworkB = b_a + bi[row];
1805 
1806       /* load the column values for this row into vals*/
1807       vals = sbuf_aa_i + ct2;
1808 
1809       lwrite = 0;
1810       for (l=0; l<nzB; l++) {
1811         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1812       }
1813       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1814       for (l=0; l<nzB; l++) {
1815         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1816       }
1817 
1818       ct2 += ncols;
1819     }
1820     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1821   }
1822 
1823   if (scall == MAT_INITIAL_MATRIX) {
1824    ierr = PetscLogStagePop();CHKERRQ(ierr);
1825    ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr);
1826   }
1827 
1828   /* Assemble submat */
1829   if (!allcolumns) {
1830     ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr);
1831   }
1832   /* First assemble the local rows */
1833   for (j=0; j<nrow; j++) {
1834     row  = irow[j];
1835     proc = row2proc[j];
1836     if (proc == rank) {
1837       Crow = row;
1838 #if defined(PETSC_USE_CTABLE)
1839       row = rmap_loc[Crow - rstart]; /* row index of submat */
1840 #else
1841       row = rmap[row];
1842 #endif
1843       ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1844 
1845       if (!allcolumns) {
1846 #if defined(PETSC_USE_CTABLE)
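          /* MatGetRow_MPIAIJ returns the columns sorted by global index, so the scan is split into
             three ranges: columns below cstart and at/after cend are looked up in the hash table
             cmap, while the owned range [cstart,cend) uses the direct array cmap_loc. */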
1847         i = 0;
1848         k = 0;
1849         while (cols[k] < cstart) {
1850           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1851           if (tcol) {
1852             subcols[i]   = --tcol;
1853             subvals[i++] = vals[k];
1854           }
1855           k++;
1856         }
1857         while (cols[k] < cend && k<ncols) {
1858           tcol = cmap_loc[cols[k]-cstart];
1859           if (tcol) {
1860             subcols[i]   = --tcol;
1861             subvals[i++] = vals[k];
1862           }
1863           k++;
1864         }
1865         while (k<ncols) {
1866           ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
1867           if (tcol) {
1868             subcols[i]   = --tcol;
1869             subvals[i++] = vals[k];
1870           }
1871           k++;
1872         }
1873 #else
1874         i = 0;
1875         for (k=0; k<ncols; k++) {
1876           tcol = cmap[cols[k]];
1877           if (tcol) {
1878             subcols[i]   = --tcol;
1879             subvals[i++] = vals[k];
1880           }
1881         }
1882 #endif
1883         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1884       } else { /* allcolumns */
1885         ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
1886       }
1887       ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
1888     }
1889   }
1890 
1891   if (scall == MAT_INITIAL_MATRIX) {
1892    ierr = PetscLogStagePop();CHKERRQ(ierr);
1893    ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr);
1894   }
1895 
1896   /* Now assemble the off-proc rows */
1897   for (i=0; i<nrqs; i++) { /* for each requested message */
1898     /* recv values from other processes */
1899     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1900     proc    = pa[idex];
1901     sbuf1_i = sbuf1[proc];
1902     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1903     ct1     = 2 + 1;
1904     ct2     = 0; /* count of received C->j */
1905     ct3     = 0; /* count of received C->j that will be inserted into submat */
1906     rbuf2_i = rbuf2[idex]; /* received lengths of C->j rows from the other process */
1907     rbuf3_i = rbuf3[idex]; /* received C->j from the other process */
1908     rbuf4_i = rbuf4[idex]; /* received C->a from the other process */
1909 
1910     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1911     max1 = sbuf1_i[2];             /* num of rows */
1912     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1913       row = sbuf1_i[ct1]; /* row index of submat */
1914       if (!allcolumns) {
1915         idex = 0;
1916         if (scall == MAT_INITIAL_MATRIX) {
1917           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1918           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1919 #if defined(PETSC_USE_CTABLE)
1920             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1921               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1922             } else {
1923               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1924             }
1925 #else
1926             tcol = cmap[rbuf3_i[ct2]];
1927 #endif
1928             if (tcol) {
1929               subcols[idex]   = --tcol;
1930               subvals[idex++] = rbuf4_i[ct2];
1931 
1932               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1933                For reuse, we replace received C->j with index that should be inserted to submat */
1934               rbuf3_i[ct3++] = ct2;
1935             }
1936           }
1937           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1938 
1939         } else { /* scall == MAT_REUSE_MATRIX */
1940           submat = submats[0];
1941           subc   = (Mat_SeqAIJ*)submat->data;
1942 
1943           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1944           for (l=0; l<nnz; l++) {
1945             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1946             subvals[idex++] = rbuf4_i[ct2];
1947           }
1948 
1949           bj = subc->j + subc->i[row];
1950           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1951         }
1952       } else { /* allcolumns */
1953         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1954         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1955         ct2 += nnz;
1956       }
1957     }
1958   }
1959 
1960   /* sends of a->a are done */
1961   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1962   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1963 
1964   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1965   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1966   submats[0] = submat;
1967 
1968   /* Restore the indices */
1969   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1970   if (!allcolumns) {
1971     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1972   }
1973 
1974   /* Destroy allocated memory */
1975   for (i=0; i<nrqs; ++i) {
1976     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1977   }
1978   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1979   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1980   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1981 
1982   if (!allcolumns) {
1983     ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr);
1984   }
1985 
1986   if (scall == MAT_INITIAL_MATRIX) {
1987     ierr = PetscLogStagePop();CHKERRQ(ierr);
1988 
1989     ierr = PetscFree(lens);CHKERRQ(ierr);
1990     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1991     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1992   }
1993   PetscFunctionReturn(0);
1994 }
1995 
1996 #undef __FUNCT__
1997 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
1998 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1999 {
2000   PetscErrorCode ierr;
2001   PetscInt       ncol;
2002   PetscBool      colflag,allcolumns=PETSC_FALSE;
2003 
2004   PetscFunctionBegin;
2005   /* Allocate memory to hold all the submatrices */
2006   if (scall != MAT_REUSE_MATRIX) {
2007     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2008   }
2009 
2010   /* Check for special case: each processor gets entire matrix columns */
2011   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2012   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2013   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2014 
2015   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2016   PetscFunctionReturn(0);
2017 }
2018 
2019 #undef __FUNCT__
2020 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
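/*
   Top-level driver behind MatGetSubMatrices() for MPIAIJ matrices.  A minimal user-level usage
   sketch (assuming the index sets isrow[]/iscol[] have already been created on each process):

     Mat *submats;
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     ...use the sequential matrices submats[0..n-1]...
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_REUSE_MATRIX,&submats);CHKERRQ(ierr);  (values refreshed, structure reused)
     ierr = MatDestroyMatrices(n,&submats);CHKERRQ(ierr);

   The extraction is performed in stages (see nmax below) so that the per-column work arrays stay
   bounded in memory.
*/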
2021 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2022 {
2023   PetscErrorCode ierr;
2024   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2025   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2026 
2027   PetscFunctionBegin;
2028   /* Check for special case: each processor gets entire matrix */
2029   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2030     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2031     PetscFunctionReturn(0);
2032   }
2033 
2034   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2035     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2036     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2037     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2038     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2039     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2040       wantallmatrix = PETSC_TRUE;
2041 
2042       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2043     }
2044   }
2045   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2046   if (twantallmatrix) {
2047     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2048     PetscFunctionReturn(0);
2049   }
2050 
2051   /* Allocate memory to hold all the submatrices */
2052   if (scall != MAT_REUSE_MATRIX) {
2053     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2054   }
2055 
2056   /* Check for special case: each processor gets entire matrix columns */
2057   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2058   for (i=0; i<ismax; i++) {
2059     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2060     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2061     if (colflag && ncol == C->cmap->N) {
2062       allcolumns[i] = PETSC_TRUE;
2063     } else {
2064       allcolumns[i] = PETSC_FALSE;
2065     }
2066   }
2067 
2068   /* Determine the number of stages through which submatrices are done */
2069   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2070 
2071   /*
2072      Each stage will extract nmax submatrices.
2073      nmax is determined by the matrix column dimension.
2074      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2075   */
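  /* For example, with C->cmap->N = 1,000,000 columns and 8-byte PetscInts this gives
     nmax = 20e6 / 8e6 = 2, i.e. at most two submatrices are extracted per stage. */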
2076   if (!nmax) nmax = 1;
2077   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2078 
2079   /* Make sure every processor loops through the nstages */
2080   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2081 
2082   for (i=0,pos=0; i<nstages; i++) {
2083     if (pos+nmax <= ismax) max_no = nmax;
2084     else if (pos == ismax) max_no = 0;
2085     else                   max_no = ismax-pos;
2086     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2087     pos += max_no;
2088   }
2089 
2090   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2091   PetscFunctionReturn(0);
2092 }
2093 
2094 /* -------------------------------------------------------------------------*/
2095 #undef __FUNCT__
2096 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
2097 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2098 {
2099   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2100   Mat            A  = c->A;
2101   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2102   const PetscInt **icol,**irow;
2103   PetscInt       *nrow,*ncol,start;
2104   PetscErrorCode ierr;
2105   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2106   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2107   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2108   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2109   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2110 #if defined(PETSC_USE_CTABLE)
2111   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2112 #else
2113   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2114 #endif
2115   const PetscInt *irow_i;
2116   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2117   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2118   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2119   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2120   MPI_Status     *r_status3,*r_status4,*s_status4;
2121   MPI_Comm       comm;
2122   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2123   PetscMPIInt    *onodes1,*olengths1;
2124   PetscMPIInt    idex,idex2,end;
2125 
2126   PetscFunctionBegin;
2127   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2128   tag0 = ((PetscObject)C)->tag;
2129   size = c->size;
2130   rank = c->rank;
2131 
2132   /* Get some new tags to keep the communication clean */
2133   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2134   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2135   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2136 
2137   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2138 
2139   for (i=0; i<ismax; i++) {
2140     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2141     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2142     if (allcolumns[i]) {
2143       icol[i] = NULL;
2144       ncol[i] = C->cmap->N;
2145     } else {
2146       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2147       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2148     }
2149   }
2150 
2151   /* evaluate communication - mesg to who, length of mesg, and buffer space
2152      required. Based on this, buffers are allocated, and data copied into them*/
2153   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2154   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2155   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2156   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2157   for (i=0; i<ismax; i++) {
2158     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2159     jmax   = nrow[i];
2160     irow_i = irow[i];
2161     for (j=0; j<jmax; j++) {
2162       l   = 0;
2163       row = irow_i[j];
2164       while (row >= C->rmap->range[l+1]) l++;
2165       proc = l;
2166       w4[proc]++;
2167     }
2168     for (j=0; j<size; j++) {
2169       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2170     }
2171   }
2172 
2173   nrqs     = 0;              /* no of outgoing messages */
2174   msz      = 0;              /* total mesg length (for all procs) */
2175   w1[rank] = 0;              /* no mesg sent to self */
2176   w3[rank] = 0;
2177   for (i=0; i<size; i++) {
2178     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2179   }
2180   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2181   for (i=0,j=0; i<size; i++) {
2182     if (w1[i]) { pa[j] = i; j++; }
2183   }
2184 
2185   /* Each message would have a header = 1 + 2*(no of IS) + data */
2186   for (i=0; i<nrqs; i++) {
2187     j      = pa[i];
2188     w1[j] += w2[j] + 2* w3[j];
2189     msz   += w1[j];
2190   }
2191   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2192 
2193   /* Determine the number of messages to expect, their lengths, and the ranks they come from */
2194   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2195   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2196 
2197   /* Now post the Irecvs corresponding to these messages */
2198   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2199 
2200   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2201   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2202 
2203   /* Allocate Memory for outgoing messages */
2204   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2205   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2206   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2207 
2208   {
2209     PetscInt *iptr = tmp;
2210     k    = 0;
2211     for (i=0; i<nrqs; i++) {
2212       j        = pa[i];
2213       iptr    += k;
2214       sbuf1[j] = iptr;
2215       k        = w1[j];
2216     }
2217   }
2218 
2219   /* Form the outgoing messages */
2220   /* Initialize the header space */
2221   for (i=0; i<nrqs; i++) {
2222     j           = pa[i];
2223     sbuf1[j][0] = 0;
2224     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2225     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2226   }
2227 
2228   /* Parse the isrow and copy data into outbuf */
2229   for (i=0; i<ismax; i++) {
2230     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2231     irow_i = irow[i];
2232     jmax   = nrow[i];
2233     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2234       l   = 0;
2235       row = irow_i[j];
2236       while (row >= C->rmap->range[l+1]) l++;
2237       proc = l;
2238       if (proc != rank) { /* copy to the outgoing buf*/
2239         ctr[proc]++;
2240         *ptr[proc] = row;
2241         ptr[proc]++;
2242       }
2243     }
2244     /* Update the headers for the current IS */
2245     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2246       if ((ctr_j = ctr[j])) {
2247         sbuf1_j        = sbuf1[j];
2248         k              = ++sbuf1_j[0];
2249         sbuf1_j[2*k]   = ctr_j;
2250         sbuf1_j[2*k-1] = i;
2251       }
2252     }
2253   }
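     /* Each outgoing message now has the layout
          [nis, is_1, cnt_1, is_2, cnt_2, ..., rows of is_1, rows of is_2, ...]
        where nis is the number of index sets with rows owned by the destination and (is_k, cnt_k)
        give the IS number and the number of its rows carried in the data part after the header. */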
2254 
2255   /*  Now  post the sends */
2256   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2257   for (i=0; i<nrqs; ++i) {
2258     j    = pa[i];
2259     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2260   }
2261 
2262   /* Post Receives to capture the buffer size */
2263   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2264   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2265   rbuf2[0] = tmp + msz;
2266   for (i=1; i<nrqs; ++i) {
2267     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2268   }
2269   for (i=0; i<nrqs; ++i) {
2270     j    = pa[i];
2271     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2272   }
2273 
2274   /* Send to other procs the buf size they should allocate */
2275 
2276 
2277   /* Receive messages*/
2278   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2279   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2280   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2281   {
2282     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2283     PetscInt   *sbuf2_i;
2284 
2285     for (i=0; i<nrqr; ++i) {
2286       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2287 
2288       req_size[idex] = 0;
2289       rbuf1_i        = rbuf1[idex];
2290       start          = 2*rbuf1_i[0] + 1;
2291       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2292       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2293       sbuf2_i        = sbuf2[idex];
2294       for (j=start; j<end; j++) {
2295         id              = rbuf1_i[j] - rstart;
2296         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2297         sbuf2_i[j]      = ncols;
2298         req_size[idex] += ncols;
2299       }
2300       req_source[idex] = r_status1[i].MPI_SOURCE;
2301       /* form the header */
2302       sbuf2_i[0] = req_size[idex];
2303       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2304 
2305       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2306     }
2307   }
2308   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2309   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2310 
2311   /*  recv buffer sizes */
2312   /* Receive messages*/
2313 
2314   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2315   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2316   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2317   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2318   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2319 
2320   for (i=0; i<nrqs; ++i) {
2321     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2322     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2323     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2324     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2325     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2326   }
2327   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2328   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2329 
2330   /* Wait on sends1 and sends2 */
2331   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2332   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2333 
2334   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2335   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2336   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2337   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2338   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2339   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2340 
2341   /* Now allocate buffers for a->j, and send them off */
2342   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2343   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2344   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2345   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2346 
2347   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2348   {
2349     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2350     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2351     PetscInt cend = C->cmap->rend;
2352     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2353 
2354     for (i=0; i<nrqr; i++) {
2355       rbuf1_i   = rbuf1[i];
2356       sbuf_aj_i = sbuf_aj[i];
2357       ct1       = 2*rbuf1_i[0] + 1;
2358       ct2       = 0;
2359       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2360         kmax = rbuf1[i][2*j];
2361         for (k=0; k<kmax; k++,ct1++) {
2362           row    = rbuf1_i[ct1] - rstart;
2363           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2364           ncols  = nzA + nzB;
2365           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2366 
2367           /* load the column indices for this row into cols*/
2368           cols = sbuf_aj_i + ct2;
2369 
2370           lwrite = 0;
2371           for (l=0; l<nzB; l++) {
2372             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2373           }
2374           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2375           for (l=0; l<nzB; l++) {
2376             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2377           }
2378 
2379           ct2 += ncols;
2380         }
2381       }
2382       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2383     }
2384   }
2385   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2386   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2387 
2388   /* Allocate buffers for a->a, and send them off */
2389   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2390   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2391   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2392   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2393 
2394   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2395   {
2396     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2397     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2398     PetscInt    cend   = C->cmap->rend;
2399     PetscInt    *b_j   = b->j;
2400     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2401 
2402     for (i=0; i<nrqr; i++) {
2403       rbuf1_i   = rbuf1[i];
2404       sbuf_aa_i = sbuf_aa[i];
2405       ct1       = 2*rbuf1_i[0]+1;
2406       ct2       = 0;
2407       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2408         kmax = rbuf1_i[2*j];
2409         for (k=0; k<kmax; k++,ct1++) {
2410           row    = rbuf1_i[ct1] - rstart;
2411           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2412           ncols  = nzA + nzB;
2413           cworkB = b_j + b_i[row];
2414           vworkA = a_a + a_i[row];
2415           vworkB = b_a + b_i[row];
2416 
2417           /* load the column values for this row into vals*/
2418           vals = sbuf_aa_i+ct2;
2419 
2420           lwrite = 0;
2421           for (l=0; l<nzB; l++) {
2422             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2423           }
2424           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2425           for (l=0; l<nzB; l++) {
2426             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2427           }
2428 
2429           ct2 += ncols;
2430         }
2431       }
2432       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2433     }
2434   }
2435   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2436   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2437   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2438   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2439 
2440   /* Form the matrix */
2441   /* create col map: global col of C -> local col of submatrices */
2442   {
2443     const PetscInt *icol_i;
2444 #if defined(PETSC_USE_CTABLE)
2445     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2446     for (i=0; i<ismax; i++) {
2447       if (!allcolumns[i]) {
2448         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2449 
2450         jmax   = ncol[i];
2451         icol_i = icol[i];
2452         cmap_i = cmap[i];
2453         for (j=0; j<jmax; j++) {
2454           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2455         }
2456       } else {
2457         cmap[i] = NULL;
2458       }
2459     }
2460 #else
2461     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2462     for (i=0; i<ismax; i++) {
2463       if (!allcolumns[i]) {
2464         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2465         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2466         jmax   = ncol[i];
2467         icol_i = icol[i];
2468         cmap_i = cmap[i];
2469         for (j=0; j<jmax; j++) {
2470           cmap_i[icol_i[j]] = j+1;
2471         }
2472       } else {
2473         cmap[i] = NULL;
2474       }
2475     }
2476 #endif
2477   }
2478 
2479   /* Create lens which is required for MatCreate... */
2480   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2481   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2482   if (ismax) {
2483     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2484     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2485   }
2486   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
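     /* lens is a single contiguous zeroed block partitioned per IS: lens[i][j] accumulates the
        number of selected nonzeros in local row j of submatrix i and is later handed to
        MatSeqAIJSetPreallocation(). */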
2487 
2488   /* Update lens from local data */
2489   for (i=0; i<ismax; i++) {
2490     jmax = nrow[i];
2491     if (!allcolumns[i]) cmap_i = cmap[i];
2492     irow_i = irow[i];
2493     lens_i = lens[i];
2494     for (j=0; j<jmax; j++) {
2495       l   = 0;
2496       row = irow_i[j];
2497       while (row >= C->rmap->range[l+1]) l++;
2498       proc = l;
2499       if (proc == rank) {
2500         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2501         if (!allcolumns[i]) {
2502           for (k=0; k<ncols; k++) {
2503 #if defined(PETSC_USE_CTABLE)
2504             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2505 #else
2506             tcol = cmap_i[cols[k]];
2507 #endif
2508             if (tcol) lens_i[j]++;
2509           }
2510         } else { /* allcolumns */
2511           lens_i[j] = ncols;
2512         }
2513         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2514       }
2515     }
2516   }
2517 
2518   /* Create row map: global row of C -> local row of submatrices */
2519 #if defined(PETSC_USE_CTABLE)
2520   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2521   for (i=0; i<ismax; i++) {
2522     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2523     irow_i = irow[i];
2524     jmax   = nrow[i];
2525     for (j=0; j<jmax; j++) {
2526       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2527     }
2528   }
2529 #else
2530   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2531   if (ismax) {
2532     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2533     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2534   }
2535   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2536   for (i=0; i<ismax; i++) {
2537     rmap_i = rmap[i];
2538     irow_i = irow[i];
2539     jmax   = nrow[i];
2540     for (j=0; j<jmax; j++) {
2541       rmap_i[irow_i[j]] = j;
2542     }
2543   }
2544 #endif
2545 
2546   /* Update lens from offproc data */
2547   {
2548     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2549 
2550     for (tmp2=0; tmp2<nrqs; tmp2++) {
2551       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2552       idex    = pa[idex2];
2553       sbuf1_i = sbuf1[idex];
2554       jmax    = sbuf1_i[0];
2555       ct1     = 2*jmax+1;
2556       ct2     = 0;
2557       rbuf2_i = rbuf2[idex2];
2558       rbuf3_i = rbuf3[idex2];
2559       for (j=1; j<=jmax; j++) {
2560         is_no  = sbuf1_i[2*j-1];
2561         max1   = sbuf1_i[2*j];
2562         lens_i = lens[is_no];
2563         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2564         rmap_i = rmap[is_no];
2565         for (k=0; k<max1; k++,ct1++) {
2566 #if defined(PETSC_USE_CTABLE)
2567           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2568           row--;
2569           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2570 #else
2571           row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2572 #endif
2573           max2 = rbuf2_i[ct1];
2574           for (l=0; l<max2; l++,ct2++) {
2575             if (!allcolumns[is_no]) {
2576 #if defined(PETSC_USE_CTABLE)
2577               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2578 #else
2579               tcol = cmap_i[rbuf3_i[ct2]];
2580 #endif
2581               if (tcol) lens_i[row]++;
2582             } else { /* allcolumns */
2583               lens_i[row]++; /* lens_i[row] += max2 ? */
2584             }
2585           }
2586         }
2587       }
2588     }
2589   }
2590   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2591   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2592   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2593   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2594   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2595 
2596   /* Create the submatrices */
2597   if (scall == MAT_REUSE_MATRIX) {
2598     PetscBool flag;
2599 
2600     /*
2601         Assumes the new rows are the same length as the old rows, hence a bug!
2602     */
2603     for (i=0; i<ismax; i++) {
2604       mat = (Mat_SeqAIJ*)(submats[i]->data);
2605       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2606       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2607       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2608       /* Initialize the matrix as if empty */
2609       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2610 
2611       submats[i]->factortype = C->factortype;
2612     }
2613   } else {
2614     for (i=0; i<ismax; i++) {
2615       PetscInt rbs,cbs;
2616       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2617       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2618 
2619       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2620       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2621 
2622       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2623       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2624       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2625     }
2626   }
2627 
2628   /* Assemble the matrices */
2629   /* First assemble the local rows */
2630   {
2631     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2632     PetscScalar *imat_a;
2633 
2634     for (i=0; i<ismax; i++) {
2635       mat       = (Mat_SeqAIJ*)submats[i]->data;
2636       imat_ilen = mat->ilen;
2637       imat_j    = mat->j;
2638       imat_i    = mat->i;
2639       imat_a    = mat->a;
2640 
2641       if (!allcolumns[i]) cmap_i = cmap[i];
2642       rmap_i = rmap[i];
2643       irow_i = irow[i];
2644       jmax   = nrow[i];
2645       for (j=0; j<jmax; j++) {
2646         l   = 0;
2647         row = irow_i[j];
2648         while (row >= C->rmap->range[l+1]) l++;
2649         proc = l;
2650         if (proc == rank) {
2651           old_row = row;
2652 #if defined(PETSC_USE_CTABLE)
2653           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2654           row--;
2655 #else
2656           row = rmap_i[row];
2657 #endif
2658           ilen_row = imat_ilen[row];
2659           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2660           mat_i    = imat_i[row];
2661           mat_a    = imat_a + mat_i;
2662           mat_j    = imat_j + mat_i;
2663           if (!allcolumns[i]) {
2664             for (k=0; k<ncols; k++) {
2665 #if defined(PETSC_USE_CTABLE)
2666               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2667 #else
2668               tcol = cmap_i[cols[k]];
2669 #endif
2670               if (tcol) {
2671                 *mat_j++ = tcol - 1;
2672                 *mat_a++ = vals[k];
2673                 ilen_row++;
2674               }
2675             }
2676           } else { /* allcolumns */
2677             for (k=0; k<ncols; k++) {
2678               *mat_j++ = cols[k];  /* global col index! */
2679               *mat_a++ = vals[k];
2680               ilen_row++;
2681             }
2682           }
2683           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2684 
2685           imat_ilen[row] = ilen_row;
2686         }
2687       }
2688     }
2689   }
2690 
2691   /* Now assemble the off-proc rows */
2692   {
2693     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2694     PetscInt    *imat_j,*imat_i;
2695     PetscScalar *imat_a,*rbuf4_i;
2696 
2697     for (tmp2=0; tmp2<nrqs; tmp2++) {
2698       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2699       idex    = pa[idex2];
2700       sbuf1_i = sbuf1[idex];
2701       jmax    = sbuf1_i[0];
2702       ct1     = 2*jmax + 1;
2703       ct2     = 0;
2704       rbuf2_i = rbuf2[idex2];
2705       rbuf3_i = rbuf3[idex2];
2706       rbuf4_i = rbuf4[idex2];
2707       for (j=1; j<=jmax; j++) {
2708         is_no     = sbuf1_i[2*j-1];
2709         rmap_i    = rmap[is_no];
2710         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2711         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2712         imat_ilen = mat->ilen;
2713         imat_j    = mat->j;
2714         imat_i    = mat->i;
2715         imat_a    = mat->a;
2716         max1      = sbuf1_i[2*j];
2717         for (k=0; k<max1; k++,ct1++) {
2718           row = sbuf1_i[ct1];
2719 #if defined(PETSC_USE_CTABLE)
2720           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2721           row--;
2722 #else
2723           row = rmap_i[row];
2724 #endif
2725           ilen  = imat_ilen[row];
2726           mat_i = imat_i[row];
2727           mat_a = imat_a + mat_i;
2728           mat_j = imat_j + mat_i;
2729           max2  = rbuf2_i[ct1];
2730           if (!allcolumns[is_no]) {
2731             for (l=0; l<max2; l++,ct2++) {
2732 
2733 #if defined(PETSC_USE_CTABLE)
2734               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2735 #else
2736               tcol = cmap_i[rbuf3_i[ct2]];
2737 #endif
2738               if (tcol) {
2739                 *mat_j++ = tcol - 1;
2740                 *mat_a++ = rbuf4_i[ct2];
2741                 ilen++;
2742               }
2743             }
2744           } else { /* allcolumns */
2745             for (l=0; l<max2; l++,ct2++) {
2746               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2747               *mat_a++ = rbuf4_i[ct2];
2748               ilen++;
2749             }
2750           }
2751           imat_ilen[row] = ilen;
2752         }
2753       }
2754     }
2755   }
2756 
2757   /* sort the rows */
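     /* In the non-allcolumns case a row's stored column indices are positions within iscol[], which
        need not be monotone in C's global column numbering, so each row (and its values) is sorted
        here to restore the ordered layout SeqAIJ routines assume.  allcolumns submatrices keep C's
        global, already sorted, column ordering and are skipped. */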
2758   {
2759     PetscInt    *imat_ilen,*imat_j,*imat_i;
2760     PetscScalar *imat_a;
2761 
2762     for (i=0; i<ismax; i++) {
2763       mat       = (Mat_SeqAIJ*)submats[i]->data;
2764       imat_j    = mat->j;
2765       imat_i    = mat->i;
2766       imat_a    = mat->a;
2767       imat_ilen = mat->ilen;
2768 
2769       if (allcolumns[i]) continue;
2770       jmax = nrow[i];
2771       for (j=0; j<jmax; j++) {
2772         PetscInt ilen;
2773 
2774         mat_i = imat_i[j];
2775         mat_a = imat_a + mat_i;
2776         mat_j = imat_j + mat_i;
2777         ilen  = imat_ilen[j];
2778         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2779       }
2780     }
2781   }
2782 
2783   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2784   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2785   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2786   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2787   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2788 
2789   /* Restore the indices */
2790   for (i=0; i<ismax; i++) {
2791     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2792     if (!allcolumns[i]) {
2793       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2794     }
2795   }
2796 
2797   /* Destroy allocated memory */
2798   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2799   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2800   ierr = PetscFree(pa);CHKERRQ(ierr);
2801 
2802   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2803   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2804   for (i=0; i<nrqr; ++i) {
2805     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2806   }
2807   for (i=0; i<nrqs; ++i) {
2808     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2809     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2810   }
2811 
2812   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2813   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2814   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2815   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2816   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2817   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2818   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2819 
2820 #if defined(PETSC_USE_CTABLE)
2821   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2822 #else
2823   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2824 #endif
2825   ierr = PetscFree(rmap);CHKERRQ(ierr);
2826 
2827   for (i=0; i<ismax; i++) {
2828     if (!allcolumns[i]) {
2829 #if defined(PETSC_USE_CTABLE)
2830       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2831 #else
2832       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2833 #endif
2834     }
2835   }
2836   ierr = PetscFree(cmap);CHKERRQ(ierr);
2837   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2838   ierr = PetscFree(lens);CHKERRQ(ierr);
2839 
2840   for (i=0; i<ismax; i++) {
2841     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2842     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2843   }
2844   PetscFunctionReturn(0);
2845 }
2846 
2847 /*
2848  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2849  Embeddings are assumed to be injections; this implies that the range of rowemb is a subset of [0,m),
2850  that the range of dcolemb is a subset of [0,n), and that the range of ocolemb is a subset of [0,N-n).
2851  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2852  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2853  state, and needs to be "assembled" later by compressing B's column space.
2854 
2855  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2856  Following this call, C->A & C->B have been created, even if empty.
2857  */
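/*
 Illustrative sketch (not part of the implementation): with [rstart,rend) taken from C->rmap and
 shift = rend-rstart, an off-diagonal column index col that has already been mapped through ocolemb
 (so col lies in [0,N-n)) is pushed into C's global column space by

   if (col >= rstart) col += shift;

 which is the "disassembled" form produced in the body below and later compressed back into B's
 local column space when C is assembled (see MatSetUpMultiply_MPIAIJ()).
 */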
2858 #undef __FUNCT__
2859 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
2860 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2861 {
2862   /* If making this function public, change the error returned in this function away from _PLIB. */
2863   PetscErrorCode ierr;
2864   Mat_MPIAIJ     *aij;
2865   Mat_SeqAIJ     *Baij;
2866   PetscBool      seqaij,Bdisassembled;
2867   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2868   PetscScalar    v;
2869   const PetscInt *rowindices,*colindices;
2870 
2871   PetscFunctionBegin;
2872   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2873   if (A) {
2874     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2875     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2876     if (rowemb) {
2877       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2878       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2879     } else {
2880       if (C->rmap->n != A->rmap->n) {
2881 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2882       }
2883     }
2884     if (dcolemb) {
2885       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2886       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2887     } else {
2888       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2889     }
2890   }
2891   if (B) {
2892     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2893     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2894     if (rowemb) {
2895       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2896       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2897     } else {
2898       if (C->rmap->n != B->rmap->n) {
2899 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2900       }
2901     }
2902     if (ocolemb) {
2903       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2904       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2905     } else {
2906       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2907     }
2908   }
2909 
2910   aij    = (Mat_MPIAIJ*)(C->data);
2911   if (!aij->A) {
2912     /* Mimic parts of MatMPIAIJSetPreallocation() */
2913     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2914     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2915     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2916     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2917     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2918   }
2919   if (A) {
2920     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2921   } else {
2922     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2923   }
2924   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2925     /*
2926       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2927       need to "disassemble" B -- convert it to using C's global indices.
2928       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2929 
2930       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2931       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2932 
2933       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2934       At least avoid calling MatSetValues() and the implied searches?
2935     */
2936 
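    /* Descriptive note: in the DIFFERENT_NONZERO_PATTERN case the block below discards everything
       tied to B's old column space -- aij->colmap, aij->garray, aij->lvec, aij->Mvctx and finally
       aij->B itself -- so that a fresh B can be preallocated and filled using C's global column
       indices; in the SUBSET_NONZERO_PATTERN case only the entries of aij->B are zeroed and the
       existing structure is kept. */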
2937     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2938 #if defined(PETSC_USE_CTABLE)
2939       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2940 #else
2941       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2942       /* A bit of a HACK: ideally the aij->B case should be handled all in one code block below. */
2943       if (aij->B) {
2944         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2945       }
2946 #endif
2947       ngcol = 0;
2948       if (aij->lvec) {
2949 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2950       }
2951       if (aij->garray) {
2952 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2953 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2954       }
2955       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2956       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2957     }
2958     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2959       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2960     }
2961     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2962       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2963     }
2964   }
2965   Bdisassembled = PETSC_FALSE;
2966   if (!aij->B) {
2967     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2968     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2969     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2970     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2971     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2972     Bdisassembled = PETSC_TRUE;
2973   }
2974   if (B) {
2975     Baij = (Mat_SeqAIJ*)(B->data);
2976     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2977       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2978       for (i=0; i<B->rmap->n; i++) {
2979 	nz[i] = Baij->i[i+1] - Baij->i[i];
2980       }
2981       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2982       ierr = PetscFree(nz);CHKERRQ(ierr);
2983     }
2984 
2985     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2986     shift = rend-rstart;
2987     count = 0;
2988     rowindices = NULL;
2989     colindices = NULL;
2990     if (rowemb) {
2991       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2992     }
2993     if (ocolemb) {
2994       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2995     }
2996     for (i=0; i<B->rmap->n; i++) {
2997       PetscInt row;
2998       row = i;
2999       if (rowindices) row = rowindices[i];
3000       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3001 	col  = Baij->j[count];
3002 	if (colindices) col = colindices[col];
3003 	if (Bdisassembled && col>=rstart) col += shift;
3004 	v    = Baij->a[count];
3005 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3006 	++count;
3007       }
3008     }
3009     /* No assembly for aij->B is necessary. */
3010     /* FIXME: set aij->B's nonzerostate correctly. */
3011   } else {
3012     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3013   }
3014   C->preallocated  = PETSC_TRUE;
3015   C->was_assembled = PETSC_FALSE;
3016   C->assembled     = PETSC_FALSE;
3017    /*
3018       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3019       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3020    */
3021   PetscFunctionReturn(0);
3022 }
3023 
3024 #undef __FUNCT__
3025 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
3026 /*
3027   B uses local indices, with column indices ranging between 0 and N-n; they must be interpreted using garray.
3028  */
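/*
 For example (illustrative values only): if aij->garray = {2,5,9}, an entry stored in B at local
 column index 1 corresponds to global column garray[1] = 5 of C.
 */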
3029 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3030 {
3031   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3032 
3033   PetscFunctionBegin;
3034   PetscValidPointer(A,2);
3035   PetscValidPointer(B,3);
3036   /* FIXME: make sure C is assembled */
3037   *A = aij->A;
3038   *B = aij->B;
3039   /* Note that we don't incref *A and *B, so be careful! */
3040   PetscFunctionReturn(0);
3041 }
3042 
3043 /*
3044   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3045   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3046 */
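/*
 A minimal usage sketch from the caller's side (assumption: for MATMPIAIJ this routine is reached
 through the public MatGetSubMatricesMPI() interface; the user variables below are hypothetical):

   IS  isrow,iscol;       parallel or sequential, possibly living on a subcomm of C
   Mat *submat;
   ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
   ...
   ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
 */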
3047 #undef __FUNCT__
3048 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
3049 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3050                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3051 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3052 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3053 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3054 {
3055   PetscErrorCode ierr;
3056   PetscMPIInt    isize,flag;
3057   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3058   Mat            *A,*B;
3059   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3060 
3061   PetscFunctionBegin;
3062   if (!ismax) PetscFunctionReturn(0);
3063 
3064   for (i = 0, cismax = 0; i < ismax; ++i) {
3065     PetscMPIInt isize;
3066     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3067     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3068     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3069     if (isize > 1) ++cismax;
3070   }
3071   /*
3072      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3073      ispar is the maximum, over C's comm, of the per-rank count of parallel ISs; it is nonzero iff any rank has one.
3074   */
3075   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3076   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3077     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3078     PetscFunctionReturn(0);
3079   }
3080 
3081   /* if (ispar) */
3082   /*
3083     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3084     These are used to extract the off-diag portion of the resulting parallel matrix.
3085     The row IS for the off-diag portion is the same as for the diag portion,
3086     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3087   */
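  /*
     Illustrative example (hypothetical data): if the parallel iscol[i] holds global columns {3,8}
     on rank 0 and {1,12} on rank 1, then ISGetNonlocalIS() below presents each rank with the
     indices stored by the other ranks -- {1,12} on rank 0 and {3,8} on rank 1 -- and it is this
     concatenation that must be sorted before it can be used to extract the off-diag block.
  */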
3088   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3089   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3090   for (i = 0, ii = 0; i < ismax; ++i) {
3091     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3092     if (isize > 1) {
3093       /*
3094 	 TODO: This is the part that's ***NOT SCALABLE***.
3095 	 To fix this we need to extract just the indices of C's nonzero columns
3096 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3097 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3098 	 to be done without serializing on the IS list, so, most likely, it is best
3099 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3100       */
3101       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3102       /* Now we have to
3103 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3104 	     were sorted on each rank, concatenated they might no longer be sorted;
3105 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3106 	     indices in the nondecreasing order to the original index positions.
3107 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3108       */
3109       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3110       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3111       ++ii;
3112     }
3113   }
3114   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3115   for (i = 0, ii = 0; i < ismax; ++i) {
3116     PetscInt       j,issize;
3117     const PetscInt *indices;
3118 
3119     /*
3120        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3121      */
3122     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3123     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3124     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3125     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3126     for (j = 1; j < issize; ++j) {
3127       if (indices[j] == indices[j-1]) {
3128 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3129       }
3130     }
3131     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3132 
3133 
3134     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3135     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3136     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3137     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3138     for (j = 1; j < issize; ++j) {
3139       if (indices[j-1] == indices[j]) {
3140 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3141       }
3142     }
3143     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3144     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3145     if (isize > 1) {
3146       cisrow[ii] = isrow[i];
3147       ++ii;
3148     }
3149   }
3150   /*
3151     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3152     array of sequential matrices underlying the resulting parallel matrices.
3153     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3154     contain duplicates.
3155 
3156     There are as many diag matrices as there are original index sets. There are only as many parallel
3157     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3158 
3159     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3160     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3161       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3162       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3163       with A[i] and B[ii] extracted from the corresponding MPI submat.
3164     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3165       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3166       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3167       (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
3168       their values into the A[i] and B[ii] sitting inside the corresponding submat.
3169     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3170       A[i], B[ii], AA[i] or BB[ii] matrices.
3171   */
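  /*
     In brief (restating the logic below):
       scall == MAT_INITIAL_MATRIX : only *submat is allocated here; the seq arrays A and B are
                                     allocated and filled by getsubmats_seq().
       scall == MAT_REUSE_MATRIX   : A and B are allocated here and filled with the seq matrices
                                     already living inside (*submat)[i], or with duplicates
                                     AA[i]/BB[ii] whenever a nontrivial permutation isrow_p[i],
                                     iscol_p[i] or ciscol_p[ii] exists.
  */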
3172   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3173   if (scall == MAT_INITIAL_MATRIX) {
3174     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3175     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3176   } else {
3177     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3178     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3179     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
3180     for (i = 0, ii = 0; i < ismax; ++i) {
3181       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3182       if (isize > 1) {
3183 	Mat AA,BB;
3184 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3185 	if (!isrow_p[i] && !iscol_p[i]) {
3186 	  A[i] = AA;
3187 	} else {
3188 	  /* TODO: extract A[i] composed on AA. */
3189 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3190 	}
3191 	if (!isrow_p[i] && !ciscol_p[ii]) {
3192 	  B[ii] = BB;
3193 	} else {
3194 	  /* TODO: extract B[ii] composed on BB. */
3195 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3196 	}
3197 	++ii;
3198       } else {
3199 	if (!isrow_p[i] && !iscol_p[i]) {
3200 	  A[i] = (*submat)[i];
3201 	} else {
3202 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3203 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3204 	}
3205       }
3206     }
3207   }
3208   /* Now obtain the sequential A and B submatrices separately. */
3209   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3210   /* Workaround (a better fix has not been found yet):
3211    * the local column size of B[i] differs from the size of ciscol[i], because
3212    * B[i]'s column size is only finalized during matrix assembly, while
3213    * ciscol[i] is computed as the complement of iscol[i].
3214    * It would be better to keep only the nonzero column indices when computing
3215    * the complement ciscol[i].
3216    */
3217   if (scall == MAT_REUSE_MATRIX) {
3218     for (i=0; i<cismax; i++) {
3219       ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3220       B[i]->cmap->n = ciscol_localsize;
3221     }
3222   }
3223   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3224   /*
3225     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3226     matrices A & B have been extracted directly into the parallel matrices containing them, or
3227     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3228     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3229     to have the same sparsity pattern.
3230     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3231     must be constructed for C. This is done by setseqmat(s).
3232   */
3233   for (i = 0, ii = 0; i < ismax; ++i) {
3234     /*
3235        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3236        That way we can avoid sorting and computing permutations when reusing.
3237        To this end:
3238         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3239 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3240 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3241     */
3242     MatStructure pattern;
3243     if (scall == MAT_INITIAL_MATRIX) {
3244       pattern = DIFFERENT_NONZERO_PATTERN;
3245     } else {
3246       pattern = SAME_NONZERO_PATTERN;
3247     }
3248     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3249     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3250     if (isize > 1) {
3251       if (scall == MAT_INITIAL_MATRIX) {
3252 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3253 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3254 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3255 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3256 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3257       }
3258       /*
3259 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3260       */
3261       {
3262 	Mat AA,BB;
3263 	AA = NULL;
3264 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3265 	BB = NULL;
3266 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3267 	if (AA || BB) {
3268 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3269 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3270 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3271 	}
3272 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3273 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3274 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3275 	}
3276 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3277 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
3278 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3279 	}
3280       }
3281       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3282       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3283       ++ii;
3284     } else { /* if (isize == 1) */
3285       if (scall == MAT_INITIAL_MATRIX) {
3286 	if (isrow_p[i] || iscol_p[i]) {
3287 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3288 	} else (*submat)[i] = A[i];
3289       }
3290       if (isrow_p[i] || iscol_p[i]) {
3291 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3292 	/* Otherwise A is extracted straight into (*submat)[i]. */
3293 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3294 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3295       }
3296     }
3297     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3298     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3299   }
3300   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3301   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3302   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3303   ierr = PetscFree(A);CHKERRQ(ierr);
3304   ierr = PetscFree(B);CHKERRQ(ierr);
3305   PetscFunctionReturn(0);
3306 }
3307 
3308 
3309 
3310 #undef __FUNCT__
3311 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
3312 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3313 {
3314   PetscErrorCode ierr;
3315 
3316   PetscFunctionBegin;
3317   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3318   PetscFunctionReturn(0);
3319 }
3320