xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision f59157ec7137ce66d08be5433b58f6698a6b311f)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3    and to find the submatrices that are shared across processes.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
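
/*
   A minimal caller-side sketch (not part of this file) of how the overlap machinery above is
   typically driven; the matrix A and the index sets are hypothetical names. MatIncreaseOverlap()
   dispatches to MatIncreaseOverlap_MPIAIJ() (or its _Scalable variant) for MATMPIAIJ matrices.

     IS  is[2];      // initially: the locally owned rows of each subdomain
     Mat *submats;

     ierr = MatIncreaseOverlap(A,2,is,1);CHKERRQ(ierr);                                // grow each IS by one layer of connectivity
     ierr = MatCreateSubMatrices(A,2,is,is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr); // extract the enlarged (sequential) blocks
*/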
33 
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48 {
49   PetscErrorCode ierr;
50   MPI_Comm       comm;
51   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54   const PetscInt *indices_i,**indices;
55   PetscLayout    rmap;
56   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
57   PetscSF        sf;
58   PetscSFNode    *remote;
59 
60   PetscFunctionBegin;
61   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
62   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
63   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
64   /* get row map to determine where rows should be going */
65   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
66   /* retrieve the IS data and put it all together so that we
67    * can optimize communication
68    */
69   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
70   for (i=0,tlength=0; i<nidx; i++) {
71     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
72     tlength += length[i];
73     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
74   }
75   /* find these rows on remote processors */
76   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
77   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
78   nrrows = 0;
79   for (i=0; i<nidx; i++) {
80     length_i  = length[i];
81     indices_i = indices[i];
82     for (j=0; j<length_i; j++) {
83       owner = -1;
84       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
85       /* remote processors */
86       if (owner != rank) {
87         tosizes_temp[owner]++; /* number of rows to owner */
88         rrow_ranks[nrrows]  = owner; /* processor */
89         rrow_isids[nrrows]   = i; /* is id */
90         remoterows[nrrows++] = indices_i[j]; /* row */
91       }
92     }
93     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
94   }
95   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
96   /* test if we need to exchange messages;
97    * generally speaking, we do not need to exchange
98    * data when the overlap is 1
99    */
100   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
101   /* we do not have any messages;
102    * this usually corresponds to overlap 1
103    */
104   if (!reducednrrows) {
105     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
106     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
107     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
108     PetscFunctionReturn(0);
109   }
110   nto = 0;
111   /* send sizes and ranks for building a two-sided communication */
112   for (i=0; i<size; i++) {
113     if (tosizes_temp[i]) {
114       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115       tosizes_temp[i] = nto; /* a map from processor to index */
116       toranks[nto++]  = i; /* processor */
117     }
118   }
119   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
120   toffsets[0] = 0;
121   for (i=0; i<nto; i++) {
122     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124   }
125   /* send information to other processors */
126   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
127   nrecvrows = 0;
128   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
130   nrecvrows = 0;
131   for (i=0; i<nfrom; i++) {
132     for (j=0; j<fromsizes[2*i]; j++) {
133       remote[nrecvrows].rank    = fromranks[i];
134       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135     }
136   }
137   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
138   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
139   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
140   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
141   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
142   /* each message entry is a pair <IS id, row> */
143   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
144   for (i=0; i<nrrows; i++) {
145     owner = rrow_ranks[i]; /* processor */
146     j     = tosizes_temp[owner]; /* index */
147     todata[toffsets[j]++] = rrow_isids[i];
148     todata[toffsets[j]++] = remoterows[i];
149   }
150   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
151   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
152   ierr = PetscFree(toffsets);CHKERRQ(ierr);
153   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
154   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
155   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
156   /* send the rows owned by remote processes so that we can get the overlapping data back */
157   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
158   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
159   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
160   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
161   ierr = PetscFree(fromranks);CHKERRQ(ierr);
162   nrecvrows = 0;
163   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
165   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
166   nrecvrows = 0;
167   for (i=0; i<nto; i++) {
168     for (j=0; j<tosizes[2*i]; j++) {
169       remote[nrecvrows].rank    = toranks[i];
170       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171     }
172   }
173   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
174   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
175   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
177   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
178   /* overlap communication and computation */
179   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
180   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
181   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
183   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
184   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
185   ierr = PetscFree(toranks);CHKERRQ(ierr);
186   ierr = PetscFree(tosizes);CHKERRQ(ierr);
187   ierr = PetscFree(todata);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
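
/*
   The two exchanges above use the same PetscSF idiom, sketched here for reference only
   (nroots, nleaves, remote, rootdata and leafdata are generic names, not variables of this file):
   every local receive slot (a "leaf") names the (rank, index) of the remote entry (a "root")
   it mirrors, and a broadcast then copies root data into the leaves.

     PetscSF     sf;
     PetscSFNode *remote;   // remote[k] = (rank, index) of the root that leaf k mirrors

     ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
     ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);   // two-sided implementation
     ierr = PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
     // ...independent local work may overlap the communication here...
     ierr = PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
*/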
190 
191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192 {
193   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195   const PetscInt   *indices_i_temp;
196   MPI_Comm         *iscomms;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++) {
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++) {
209     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
210     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
211     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
212     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
214   }
215   /* retrieve information to get row id and its overlap */
216   for (i=0; i<nrecvs;) {
217     is_id     = recvdata[i++];
218     data_size = recvdata[i++];
219     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220     isz_i     = isz[is_id];
221     for (j=0; j< data_size; j++) {
222       col = recvdata[i++];
223       indices_i[isz_i++] = col;
224     }
225     isz[is_id] = isz_i;
226   }
227   /* remove duplicate entries */
228   for (i=0; i<nidx; i++) {
229     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230     isz_i     = isz[i];
231     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
232     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
233     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
234   }
235   ierr = PetscFree(isz);CHKERRQ(ierr);
236   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
237   PetscFunctionReturn(0);
238 }
239 
240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241 {
242   PetscLayout       rmap,cmap;
243   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247   Mat               amat,bmat;
248   PetscMPIInt       rank;
249   PetscBool         done;
250   MPI_Comm          comm;
251   PetscErrorCode    ierr;
252 
253   PetscFunctionBegin;
254   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
255   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
256   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
257   /* Even if the matrix is symmetric, we still treat it as if it were not */
258   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   /* total number of nonzero values is used to estimate the memory usage in the next step */
263   tnz  = ai[an]+bi[bn];
264   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
265   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
267   /* to find the longest message */
268   max_fszs = 0;
269   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270   /* is there a better way to estimate the number of nonzeros in the matrix? */
271   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
272   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273   rows_pos  = 0;
274   totalrows = 0;
275   for (i=0; i<nfrom; i++) {
276     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
277     /* group data together */
278     for (j=0; j<fromsizes[2*i]; j+=2) {
279       is_id                       = fromrows[rows_pos++];/* no of is */
280       rows_i                      = rows_data[is_id];
281       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282     }
283     /* estimate a space to avoid multiple allocations  */
284     for (j=0; j<nidx; j++) {
285       indvc_ij = 0;
286       rows_i   = rows_data[j];
287       for (l=0; l<rows_pos_i[j]; l++) {
288         row    = rows_i[l]-rstart;
289         start  = ai[row];
290         end    = ai[row+1];
291         for (k=start; k<end; k++) { /* Amat */
292           col = aj[k] + cstart;
293           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294         }
295         start = bi[row];
296         end   = bi[row+1];
297         for (k=start; k<end; k++) { /* Bmat */
298           col = gcols[bj[k]];
299           indices_tmp[indvc_ij++] = col;
300         }
301       }
302       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
303       indv_counts[i*nidx+j] = indvc_ij;
304       totalrows            += indvc_ij;
305     }
306   }
307   /* each message is a triple <IS id, number of rows, rows> */
308   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
309   totalrows = 0;
310   rows_pos  = 0;
311   /* regroup the data, reusing the code from above */
312   for (i=0;i<nfrom;i++) {
313     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
314     for (j=0; j<fromsizes[2*i]; j+=2) {
315       is_id                       = fromrows[rows_pos++];
316       rows_i                      = rows_data[is_id];
317       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318     }
319     /* add data  */
320     for (j=0; j<nidx; j++) {
321       if (!indv_counts[i*nidx+j]) continue;
322       indvc_ij = 0;
323       sbdata[totalrows++] = j;
324       sbdata[totalrows++] = indv_counts[i*nidx+j];
325       sbsizes[2*i]       += 2;
326       rows_i              = rows_data[j];
327       for (l=0; l<rows_pos_i[j]; l++) {
328         row   = rows_i[l]-rstart;
329         start = ai[row];
330         end   = ai[row+1];
331         for (k=start; k<end; k++) { /* Amat */
332           col = aj[k] + cstart;
333           indices_tmp[indvc_ij++] = col;
334         }
335         start = bi[row];
336         end   = bi[row+1];
337         for (k=start; k<end; k++) { /* Bmat */
338           col = gcols[bj[k]];
339           indices_tmp[indvc_ij++] = col;
340         }
341       }
342       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
343       sbsizes[2*i]  += indvc_ij;
344       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
345       totalrows += indvc_ij;
346     }
347   }
348   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
349   offsets[0] = 0;
350   for (i=0; i<nfrom; i++) {
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp;
368   PetscMPIInt       rank,owner;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   /* is this a safe way to compute the number of nonzero values? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* this is a better way to estimate memory than the old implementation,
389    * which used the global size of the matrix
390    */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 /*
432   Sample message format:
433   If processor A wants processor B to process some elements corresponding
434   to index sets is[1] and is[5], the message looks like
435   mesg[0] = 2    (number of index sets in the message)
436   -----------
437   mesg[1] = 1  => is[1]
438   mesg[2] = size of is[1]
439   -----------
440   mesg[3] = 5  => is[5]
441   mesg[4] = size of is[5]
442   -----------
443   mesg[5] ...
444   mesg[n]        data for is[1]
445   -----------
446   mesg[n+1] ...
447   mesg[m]        data for is[5]
448   -----------
449 
450   Notes:
451   nrqs - number of requests sent (or to be sent out)
452   nrqr - number of requests received (which have to be, or have been, processed)
453 */
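
/*
   For illustration, a receiver walks a message with the layout above roughly as follows
   (a sketch only; 'mesg' is a hypothetical name for the received buffer):

     PetscInt nis = mesg[0];           // number of index sets packed in this message
     PetscInt ct  = 2*nis + 1;         // row data starts right after the header
     for (j=1; j<=nis; j++) {
       PetscInt is_no = mesg[2*j-1];   // which index set the following rows belong to
       PetscInt len   = mesg[2*j];     // how many rows follow for it
       for (k=0; k<len; k++) {
         PetscInt row = mesg[ct++];    // a row requested for is[is_no]
         // ...process row...
       }
     }

   This is the same traversal performed below in MatIncreaseOverlap_MPIAIJ_Receive().
*/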
454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455 {
456   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458   const PetscInt **idx,*idx_i;
459   PetscInt       *n,**data,len;
460 #if defined(PETSC_USE_CTABLE)
461   PetscTable     *table_data,table_data_i;
462   PetscInt       *tdata,tcount,tcount_max;
463 #else
464   PetscInt       *data_i,*d_p;
465 #endif
466   PetscErrorCode ierr;
467   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470   PetscBT        *table;
471   MPI_Comm       comm;
472   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473   MPI_Status     *s_status,*recv_status;
474   MPI_Comm       *iscomms;
475   char           *t_p;
476 
477   PetscFunctionBegin;
478   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
479   size = c->size;
480   rank = c->rank;
481   M    = C->rmap->N;
482 
483   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
485 
486   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
487 
488   for (i=0; i<imax; i++) {
489     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
490     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
491   }
492 
493   /* evaluate communication - message to whom, length of message, and buffer space
494      required. Based on this, buffers are allocated, and data copied into them */
495   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
496   for (i=0; i<imax; i++) {
497     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
498     idx_i = idx[i];
499     len   = n[i];
500     for (j=0; j<len; j++) {
501       row = idx_i[j];
502       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
504       w4[proc]++;
505     }
506     for (j=0; j<size; j++) {
507       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508     }
509   }
510 
511   nrqs     = 0;              /* number of outgoing messages */
512   msz      = 0;              /* total message length (for all procs) */
513   w1[rank] = 0;              /* no message is sent to self */
514   w3[rank] = 0;
515   for (i=0; i<size; i++) {
516     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517   }
518   /* pa - is list of processors to communicate with */
519   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
520   for (i=0,j=0; i<size; i++) {
521     if (w1[i]) {pa[j] = i; j++;}
522   }
523 
524   /* Each message would have a header = 1 + 2*(no of IS) + data */
525   for (i=0; i<nrqs; i++) {
526     j      = pa[i];
527     w1[j] += w2[j] + 2*w3[j];
528     msz   += w1[j];
529   }
530 
531   /* Determine the number of messages to expect, their lengths, and their from-ids */
532   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
533   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
534 
535   /* Now post the Irecvs corresponding to these messages */
536   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
537 
538   /* Allocate Memory for outgoing messages */
539   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
540   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
541   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
542 
543   {
544     PetscInt *iptr = tmp,ict  = 0;
545     for (i=0; i<nrqs; i++) {
546       j         = pa[i];
547       iptr     +=  ict;
548       outdat[j] = iptr;
549       ict       = w1[j];
550     }
551   }
552 
553   /* Form the outgoing messages */
554   /* plug in the headers */
555   for (i=0; i<nrqs; i++) {
556     j            = pa[i];
557     outdat[j][0] = 0;
558     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
559     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560   }
561 
562   /* Memory for doing local proc's work */
563   {
564     PetscInt M_BPB_imax = 0;
565 #if defined(PETSC_USE_CTABLE)
566     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
567     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
570     }
571     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574     }
575 #else
576     PetscInt Mimax = 0;
577     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
578     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
579     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582       data[i]  = d_p + M*i;
583     }
584 #endif
585   }
586 
587   /* Parse the IS and update local tables and the outgoing buf with the data */
588   {
589     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590     PetscBT  table_i;
591 
592     for (i=0; i<imax; i++) {
593       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
594       n_i     = n[i];
595       table_i = table[i];
596       idx_i   = idx[i];
597 #if defined(PETSC_USE_CTABLE)
598       table_data_i = table_data[i];
599 #else
600       data_i  = data[i];
601 #endif
602       isz_i   = isz[i];
603       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604         row  = idx_i[j];
605         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
606         if (proc != rank) { /* copy to the outgoing buffer */
607           ctr[proc]++;
608           *ptr[proc] = row;
609           ptr[proc]++;
610         } else if (!PetscBTLookupSet(table_i,row)) {
611 #if defined(PETSC_USE_CTABLE)
612           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
613 #else
614           data_i[isz_i] = row; /* Update the local table */
615 #endif
616           isz_i++;
617         }
618       }
619       /* Update the headers for the current IS */
620       for (j=0; j<size; j++) { /* Could optimize this loop by using pa[] */
621         if ((ctr_j = ctr[j])) {
622           outdat_j        = outdat[j];
623           k               = ++outdat_j[0];
624           outdat_j[2*k]   = ctr_j;
625           outdat_j[2*k-1] = i;
626         }
627       }
628       isz[i] = isz_i;
629     }
630   }
631 
632   /*  Now  post the sends */
633   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
634   for (i=0; i<nrqs; ++i) {
635     j    = pa[i];
636     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
637   }
638 
639   /* No longer need the original indices */
640   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
641   for (i=0; i<imax; ++i) {
642     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
643     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
644   }
645   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
646 
647   for (i=0; i<imax; ++i) {
648     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
649   }
650 
651   /* Do Local work */
652 #if defined(PETSC_USE_CTABLE)
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
654 #else
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
656 #endif
657 
658   /* Receive messages */
659   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
660   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);}
661 
662   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
663   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);}
664 
665   /* Phase 1 sends are complete - deallocate buffers */
666   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
667   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
668 
669   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
670   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
671   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
672   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
673   ierr = PetscFree(rbuf);CHKERRQ(ierr);
674 
675   /* Send the data back */
676   /* Do a global reduction to know the buffer space req for incoming messages */
677   {
678     PetscMPIInt *rw1;
679 
680     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
681 
682     for (i=0; i<nrqr; ++i) {
683       proc = recv_status[i].MPI_SOURCE;
684 
685       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
686       rw1[proc] = isz1[i];
687     }
688     ierr = PetscFree(onodes1);CHKERRQ(ierr);
689     ierr = PetscFree(olengths1);CHKERRQ(ierr);
690 
691     /* Determine the number of messages to expect, their lengths, and their from-ids */
692     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
693     ierr = PetscFree(rw1);CHKERRQ(ierr);
694   }
695   /* Now post the Irecvs corresponding to these messages */
696   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
697 
698   /* Now  post the sends */
699   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
700   for (i=0; i<nrqr; ++i) {
701     j    = recv_status[i].MPI_SOURCE;
702     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
703   }
704 
705   /* receive work done on other processors */
706   {
707     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
708     PetscMPIInt idex;
709     PetscBT     table_i;
710     MPI_Status  *status2;
711 
712     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
713     for (i=0; i<nrqs; ++i) {
714       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr);
715       /* Process the message */
716       rbuf2_i = rbuf2[idex];
717       ct1     = 2*rbuf2_i[0]+1;
718       jmax    = rbuf2[idex][0];
719       for (j=1; j<=jmax; j++) {
720         max     = rbuf2_i[2*j];
721         is_no   = rbuf2_i[2*j-1];
722         isz_i   = isz[is_no];
723         table_i = table[is_no];
724 #if defined(PETSC_USE_CTABLE)
725         table_data_i = table_data[is_no];
726 #else
727         data_i  = data[is_no];
728 #endif
729         for (k=0; k<max; k++,ct1++) {
730           row = rbuf2_i[ct1];
731           if (!PetscBTLookupSet(table_i,row)) {
732 #if defined(PETSC_USE_CTABLE)
733             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
734 #else
735             data_i[isz_i] = row;
736 #endif
737             isz_i++;
738           }
739         }
740         isz[is_no] = isz_i;
741       }
742     }
743 
744     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);}
745     ierr = PetscFree(status2);CHKERRQ(ierr);
746   }
747 
748 #if defined(PETSC_USE_CTABLE)
749   tcount_max = 0;
750   for (i=0; i<imax; ++i) {
751     table_data_i = table_data[i];
752     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
753     if (tcount_max < tcount) tcount_max = tcount;
754   }
755   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
756 #endif
757 
758   for (i=0; i<imax; ++i) {
759 #if defined(PETSC_USE_CTABLE)
760     PetscTablePosition tpos;
761     table_data_i = table_data[i];
762 
763     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
764     while (tpos) {
765       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
766       tdata[--j] = --k;
767     }
768     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
769 #else
770     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
771 #endif
772     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
773   }
774 
775   ierr = PetscFree(iscomms);CHKERRQ(ierr);
776   ierr = PetscFree(onodes2);CHKERRQ(ierr);
777   ierr = PetscFree(olengths2);CHKERRQ(ierr);
778 
779   ierr = PetscFree(pa);CHKERRQ(ierr);
780   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
781   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
782   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
783   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
784   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
785   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
786   ierr = PetscFree(s_status);CHKERRQ(ierr);
787   ierr = PetscFree(recv_status);CHKERRQ(ierr);
788   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
789   ierr = PetscFree(xdata);CHKERRQ(ierr);
790   ierr = PetscFree(isz1);CHKERRQ(ierr);
791 #if defined(PETSC_USE_CTABLE)
792   for (i=0; i<imax; i++) {
793     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
794   }
795   ierr = PetscFree(table_data);CHKERRQ(ierr);
796   ierr = PetscFree(tdata);CHKERRQ(ierr);
797   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
798 #else
799   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
800 #endif
801   PetscFunctionReturn(0);
802 }
803 
804 /*
805    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
806        the work on the local processor.
807 
808      Inputs:
809       C      - MATMPIAIJ matrix;
810       imax   - total number of index sets processed at a time;
811       table  - an array of bitmaps (PetscBT), each of size M bits.
812 
813      Output:
814       isz    - array containing the count of the solution elements corresponding
815                to each index set;
816       data or table_data  - pointer to the solutions
817 */
818 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
819 {
820   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
821   Mat        A  = c->A,B = c->B;
822   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
823   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
824   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
825   PetscBT    table_i;
826 #if defined(PETSC_USE_CTABLE)
827   PetscTable         table_data_i;
828   PetscErrorCode     ierr;
829   PetscTablePosition tpos;
830   PetscInt           tcount,*tdata;
831 #else
832   PetscInt           *data_i;
833 #endif
834 
835   PetscFunctionBegin;
836   rstart = C->rmap->rstart;
837   cstart = C->cmap->rstart;
838   ai     = a->i;
839   aj     = a->j;
840   bi     = b->i;
841   bj     = b->j;
842   garray = c->garray;
843 
844   for (i=0; i<imax; i++) {
845 #if defined(PETSC_USE_CTABLE)
846     /* copy existing entries of table_data_i into tdata[] */
847     table_data_i = table_data[i];
848     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
849     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);
850 
851     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
852     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
853     while (tpos) {
854       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
855       tdata[--j] = --row;
856       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
857     }
858 #else
859     data_i  = data[i];
860 #endif
861     table_i = table[i];
862     isz_i   = isz[i];
863     max     = isz[i];
864 
865     for (j=0; j<max; j++) {
866 #if defined(PETSC_USE_CTABLE)
867       row   = tdata[j] - rstart;
868 #else
869       row   = data_i[j] - rstart;
870 #endif
871       start = ai[row];
872       end   = ai[row+1];
873       for (k=start; k<end; k++) { /* Amat */
874         val = aj[k] + cstart;
875         if (!PetscBTLookupSet(table_i,val)) {
876 #if defined(PETSC_USE_CTABLE)
877           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
878 #else
879           data_i[isz_i] = val;
880 #endif
881           isz_i++;
882         }
883       }
884       start = bi[row];
885       end   = bi[row+1];
886       for (k=start; k<end; k++) { /* Bmat */
887         val = garray[bj[k]];
888         if (!PetscBTLookupSet(table_i,val)) {
889 #if defined(PETSC_USE_CTABLE)
890           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
891 #else
892           data_i[isz_i] = val;
893 #endif
894           isz_i++;
895         }
896       }
897     }
898     isz[i] = isz_i;
899 
900 #if defined(PETSC_USE_CTABLE)
901     ierr = PetscFree(tdata);CHKERRQ(ierr);
902 #endif
903   }
904   PetscFunctionReturn(0);
905 }
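
/*
   The deduplication in MatIncreaseOverlap_MPIAIJ_Local() above relies on the PetscBT bitmap
   idiom, shown here in isolation as a sketch (M, ncand, cand and found are hypothetical names):

     PetscBT  bt;
     PetscInt nfound = 0;

     ierr = PetscBTCreate(M,&bt);CHKERRQ(ierr);   // one bit per possible index, all initially clear
     for (k=0; k<ncand; k++) {
       // PetscBTLookupSet() returns the previous value of the bit and then sets it,
       // so each candidate index is recorded at most once
       if (!PetscBTLookupSet(bt,cand[k])) found[nfound++] = cand[k];
     }
     ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
*/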
906 
907 /*
908       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
909          and return the output
910 
911          Input:
912            C    - the matrix
913            nrqr - number of messages being processed.
914            rbuf - an array of pointers to the received requests
915 
916          Output:
917            xdata - array of messages to be sent back
918            isz1  - size of each message
919 
920   For better efficiency perhaps we should malloc each xdata[i] separately;
921 then, if a reallocation is required, we need only copy the data for that one message
922 rather than for all previous messages, as happens now where a single large chunk of
923 memory is used.
924 
925 */
926 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
927 {
928   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
929   Mat            A  = c->A,B = c->B;
930   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
931   PetscErrorCode ierr;
932   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
933   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
934   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
935   PetscInt       *rbuf_i,kmax,rbuf_0;
936   PetscBT        xtable;
937 
938   PetscFunctionBegin;
939   m      = C->rmap->N;
940   rstart = C->rmap->rstart;
941   cstart = C->cmap->rstart;
942   ai     = a->i;
943   aj     = a->j;
944   bi     = b->i;
945   bj     = b->j;
946   garray = c->garray;
947 
948   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
949     rbuf_i =  rbuf[i];
950     rbuf_0 =  rbuf_i[0];
951     ct    += rbuf_0;
952     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
953   }
954 
955   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
956   else max1 = 1;
957   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
958   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
959   ++no_malloc;
960   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
961   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
962 
963   ct3 = 0;
964   for (i=0; i<nrqr; i++) { /* for each message from proc i */
965     rbuf_i =  rbuf[i];
966     rbuf_0 =  rbuf_i[0];
967     ct1    =  2*rbuf_0+1;
968     ct2    =  ct1;
969     ct3   += ct1;
970     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
971       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
972       oct2 = ct2;
973       kmax = rbuf_i[2*j];
974       for (k=0; k<kmax; k++,ct1++) {
975         row = rbuf_i[ct1];
976         if (!PetscBTLookupSet(xtable,row)) {
977           if (!(ct3 < mem_estimate)) {
978             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
979             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
980             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
981             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
982             xdata[0]     = tmp;
983             mem_estimate = new_estimate; ++no_malloc;
984             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
985           }
986           xdata[i][ct2++] = row;
987           ct3++;
988         }
989       }
990       for (k=oct2,max2=ct2; k<max2; k++) {
991         row   = xdata[i][k] - rstart;
992         start = ai[row];
993         end   = ai[row+1];
994         for (l=start; l<end; l++) {
995           val = aj[l] + cstart;
996           if (!PetscBTLookupSet(xtable,val)) {
997             if (!(ct3 < mem_estimate)) {
998               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
999               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1000               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1001               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1002               xdata[0]     = tmp;
1003               mem_estimate = new_estimate; ++no_malloc;
1004               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1005             }
1006             xdata[i][ct2++] = val;
1007             ct3++;
1008           }
1009         }
1010         start = bi[row];
1011         end   = bi[row+1];
1012         for (l=start; l<end; l++) {
1013           val = garray[bj[l]];
1014           if (!PetscBTLookupSet(xtable,val)) {
1015             if (!(ct3 < mem_estimate)) {
1016               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1017               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1018               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1019               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1020               xdata[0]     = tmp;
1021               mem_estimate = new_estimate; ++no_malloc;
1022               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1023             }
1024             xdata[i][ct2++] = val;
1025             ct3++;
1026           }
1027         }
1028       }
1029       /* Update the header*/
1030       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1031       xdata[i][2*j-1] = rbuf_i[2*j-1];
1032     }
1033     xdata[i][0] = rbuf_0;
1034     xdata[i+1]  = xdata[i] + ct2;
1035     isz1[i]     = ct2; /* size of each message */
1036   }
1037   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1038   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1039   PetscFunctionReturn(0);
1040 }
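
/*
   The grow-and-rebase step above appears three times in the loop; as a sketch, it could be
   factored into a helper like the hypothetical one below (not compiled here; the no_malloc
   counter is left out). Because all xdata[i] live in one contiguous allocation, growing it
   means copying the whole block and re-deriving each xdata[i] from the finished sizes isz1[].

     static PetscErrorCode GrowXData(PetscInt i,PetscInt *isz1,PetscInt **xdata,PetscInt *mem_estimate)
     {
       PetscInt       *tmp,ctr,new_estimate = (PetscInt)(1.5*(*mem_estimate))+1;
       PetscErrorCode ierr;

       PetscFunctionBegin;
       ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
       ierr = PetscArraycpy(tmp,xdata[0],*mem_estimate);CHKERRQ(ierr);
       ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
       xdata[0]      = tmp;
       *mem_estimate = new_estimate;
       for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
       PetscFunctionReturn(0);
     }
*/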
1041 /* -------------------------------------------------------------------------*/
1042 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1043 /*
1044     Every processor gets the entire matrix
1045 */
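/*
   A usage sketch (assumed, not taken from a caller in this file): Bseq[0] ends up holding the
   whole matrix assembled sequentially on this rank, and the structure can be reused on later calls.

     Mat *Bseq;

     ierr = MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);CHKERRQ(ierr);
     // ... use Bseq[0]; later, regather only the numerical values into the same structure:
     ierr = MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_REUSE_MATRIX,&Bseq);CHKERRQ(ierr);

   With MAT_DO_NOT_GET_VALUES only the nonzero pattern is gathered, which suffices when the caller
   needs just the global sparsity structure.
*/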
1046 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1047 {
1048   Mat            B;
1049   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1050   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1051   PetscErrorCode ierr;
1052   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1053   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1054   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1055 
1056   PetscFunctionBegin;
1057   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1058   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1059   if (scall == MAT_INITIAL_MATRIX) {
1060     /* ----------------------------------------------------------------
1061          Tell every processor the number of nonzeros per row
1062     */
1063     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1064     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1065       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1066     }
1067     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1068     for (i=0; i<size; i++) {
1069       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1070       displs[i]     = A->rmap->range[i];
1071     }
1072 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1073     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1074 #else
1075     sendcount = A->rmap->rend - A->rmap->rstart;
1076     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1077 #endif
1078     /* ---------------------------------------------------------------
1079          Create the sequential matrix of the same type as the local block diagonal
1080     */
1081     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1082     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1083     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1084     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1085     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1086     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1087     **Bin = B;
1088     b     = (Mat_SeqAIJ*)B->data;
1089 
1090     /*--------------------------------------------------------------------
1091        Copy my part of matrix column indices over
1092     */
1093     sendcount  = ad->nz + bd->nz;
1094     jsendbuf   = b->j + b->i[rstarts[rank]];
1095     a_jsendbuf = ad->j;
1096     b_jsendbuf = bd->j;
1097     n          = A->rmap->rend - A->rmap->rstart;
1098     cnt        = 0;
1099     for (i=0; i<n; i++) {
1100       /* put in lower diagonal portion */
1101       m = bd->i[i+1] - bd->i[i];
1102       while (m > 0) {
1103         /* is it above diagonal (in bd (compressed) numbering) */
1104         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1105         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1106         m--;
1107       }
1108 
1109       /* put in diagonal portion */
1110       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1111         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1112       }
1113 
1114       /* put in upper diagonal portion */
1115       while (m-- > 0) {
1116         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1117       }
1118     }
1119     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1120 
1121     /*--------------------------------------------------------------------
1122        Gather all column indices to all processors
1123     */
1124     for (i=0; i<size; i++) {
1125       recvcounts[i] = 0;
1126       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1127         recvcounts[i] += lens[j];
1128       }
1129     }
1130     displs[0] = 0;
1131     for (i=1; i<size; i++) {
1132       displs[i] = displs[i-1] + recvcounts[i-1];
1133     }
1134 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1135     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1136 #else
1137     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1138 #endif
1139     /*--------------------------------------------------------------------
1140         Assemble the matrix into usable form (note: numerical values are not yet set)
1141     */
1142     /* set the b->ilen (length of each row) values */
1143     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1144     /* set the b->i indices */
1145     b->i[0] = 0;
1146     for (i=1; i<=A->rmap->N; i++) {
1147       b->i[i] = b->i[i-1] + lens[i-1];
1148     }
1149     ierr = PetscFree(lens);CHKERRQ(ierr);
1150     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1151     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1152 
1153   } else {
1154     B = **Bin;
1155     b = (Mat_SeqAIJ*)B->data;
1156   }
1157 
1158   /*--------------------------------------------------------------------
1159        Copy my part of matrix numerical values into the values location
1160   */
1161   if (flag == MAT_GET_VALUES) {
1162     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1163     MatScalar         *sendbuf,*recvbuf;
1164 
1165     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1166     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1167     sendcount = ad->nz + bd->nz;
1168     sendbuf   = b->a + b->i[rstarts[rank]];
1169     a_sendbuf = ada;
1170     b_sendbuf = bda;
1171     b_sendj   = bd->j;
1172     n         = A->rmap->rend - A->rmap->rstart;
1173     cnt       = 0;
1174     for (i=0; i<n; i++) {
1175       /* put in lower diagonal portion */
1176       m = bd->i[i+1] - bd->i[i];
1177       while (m > 0) {
1178         /* is it above diagonal (in bd (compressed) numbering) */
1179         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1180         sendbuf[cnt++] = *b_sendbuf++;
1181         m--;
1182         b_sendj++;
1183       }
1184 
1185       /* put in diagonal portion */
1186       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1187         sendbuf[cnt++] = *a_sendbuf++;
1188       }
1189 
1190       /* put in upper diagonal portion */
1191       while (m-- > 0) {
1192         sendbuf[cnt++] = *b_sendbuf++;
1193         b_sendj++;
1194       }
1195     }
1196     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1197 
1198     /* -----------------------------------------------------------------
1199        Gather all numerical values to all processors
1200     */
1201     if (!recvcounts) {
1202       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1203     }
1204     for (i=0; i<size; i++) {
1205       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1206     }
1207     displs[0] = 0;
1208     for (i=1; i<size; i++) {
1209       displs[i] = displs[i-1] + recvcounts[i-1];
1210     }
1211     recvbuf = b->a;
1212 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1213     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1214 #else
1215     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1216 #endif
1217     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1218     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1219   }  /* endof (flag == MAT_GET_VALUES) */
1220   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1221 
1222   if (A->symmetric) {
1223     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1224   } else if (A->hermitian) {
1225     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1226   } else if (A->structurally_symmetric) {
1227     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1228   }
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1233 {
1234   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1235   Mat            submat,A = c->A,B = c->B;
1236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1237   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1238   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1239   const PetscInt *icol,*irow;
1240   PetscInt       nrow,ncol,start;
1241   PetscErrorCode ierr;
1242   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1243   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1244   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1245   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1246   PetscInt       *lens,rmax,ncols,*cols,Crow;
1247 #if defined(PETSC_USE_CTABLE)
1248   PetscTable     cmap,rmap;
1249   PetscInt       *cmap_loc,*rmap_loc;
1250 #else
1251   PetscInt       *cmap,*rmap;
1252 #endif
1253   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1254   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1255   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1256   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1257   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1258   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1259   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1260   MPI_Comm       comm;
1261   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1262   PetscMPIInt    *onodes1,*olengths1,idex,end;
1263   Mat_SubSppt    *smatis1;
1264   PetscBool      isrowsorted,iscolsorted;
1265 
1266   PetscFunctionBegin;
1267   PetscValidLogicalCollectiveInt(C,ismax,2);
1268   PetscValidLogicalCollectiveEnum(C,scall,5);
1269   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1270   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1271   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1272   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1273   size = c->size;
1274   rank = c->rank;
1275 
1276   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1277   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1278   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1279   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1280   if (allcolumns) {
1281     icol = NULL;
1282     ncol = C->cmap->N;
1283   } else {
1284     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1285     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1286   }
1287 
1288   if (scall == MAT_INITIAL_MATRIX) {
1289     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1290 
1291     /* Get some new tags to keep the communication clean */
1292     tag1 = ((PetscObject)C)->tag;
1293     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1294     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1295 
1296     /* evaluate communication - message to whom, length of message, and buffer space
1297      required. Based on this, buffers are allocated, and data copied into them */
1298     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1299     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1300 
1301     /* w1[proc] = num of rows owned by proc -- to be requested */
1302     proc = 0;
1303     nrqs = 0; /* num of outgoing messages */
1304     for (j=0; j<nrow; j++) {
1305       row  = irow[j];
1306       if (!isrowsorted) proc = 0;
1307       while (row >= C->rmap->range[proc+1]) proc++;
1308       w1[proc]++;
1309       row2proc[j] = proc; /* map row index to proc */
1310 
1311       if (proc != rank && !w2[proc]) {
1312         w2[proc] = 1; nrqs++;
1313       }
1314     }
1315     w1[rank] = 0;  /* rows owned by self will not be requested */
1316 
1317     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1318     for (proc=0,j=0; proc<size; proc++) {
1319       if (w1[proc]) { pa[j++] = proc;}
1320     }
1321 
1322     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1323     msz = 0;              /* total mesg length (for all procs) */
1324     for (i=0; i<nrqs; i++) {
1325       proc      = pa[i];
1326       w1[proc] += 3;
1327       msz      += w1[proc];
1328     }
1329     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1330 
1331     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1332     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1333     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1334 
1335     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1336        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1337     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1338 
1339     /* Now post the Irecvs corresponding to these messages */
1340     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1341 
1342     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1343     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1344 
1345     /* Allocate Memory for outgoing messages */
1346     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1347     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1348     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1349 
1350     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1351     iptr = tmp;
1352     for (i=0; i<nrqs; i++) {
1353       proc        = pa[i];
1354       sbuf1[proc] = iptr;
1355       iptr       += w1[proc];
1356     }
1357 
1358     /* Form the outgoing messages */
1359     /* Initialize the header space */
1360     for (i=0; i<nrqs; i++) {
1361       proc      = pa[i];
1362       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1363       ptr[proc] = sbuf1[proc] + 3;
1364     }
1365 
1366     /* Parse the isrow and copy data into outbuf */
1367     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1368     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1369       proc = row2proc[j];
1370       if (proc != rank) { /* copy to the outgoing buf*/
1371         *ptr[proc] = irow[j];
1372         ctr[proc]++; ptr[proc]++;
1373       }
1374     }
1375 
1376     /* Update the headers for the current IS */
1377     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1378       if ((ctr_j = ctr[j])) {
1379         sbuf1_j        = sbuf1[j];
1380         k              = ++sbuf1_j[0];
1381         sbuf1_j[2*k]   = ctr_j;
1382         sbuf1_j[2*k-1] = 0;
1383       }
1384     }
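         /* Each request sbuf1[proc] now has the layout [nis | is_no, count | row indices...],
            with nis = 1 and is_no = 0 here since a single IS is handled per call. */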
1385 
1386     /* Now post the sends */
1387     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1388     for (i=0; i<nrqs; ++i) {
1389       proc = pa[i];
1390       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1391     }
1392 
1393     /* Post Receives to capture the buffer size */
1394     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1395     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1396 
1397     rbuf2[0] = tmp + msz;
1398     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1399 
1400     for (i=0; i<nrqs; ++i) {
1401       proc = pa[i];
1402       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1403     }
1404 
1405     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1406 
1407     /* Send to other procs the buf size they should allocate */
1408     /* Receive messages*/
1409     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1410     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1411 
1412     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1413     for (i=0; i<nrqr; ++i) {
1414       req_size[i] = 0;
1415       rbuf1_i        = rbuf1[i];
1416       start          = 2*rbuf1_i[0] + 1;
1417       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1418       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1419       sbuf2_i        = sbuf2[i];
1420       for (j=start; j<end; j++) {
1421         k            = rbuf1_i[j] - rstart;
1422         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1423         sbuf2_i[j]   = ncols;
1424         req_size[i] += ncols;
1425       }
1426       req_source1[i] = r_status1[i].MPI_SOURCE;
1427 
1428       /* form the header */
1429       sbuf2_i[0] = req_size[i];
1430       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1431 
1432       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1433     }
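         /* The reply sbuf2[i] mirrors the request header: sbuf2_i[0] holds the total number of
            nonzeros this rank will send back, entries 1..start-1 echo rbuf1_i, and entries
            start..end-1 give the nonzero count of each requested row. */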
1434 
1435     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1436     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1437 
1438     /* rbuf2 is received; post Irecvs for the column indices a->j */
1439     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1440 
1441     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1442     for (i=0; i<nrqs; ++i) {
1443       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1444       req_source2[i] = r_status2[i].MPI_SOURCE;
1445       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1446     }
1447 
1448     /* Wait on sends1 and sends2 */
1449     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1450     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1451     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1452     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1453 
1454     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1455     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1456 
1457     /* Now allocate sending buffers for a->j, and send them off */
1458     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1459     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1460     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1461     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1462 
1463     for (i=0; i<nrqr; i++) { /* for each requested message */
1464       rbuf1_i   = rbuf1[i];
1465       sbuf_aj_i = sbuf_aj[i];
1466       ct1       = 2*rbuf1_i[0] + 1;
1467       ct2       = 0;
1468       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1469 
1470       kmax = rbuf1[i][2];
1471       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1472         row    = rbuf1_i[ct1] - rstart;
1473         nzA    = ai[row+1] - ai[row];
1474         nzB    = bi[row+1] - bi[row];
1475         ncols  = nzA + nzB;
1476         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1477 
1478         /* load the column indices for this row into cols*/
1479         cols = sbuf_aj_i + ct2;
1480 
1481         lwrite = 0;
1482         for (l=0; l<nzB; l++) {
1483           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1484         }
1485         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1486         for (l=0; l<nzB; l++) {
1487           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1488         }
1489 
1490         ct2 += ncols;
1491       }
1492       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1493     }
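         /* Packing order per row: B-part columns with global index < cstart, then the A-part
            columns shifted to global numbering, then B-part columns >= cend; since bmap (c->garray)
            is sorted, the global column indices of each row are emitted in ascending order. */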
1494 
1495     /* create column map (cmap): global col of C -> local col of submat */
1496 #if defined(PETSC_USE_CTABLE)
1497     if (!allcolumns) {
1498       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1499       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1500       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1501         if (icol[j] >= cstart && icol[j] <cend) {
1502           cmap_loc[icol[j] - cstart] = j+1;
1503         } else { /* use PetscTable for non-local col indices */
1504           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1505         }
1506       }
1507     } else {
1508       cmap     = NULL;
1509       cmap_loc = NULL;
1510     }
1511     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1512 #else
1513     if (!allcolumns) {
1514       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1515       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1516     } else {
1517       cmap = NULL;
1518     }
1519 #endif
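         /* cmap maps a global column of C to (local submatrix column + 1); 0 means the column is
            not selected by iscol.  With PETSC_USE_CTABLE the columns of the diagonal block are kept
            in the dense array cmap_loc and only off-process columns go into the PetscTable. */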
1520 
1521     /* Create lens for MatSeqAIJSetPreallocation() */
1522     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1523 
1524     /* Compute lens from local part of C */
1525     for (j=0; j<nrow; j++) {
1526       row  = irow[j];
1527       proc = row2proc[j];
1528       if (proc == rank) {
1529         /* diagonal part A = c->A */
1530         ncols = ai[row-rstart+1] - ai[row-rstart];
1531         cols  = aj + ai[row-rstart];
1532         if (!allcolumns) {
1533           for (k=0; k<ncols; k++) {
1534 #if defined(PETSC_USE_CTABLE)
1535             tcol = cmap_loc[cols[k]];
1536 #else
1537             tcol = cmap[cols[k]+cstart];
1538 #endif
1539             if (tcol) lens[j]++;
1540           }
1541         } else { /* allcolumns */
1542           lens[j] = ncols;
1543         }
1544 
1545         /* off-diagonal part B = c->B */
1546         ncols = bi[row-rstart+1] - bi[row-rstart];
1547         cols  = bj + bi[row-rstart];
1548         if (!allcolumns) {
1549           for (k=0; k<ncols; k++) {
1550 #if defined(PETSC_USE_CTABLE)
1551             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1552 #else
1553             tcol = cmap[bmap[cols[k]]];
1554 #endif
1555             if (tcol) lens[j]++;
1556           }
1557         } else { /* allcolumns */
1558           lens[j] += ncols;
1559         }
1560       }
1561     }
1562 
1563     /* Create row map (rmap): global row of C -> local row of submat */
1564 #if defined(PETSC_USE_CTABLE)
1565     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1566     for (j=0; j<nrow; j++) {
1567       row  = irow[j];
1568       proc = row2proc[j];
1569       if (proc == rank) { /* a local row */
1570         rmap_loc[row - rstart] = j;
1571       } else {
1572         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1573       }
1574     }
1575 #else
1576     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1577     for (j=0; j<nrow; j++) {
1578       rmap[irow[j]] = j;
1579     }
1580 #endif
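         /* rmap maps a global row of C to its local row in submat.  With PETSC_USE_CTABLE the
            locally owned rows are stored 0-based in rmap_loc, while rows requested from other ranks
            go into the PetscTable rmap with key and value shifted by +1 (PetscTable cannot hold 0);
            without CTABLE a plain array over all global rows is used. */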
1581 
1582     /* Update lens from offproc data */
1583     /* recv a->j is done */
1584     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1585     for (i=0; i<nrqs; i++) {
1586       proc    = pa[i];
1587       sbuf1_i = sbuf1[proc];
1588       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1589       ct1     = 2 + 1;
1590       ct2     = 0;
1591       rbuf2_i = rbuf2[i]; /* received length of C->j */
1592       rbuf3_i = rbuf3[i]; /* received C->j */
1593 
1594       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1595       max1   = sbuf1_i[2];
1596       for (k=0; k<max1; k++,ct1++) {
1597 #if defined(PETSC_USE_CTABLE)
1598         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1599         row--;
1600         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1601 #else
1602         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1603 #endif
1604         /* Now, store row index of submat in sbuf1_i[ct1] */
1605         sbuf1_i[ct1] = row;
1606 
1607         nnz = rbuf2_i[ct1];
1608         if (!allcolumns) {
1609           for (l=0; l<nnz; l++,ct2++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1612               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1613             } else {
1614               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1615             }
1616 #else
1617             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1618 #endif
1619             if (tcol) lens[row]++;
1620           }
1621         } else { /* allcolumns */
1622           lens[row] += nnz;
1623         }
1624       }
1625     }
1626     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1627     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1628 
1629     /* Create the submatrices */
1630     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1631     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1632 
1633     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1634     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1635     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1636     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1637     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1638 
1639     /* create struct Mat_SubSppt and attach it to submat */
1640     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1641     subc = (Mat_SeqAIJ*)submat->data;
1642     subc->submatis1 = smatis1;
1643 
1644     smatis1->id          = 0;
1645     smatis1->nrqs        = nrqs;
1646     smatis1->nrqr        = nrqr;
1647     smatis1->rbuf1       = rbuf1;
1648     smatis1->rbuf2       = rbuf2;
1649     smatis1->rbuf3       = rbuf3;
1650     smatis1->sbuf2       = sbuf2;
1651     smatis1->req_source2 = req_source2;
1652 
1653     smatis1->sbuf1       = sbuf1;
1654     smatis1->ptr         = ptr;
1655     smatis1->tmp         = tmp;
1656     smatis1->ctr         = ctr;
1657 
1658     smatis1->pa           = pa;
1659     smatis1->req_size     = req_size;
1660     smatis1->req_source1  = req_source1;
1661 
1662     smatis1->allcolumns  = allcolumns;
1663     smatis1->singleis    = PETSC_TRUE;
1664     smatis1->row2proc    = row2proc;
1665     smatis1->rmap        = rmap;
1666     smatis1->cmap        = cmap;
1667 #if defined(PETSC_USE_CTABLE)
1668     smatis1->rmap_loc    = rmap_loc;
1669     smatis1->cmap_loc    = cmap_loc;
1670 #endif
1671 
1672     smatis1->destroy     = submat->ops->destroy;
1673     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1674     submat->factortype   = C->factortype;
1675 
1676     /* compute rmax */
1677     rmax = 0;
1678     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
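         /* rmax is the largest number of nonzeros in any submatrix row; it sizes the subcols and
            subvals work arrays used below to insert one row at a time. */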
1679 
1680   } else { /* scall == MAT_REUSE_MATRIX */
1681     submat = submats[0];
1682     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");
1683 
1684     subc    = (Mat_SeqAIJ*)submat->data;
1685     rmax    = subc->rmax;
1686     smatis1 = subc->submatis1;
1687     nrqs        = smatis1->nrqs;
1688     nrqr        = smatis1->nrqr;
1689     rbuf1       = smatis1->rbuf1;
1690     rbuf2       = smatis1->rbuf2;
1691     rbuf3       = smatis1->rbuf3;
1692     req_source2 = smatis1->req_source2;
1693 
1694     sbuf1     = smatis1->sbuf1;
1695     sbuf2     = smatis1->sbuf2;
1696     ptr       = smatis1->ptr;
1697     tmp       = smatis1->tmp;
1698     ctr       = smatis1->ctr;
1699 
1700     pa         = smatis1->pa;
1701     req_size   = smatis1->req_size;
1702     req_source1 = smatis1->req_source1;
1703 
1704     allcolumns = smatis1->allcolumns;
1705     row2proc   = smatis1->row2proc;
1706     rmap       = smatis1->rmap;
1707     cmap       = smatis1->cmap;
1708 #if defined(PETSC_USE_CTABLE)
1709     rmap_loc   = smatis1->rmap_loc;
1710     cmap_loc   = smatis1->cmap_loc;
1711 #endif
1712   }
1713 
1714   /* Post recv matrix values */
1715   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1716   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1717   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1718   for (i=0; i<nrqs; ++i) {
1719     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1720     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1721   }
1722 
1723   /* Allocate sending buffers for a->a, and send them off */
1724   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1725   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1726   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1727   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1728 
1729   for (i=0; i<nrqr; i++) {
1730     rbuf1_i   = rbuf1[i];
1731     sbuf_aa_i = sbuf_aa[i];
1732     ct1       = 2*rbuf1_i[0]+1;
1733     ct2       = 0;
1734     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1735 
1736     kmax = rbuf1_i[2];
1737     for (k=0; k<kmax; k++,ct1++) {
1738       row = rbuf1_i[ct1] - rstart;
1739       nzA = ai[row+1] - ai[row];
1740       nzB = bi[row+1] - bi[row];
1741       ncols  = nzA + nzB;
1742       cworkB = bj + bi[row];
1743       vworkA = a_a + ai[row];
1744       vworkB = b_a + bi[row];
1745 
1746       /* load the column values for this row into vals*/
1747       vals = sbuf_aa_i + ct2;
1748 
1749       lwrite = 0;
1750       for (l=0; l<nzB; l++) {
1751         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1752       }
1753       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1754       for (l=0; l<nzB; l++) {
1755         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1756       }
1757 
1758       ct2 += ncols;
1759     }
1760     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1761   }
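       /* The values in sbuf_aa are packed per row in exactly the same order as the column indices
          in sbuf_aj were during the initial call, so the receiver can pair rbuf3[i][l] with
          rbuf4[i][l] entry by entry. */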
1762 
1763   /* Assemble submat */
1764   /* First assemble the local rows */
1765   for (j=0; j<nrow; j++) {
1766     row  = irow[j];
1767     proc = row2proc[j];
1768     if (proc == rank) {
1769       Crow = row - rstart;  /* local row index of C */
1770 #if defined(PETSC_USE_CTABLE)
1771       row = rmap_loc[Crow]; /* row index of submat */
1772 #else
1773       row = rmap[row];
1774 #endif
1775 
1776       if (allcolumns) {
1777         /* diagonal part A = c->A */
1778         ncols = ai[Crow+1] - ai[Crow];
1779         cols  = aj + ai[Crow];
1780         vals  = a_a + ai[Crow];
1781         i     = 0;
1782         for (k=0; k<ncols; k++) {
1783           subcols[i]   = cols[k] + cstart;
1784           subvals[i++] = vals[k];
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow+1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k=0; k<ncols; k++) {
1792           subcols[i]   = bmap[cols[k]];
1793           subvals[i++] = vals[k];
1794         }
1795 
1796         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1797 
1798       } else { /* !allcolumns */
1799 #if defined(PETSC_USE_CTABLE)
1800         /* diagonal part A = c->A */
1801         ncols = ai[Crow+1] - ai[Crow];
1802         cols  = aj + ai[Crow];
1803         vals  = a_a + ai[Crow];
1804         i     = 0;
1805         for (k=0; k<ncols; k++) {
1806           tcol = cmap_loc[cols[k]];
1807           if (tcol) {
1808             subcols[i]   = --tcol;
1809             subvals[i++] = vals[k];
1810           }
1811         }
1812 
1813         /* off-diagonal part B = c->B */
1814         ncols = bi[Crow+1] - bi[Crow];
1815         cols  = bj + bi[Crow];
1816         vals  = b_a + bi[Crow];
1817         for (k=0; k<ncols; k++) {
1818           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1819           if (tcol) {
1820             subcols[i]   = --tcol;
1821             subvals[i++] = vals[k];
1822           }
1823         }
1824 #else
1825         /* diagonal part A = c->A */
1826         ncols = ai[Crow+1] - ai[Crow];
1827         cols  = aj + ai[Crow];
1828         vals  = a_a + ai[Crow];
1829         i     = 0;
1830         for (k=0; k<ncols; k++) {
1831           tcol = cmap[cols[k]+cstart];
1832           if (tcol) {
1833             subcols[i]   = --tcol;
1834             subvals[i++] = vals[k];
1835           }
1836         }
1837 
1838         /* off-diagonal part B = c->B */
1839         ncols = bi[Crow+1] - bi[Crow];
1840         cols  = bj + bi[Crow];
1841         vals  = b_a + bi[Crow];
1842         for (k=0; k<ncols; k++) {
1843           tcol = cmap[bmap[cols[k]]];
1844           if (tcol) {
1845             subcols[i]   = --tcol;
1846             subvals[i++] = vals[k];
1847           }
1848         }
1849 #endif
1850         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1851       }
1852     }
1853   }
1854 
1855   /* Now assemble the off-proc rows */
1856   for (i=0; i<nrqs; i++) { /* for each requested message */
1857     /* recv values from other processes */
1858     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1859     proc    = pa[idex];
1860     sbuf1_i = sbuf1[proc];
1861     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1862     ct1     = 2 + 1;
1863     ct2     = 0; /* count of received C->j */
1864     ct3     = 0; /* count of received C->j that will be inserted into submat */
1865     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1866     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1867     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1868 
1869     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1870     max1 = sbuf1_i[2];             /* num of rows */
1871     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1872       row = sbuf1_i[ct1]; /* row index of submat */
1873       if (!allcolumns) {
1874         idex = 0;
1875         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1876           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1877           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1878 #if defined(PETSC_USE_CTABLE)
1879             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1880               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1881             } else {
1882               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1883             }
1884 #else
1885             tcol = cmap[rbuf3_i[ct2]];
1886 #endif
1887             if (tcol) {
1888               subcols[idex]   = --tcol; /* may not be sorted */
1889               subvals[idex++] = rbuf4_i[ct2];
1890 
1891               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1892                For reuse, we replace received C->j with index that should be inserted to submat */
1893               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1894             }
1895           }
1896           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1897         } else { /* scall == MAT_REUSE_MATRIX */
1898           submat = submats[0];
1899           subc   = (Mat_SeqAIJ*)submat->data;
1900 
1901           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1902           for (l=0; l<nnz; l++) {
1903             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1904             subvals[idex++] = rbuf4_i[ct2];
1905           }
1906 
1907           bj = subc->j + subc->i[row]; /* sorted column indices */
1908           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1909         }
1910       } else { /* allcolumns */
1911         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1912         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1913         ct2 += nnz;
1914       }
1915     }
1916   }
1917 
1918   /* sends of a->a are done */
1919   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1920   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1921 
1922   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   submats[0] = submat;
1925 
1926   /* Restore the indices */
1927   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1928   if (!allcolumns) {
1929     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1930   }
1931 
1932   /* Destroy allocated memory */
1933   for (i=0; i<nrqs; ++i) {
1934     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1935   }
1936   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1937   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1938   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1943     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1944   }
1945   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1946   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1951 {
1952   PetscErrorCode ierr;
1953   PetscInt       ncol;
1954   PetscBool      colflag,allcolumns=PETSC_FALSE;
1955 
1956   PetscFunctionBegin;
1957   /* Allocate memory to hold all the submatrices */
1958   if (scall == MAT_INITIAL_MATRIX) {
1959     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1960   }
1961 
1962   /* Check for special case: each processor gets entire matrix columns */
1963   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1964   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1965   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1966 
1967   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1968   PetscFunctionReturn(0);
1969 }
1970 
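     /* A minimal usage sketch (hypothetical names; `mat` is an assembled MATMPIAIJ and `rowis`,
        `colis` are index sets in global numbering).  MatCreateSubMatrices() dispatches to the
        routines below for this matrix type:

          Mat *submats;
          ierr = MatCreateSubMatrices(mat,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
          ... use submats[0], a sequential AIJ matrix living on each rank ...
          ierr = MatCreateSubMatrices(mat,1,&rowis,&colis,MAT_REUSE_MATRIX,&submats);CHKERRQ(ierr);
          ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
     */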
1971 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1972 {
1973   PetscErrorCode ierr;
1974   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1975   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1976   Mat_SeqAIJ     *subc;
1977   Mat_SubSppt    *smat;
1978 
1979   PetscFunctionBegin;
1980   /* Check for special case: each processor has a single IS */
1981   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1982     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1983     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1984     PetscFunctionReturn(0);
1985   }
1986 
1987   /* Collect global wantallmatrix and nstages */
1988   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1989   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1990   if (!nmax) nmax = 1;
1991 
1992   if (scall == MAT_INITIAL_MATRIX) {
1993     /* Collect global wantallmatrix and nstages */
1994     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1995       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1996       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1997       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1998       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1999       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2000         wantallmatrix = PETSC_TRUE;
2001 
2002         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2003       }
2004     }
2005 
2006     /* Determine the number of stages through which submatrices are done
2007        Each stage will extract nmax submatrices.
2008        nmax is determined by the matrix column dimension.
2009        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2010     */
2011     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2012 
2013     in[0] = -1*(PetscInt)wantallmatrix;
2014     in[1] = nstages;
2015     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
2016     wantallmatrix = (PetscBool)(-out[0]);
2017     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
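         /* Because in[0] = -wantallmatrix, the MPI_MAX reduction returns -1 only if every rank set
            wantallmatrix, i.e. it acts as a logical AND; out[1] is the maximum local nstages so that
            all ranks execute the same number of stages. */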
2018 
2019   } else { /* MAT_REUSE_MATRIX */
2020     if (ismax) {
2021       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2022       smat = subc->submatis1;
2023     } else { /* (*submat)[0] is a dummy matrix */
2024       smat = (Mat_SubSppt*)(*submat)[0]->data;
2025     }
2026     if (!smat) {
2027       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2028       wantallmatrix = PETSC_TRUE;
2029     } else if (smat->singleis) {
2030       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2031       PetscFunctionReturn(0);
2032     } else {
2033       nstages = smat->nstages;
2034     }
2035   }
2036 
2037   if (wantallmatrix) {
2038     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2039     PetscFunctionReturn(0);
2040   }
2041 
2042   /* Allocate memory to hold all the submatrices and dummy submatrices */
2043   if (scall == MAT_INITIAL_MATRIX) {
2044     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2045   }
2046 
2047   for (i=0,pos=0; i<nstages; i++) {
2048     if (pos+nmax <= ismax) max_no = nmax;
2049     else if (pos >= ismax) max_no = 0;
2050     else                   max_no = ismax-pos;
2051 
2052     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2053     if (!max_no) {
2054       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2055         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2056         smat->nstages = nstages;
2057       }
2058       pos++; /* advance to next dummy matrix if any */
2059     } else pos += max_no;
2060   }
2061 
2062   if (ismax && scall == MAT_INITIAL_MATRIX) {
2063     /* save nstages for reuse */
2064     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2065     smat = subc->submatis1;
2066     smat->nstages = nstages;
2067   }
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 /* -------------------------------------------------------------------------*/
2072 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2073 {
2074   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2075   Mat            A  = c->A;
2076   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2077   const PetscInt **icol,**irow;
2078   PetscInt       *nrow,*ncol,start;
2079   PetscErrorCode ierr;
2080   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2081   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2082   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2083   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2084   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2085 #if defined(PETSC_USE_CTABLE)
2086   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2087 #else
2088   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2089 #endif
2090   const PetscInt *irow_i;
2091   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2092   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2093   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2094   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2095   MPI_Status     *r_status3,*r_status4,*s_status4;
2096   MPI_Comm       comm;
2097   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2098   PetscMPIInt    *onodes1,*olengths1,end;
2099   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2100   Mat_SubSppt    *smat_i;
2101   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2102   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2103 
2104   PetscFunctionBegin;
2105   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2106   size = c->size;
2107   rank = c->rank;
2108 
2109   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2110   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2111 
2112   for (i=0; i<ismax; i++) {
2113     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2114     if (!issorted[i]) iscsorted = issorted[i];
2115 
2116     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2117 
2118     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2119     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2120 
2121     /* Check for special case: allcolumn */
2122     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2123     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2124     if (colflag && ncol[i] == C->cmap->N) {
2125       allcolumns[i] = PETSC_TRUE;
2126       icol[i] = NULL;
2127     } else {
2128       allcolumns[i] = PETSC_FALSE;
2129       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2130     }
2131   }
2132 
2133   if (scall == MAT_REUSE_MATRIX) {
2134     /* Assume the new rows have the same lengths as the old rows */
2135     for (i=0; i<ismax; i++) {
2136       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2137       subc = (Mat_SeqAIJ*)submats[i]->data;
2138       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");
2139 
2140       /* Initialize the matrix as if empty */
2141       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2142 
2143       smat_i   = subc->submatis1;
2144 
2145       nrqs        = smat_i->nrqs;
2146       nrqr        = smat_i->nrqr;
2147       rbuf1       = smat_i->rbuf1;
2148       rbuf2       = smat_i->rbuf2;
2149       rbuf3       = smat_i->rbuf3;
2150       req_source2 = smat_i->req_source2;
2151 
2152       sbuf1     = smat_i->sbuf1;
2153       sbuf2     = smat_i->sbuf2;
2154       ptr       = smat_i->ptr;
2155       tmp       = smat_i->tmp;
2156       ctr       = smat_i->ctr;
2157 
2158       pa          = smat_i->pa;
2159       req_size    = smat_i->req_size;
2160       req_source1 = smat_i->req_source1;
2161 
2162       allcolumns[i] = smat_i->allcolumns;
2163       row2proc[i]   = smat_i->row2proc;
2164       rmap[i]       = smat_i->rmap;
2165       cmap[i]       = smat_i->cmap;
2166     }
2167 
2168     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2169       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2170       smat_i = (Mat_SubSppt*)submats[0]->data;
2171 
2172       nrqs        = smat_i->nrqs;
2173       nrqr        = smat_i->nrqr;
2174       rbuf1       = smat_i->rbuf1;
2175       rbuf2       = smat_i->rbuf2;
2176       rbuf3       = smat_i->rbuf3;
2177       req_source2 = smat_i->req_source2;
2178 
2179       sbuf1       = smat_i->sbuf1;
2180       sbuf2       = smat_i->sbuf2;
2181       ptr         = smat_i->ptr;
2182       tmp         = smat_i->tmp;
2183       ctr         = smat_i->ctr;
2184 
2185       pa          = smat_i->pa;
2186       req_size    = smat_i->req_size;
2187       req_source1 = smat_i->req_source1;
2188 
2189       allcolumns[0] = PETSC_FALSE;
2190     }
2191   } else { /* scall == MAT_INITIAL_MATRIX */
2192     /* Get some new tags to keep the communication clean */
2193     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2194     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2195 
2196     /* Evaluate communication: to whom messages go, their lengths, and the buffer space
2197      required. Based on this, buffers are allocated and data copied into them */
2198     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2199 
2200     for (i=0; i<ismax; i++) {
2201       jmax   = nrow[i];
2202       irow_i = irow[i];
2203 
2204       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2205       row2proc[i] = row2proc_i;
2206 
2207       if (issorted[i]) proc = 0;
2208       for (j=0; j<jmax; j++) {
2209         if (!issorted[i]) proc = 0;
2210         row = irow_i[j];
2211         while (row >= C->rmap->range[proc+1]) proc++;
2212         w4[proc]++;
2213         row2proc_i[j] = proc; /* map row index to proc */
2214       }
2215       for (j=0; j<size; j++) {
2216         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2217       }
2218     }
2219 
2220     nrqs     = 0;              /* no of outgoing messages */
2221     msz      = 0;              /* total mesg length (for all procs) */
2222     w1[rank] = 0;              /* no mesg sent to self */
2223     w3[rank] = 0;
2224     for (i=0; i<size; i++) {
2225       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2226     }
2227     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2228     for (i=0,j=0; i<size; i++) {
2229       if (w1[i]) { pa[j] = i; j++; }
2230     }
2231 
2232     /* Each message would have a header = 1 + 2*(no of IS) + data */
2233     for (i=0; i<nrqs; i++) {
2234       j      = pa[i];
2235       w1[j] += w2[j] + 2* w3[j];
2236       msz   += w1[j];
2237     }
2238     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2239 
2240     /* Determine the number of messages to expect, their lengths, and their source ranks (from-ids) */
2241     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2242     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2243 
2244     /* Now post the Irecvs corresponding to these messages */
2245     tag0 = ((PetscObject)C)->tag;
2246     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2247 
2248     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2249     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2250 
2251     /* Allocate Memory for outgoing messages */
2252     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2253     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2254     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2255 
2256     {
2257       PetscInt *iptr = tmp;
2258       k    = 0;
2259       for (i=0; i<nrqs; i++) {
2260         j        = pa[i];
2261         iptr    += k;
2262         sbuf1[j] = iptr;
2263         k        = w1[j];
2264       }
2265     }
2266 
2267     /* Form the outgoing messages. Initialize the header space */
2268     for (i=0; i<nrqs; i++) {
2269       j           = pa[i];
2270       sbuf1[j][0] = 0;
2271       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2272       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2273     }
2274 
2275     /* Parse the isrow and copy data into outbuf */
2276     for (i=0; i<ismax; i++) {
2277       row2proc_i = row2proc[i];
2278       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2279       irow_i = irow[i];
2280       jmax   = nrow[i];
2281       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2282         proc = row2proc_i[j];
2283         if (proc != rank) { /* copy to the outgoing buf*/
2284           ctr[proc]++;
2285           *ptr[proc] = irow_i[j];
2286           ptr[proc]++;
2287         }
2288       }
2289       /* Update the headers for the current IS */
2290       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2291         if ((ctr_j = ctr[j])) {
2292           sbuf1_j        = sbuf1[j];
2293           k              = ++sbuf1_j[0];
2294           sbuf1_j[2*k]   = ctr_j;
2295           sbuf1_j[2*k-1] = i;
2296         }
2297       }
2298     }
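         /* Each request sbuf1[j] now has the layout
              [nis | is_no_1, count_1 | ... | is_no_nis, count_nis | rows of IS is_no_1 | ...]
            where nis is the number of index sets that request rows owned by rank j. */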
2299 
2300     /*  Now  post the sends */
2301     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2302     for (i=0; i<nrqs; ++i) {
2303       j    = pa[i];
2304       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2305     }
2306 
2307     /* Post Receives to capture the buffer size */
2308     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2309     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2310     rbuf2[0] = tmp + msz;
2311     for (i=1; i<nrqs; ++i) {
2312       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2313     }
2314     for (i=0; i<nrqs; ++i) {
2315       j    = pa[i];
2316       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2317     }
2318 
2319     /* Send to other procs the buf size they should allocate */
2320     /* Receive messages*/
2321     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2322     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2323     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2324     {
2325       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2326       PetscInt   *sbuf2_i;
2327 
2328       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
2329       for (i=0; i<nrqr; ++i) {
2330         req_size[i] = 0;
2331         rbuf1_i        = rbuf1[i];
2332         start          = 2*rbuf1_i[0] + 1;
2333         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
2334         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2335         sbuf2_i        = sbuf2[i];
2336         for (j=start; j<end; j++) {
2337           id              = rbuf1_i[j] - rstart;
2338           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2339           sbuf2_i[j]      = ncols;
2340           req_size[i] += ncols;
2341         }
2342         req_source1[i] = r_status1[i].MPI_SOURCE;
2343         /* form the header */
2344         sbuf2_i[0] = req_size[i];
2345         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2346 
2347         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2348       }
2349     }
2350     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2351     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2352     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2353 
2354     /* Receive messages*/
2355     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2356     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2357 
2358     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
2359     for (i=0; i<nrqs; ++i) {
2360       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2361       req_source2[i] = r_status2[i].MPI_SOURCE;
2362       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2363     }
2364     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2365     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2366 
2367     /* Wait on sends1 and sends2 */
2368     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2369     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2370 
2371     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);}
2372     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);}
2373     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2374     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2375     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2376     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2377 
2378     /* Now allocate sending buffers for a->j, and send them off */
2379     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2380     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2381     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2382     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2383 
2384     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2385     {
2386       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2387       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2388       PetscInt cend = C->cmap->rend;
2389       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2390 
2391       for (i=0; i<nrqr; i++) {
2392         rbuf1_i   = rbuf1[i];
2393         sbuf_aj_i = sbuf_aj[i];
2394         ct1       = 2*rbuf1_i[0] + 1;
2395         ct2       = 0;
2396         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2397           kmax = rbuf1[i][2*j];
2398           for (k=0; k<kmax; k++,ct1++) {
2399             row    = rbuf1_i[ct1] - rstart;
2400             nzA    = a_i[row+1] - a_i[row];
2401             nzB    = b_i[row+1] - b_i[row];
2402             ncols  = nzA + nzB;
2403             cworkA = a_j + a_i[row];
2404             cworkB = b_j + b_i[row];
2405 
2406             /* load the column indices for this row into cols */
2407             cols = sbuf_aj_i + ct2;
2408 
2409             lwrite = 0;
2410             for (l=0; l<nzB; l++) {
2411               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2412             }
2413             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2414             for (l=0; l<nzB; l++) {
2415               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2416             }
2417 
2418             ct2 += ncols;
2419           }
2420         }
2421         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2422       }
2423     }
2424     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2425 
2426     /* create col map: global col of C -> local col of submatrices */
2427     {
2428       const PetscInt *icol_i;
2429 #if defined(PETSC_USE_CTABLE)
2430       for (i=0; i<ismax; i++) {
2431         if (!allcolumns[i]) {
2432           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2433 
2434           jmax   = ncol[i];
2435           icol_i = icol[i];
2436           cmap_i = cmap[i];
2437           for (j=0; j<jmax; j++) {
2438             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2439           }
2440         } else cmap[i] = NULL;
2441       }
2442 #else
2443       for (i=0; i<ismax; i++) {
2444         if (!allcolumns[i]) {
2445           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2446           jmax   = ncol[i];
2447           icol_i = icol[i];
2448           cmap_i = cmap[i];
2449           for (j=0; j<jmax; j++) {
2450             cmap_i[icol_i[j]] = j+1;
2451           }
2452         } else cmap[i] = NULL;
2453       }
2454 #endif
2455     }
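         /* cmap[i] maps a global column of C to (local column of submats[i] + 1); 0 means the
            column is not part of iscol[i].  When allcolumns[i] is set no map is needed because the
            submatrix keeps the global column numbering. */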
2456 
2457     /* Create lens which is required for MatCreate... */
2458     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2459     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2460 
2461     if (ismax) {
2462       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2463     }
2464     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2465 
2466     /* Update lens from local data */
2467     for (i=0; i<ismax; i++) {
2468       row2proc_i = row2proc[i];
2469       jmax = nrow[i];
2470       if (!allcolumns[i]) cmap_i = cmap[i];
2471       irow_i = irow[i];
2472       lens_i = lens[i];
2473       for (j=0; j<jmax; j++) {
2474         row = irow_i[j];
2475         proc = row2proc_i[j];
2476         if (proc == rank) {
2477           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2478           if (!allcolumns[i]) {
2479             for (k=0; k<ncols; k++) {
2480 #if defined(PETSC_USE_CTABLE)
2481               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2482 #else
2483               tcol = cmap_i[cols[k]];
2484 #endif
2485               if (tcol) lens_i[j]++;
2486             }
2487           } else { /* allcolumns */
2488             lens_i[j] = ncols;
2489           }
2490           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2491         }
2492       }
2493     }
2494 
2495     /* Create row map: global row of C -> local row of submatrices */
2496 #if defined(PETSC_USE_CTABLE)
2497     for (i=0; i<ismax; i++) {
2498       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2499       irow_i = irow[i];
2500       jmax   = nrow[i];
2501       for (j=0; j<jmax; j++) {
2502         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2503       }
2504     }
2505 #else
2506     for (i=0; i<ismax; i++) {
2507       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2508       rmap_i = rmap[i];
2509       irow_i = irow[i];
2510       jmax   = nrow[i];
2511       for (j=0; j<jmax; j++) {
2512         rmap_i[irow_i[j]] = j;
2513       }
2514     }
2515 #endif
2516 
2517     /* Update lens from offproc data */
2518     {
2519       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2520 
2521       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
2522       for (tmp2=0; tmp2<nrqs; tmp2++) {
2523         sbuf1_i = sbuf1[pa[tmp2]];
2524         jmax    = sbuf1_i[0];
2525         ct1     = 2*jmax+1;
2526         ct2     = 0;
2527         rbuf2_i = rbuf2[tmp2];
2528         rbuf3_i = rbuf3[tmp2];
2529         for (j=1; j<=jmax; j++) {
2530           is_no  = sbuf1_i[2*j-1];
2531           max1   = sbuf1_i[2*j];
2532           lens_i = lens[is_no];
2533           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2534           rmap_i = rmap[is_no];
2535           for (k=0; k<max1; k++,ct1++) {
2536 #if defined(PETSC_USE_CTABLE)
2537             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2538             row--;
2539             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2540 #else
2541             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2542 #endif
2543             max2 = rbuf2_i[ct1];
2544             for (l=0; l<max2; l++,ct2++) {
2545               if (!allcolumns[is_no]) {
2546 #if defined(PETSC_USE_CTABLE)
2547                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2548 #else
2549                 tcol = cmap_i[rbuf3_i[ct2]];
2550 #endif
2551                 if (tcol) lens_i[row]++;
2552               } else { /* allcolumns */
2553                 lens_i[row]++; /* lens_i[row] += max2 ? */
2554               }
2555             }
2556           }
2557         }
2558       }
2559     }
2560     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2561     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);}
2562     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2563     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2564 
2565     /* Create the submatrices */
2566     for (i=0; i<ismax; i++) {
2567       PetscInt    rbs,cbs;
2568 
2569       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2570       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2571 
2572       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2573       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2574 
2575       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2576       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2577       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2578 
2579       /* create struct Mat_SubSppt and attach it to submat */
2580       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2581       subc = (Mat_SeqAIJ*)submats[i]->data;
2582       subc->submatis1 = smat_i;
2583 
2584       smat_i->destroy          = submats[i]->ops->destroy;
2585       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2586       submats[i]->factortype   = C->factortype;
2587 
2588       smat_i->id          = i;
2589       smat_i->nrqs        = nrqs;
2590       smat_i->nrqr        = nrqr;
2591       smat_i->rbuf1       = rbuf1;
2592       smat_i->rbuf2       = rbuf2;
2593       smat_i->rbuf3       = rbuf3;
2594       smat_i->sbuf2       = sbuf2;
2595       smat_i->req_source2 = req_source2;
2596 
2597       smat_i->sbuf1       = sbuf1;
2598       smat_i->ptr         = ptr;
2599       smat_i->tmp         = tmp;
2600       smat_i->ctr         = ctr;
2601 
2602       smat_i->pa           = pa;
2603       smat_i->req_size     = req_size;
2604       smat_i->req_source1  = req_source1;
2605 
2606       smat_i->allcolumns  = allcolumns[i];
2607       smat_i->singleis    = PETSC_FALSE;
2608       smat_i->row2proc    = row2proc[i];
2609       smat_i->rmap        = rmap[i];
2610       smat_i->cmap        = cmap[i];
2611     }
2612 
2613     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2614       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2615       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2616       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2617 
2618       /* create struct Mat_SubSppt and attach it to submat */
2619       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2620       submats[0]->data = (void*)smat_i;
2621 
2622       smat_i->destroy          = submats[0]->ops->destroy;
2623       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2624       submats[0]->factortype   = C->factortype;
2625 
2626       smat_i->id          = 0;
2627       smat_i->nrqs        = nrqs;
2628       smat_i->nrqr        = nrqr;
2629       smat_i->rbuf1       = rbuf1;
2630       smat_i->rbuf2       = rbuf2;
2631       smat_i->rbuf3       = rbuf3;
2632       smat_i->sbuf2       = sbuf2;
2633       smat_i->req_source2 = req_source2;
2634 
2635       smat_i->sbuf1       = sbuf1;
2636       smat_i->ptr         = ptr;
2637       smat_i->tmp         = tmp;
2638       smat_i->ctr         = ctr;
2639 
2640       smat_i->pa           = pa;
2641       smat_i->req_size     = req_size;
2642       smat_i->req_source1  = req_source1;
2643 
2644       smat_i->allcolumns  = PETSC_FALSE;
2645       smat_i->singleis    = PETSC_FALSE;
2646       smat_i->row2proc    = NULL;
2647       smat_i->rmap        = NULL;
2648       smat_i->cmap        = NULL;
2649     }
2650 
2651     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2652     ierr = PetscFree(lens);CHKERRQ(ierr);
2653     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2654     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2655 
2656   } /* endof scall == MAT_INITIAL_MATRIX */
2657 
2658   /* Post recv matrix values */
2659   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2660   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2661   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2662   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2663   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2664   for (i=0; i<nrqs; ++i) {
2665     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2666     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2667   }
2668 
2669   /* Allocate sending buffers for a->a, and send them off */
2670   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2671   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2672   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2673   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2674 
2675   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2676   {
2677     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2678     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2679     PetscInt    cend   = C->cmap->rend;
2680     PetscInt    *b_j   = b->j;
2681     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2682 
2683     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2684     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2685     for (i=0; i<nrqr; i++) {
2686       rbuf1_i   = rbuf1[i];
2687       sbuf_aa_i = sbuf_aa[i];
2688       ct1       = 2*rbuf1_i[0]+1;
2689       ct2       = 0;
2690       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2691         kmax = rbuf1_i[2*j];
2692         for (k=0; k<kmax; k++,ct1++) {
2693           row    = rbuf1_i[ct1] - rstart;
2694           nzA    = a_i[row+1] - a_i[row];
2695           nzB    = b_i[row+1] - b_i[row];
2696           ncols  = nzA + nzB;
2697           cworkB = b_j + b_i[row];
2698           vworkA = a_a + a_i[row];
2699           vworkB = b_a + b_i[row];
2700 
2701           /* load the column values for this row into vals*/
2702           vals = sbuf_aa_i+ct2;
2703 
2704           lwrite = 0;
2705           for (l=0; l<nzB; l++) {
2706             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2707           }
2708           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2709           for (l=0; l<nzB; l++) {
2710             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2711           }
2712 
2713           ct2 += ncols;
2714         }
2715       }
2716       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2717     }
2718     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2719     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2720   }
2721 
2722   /* Assemble the matrices */
2723   /* First assemble the local rows */
2724   for (i=0; i<ismax; i++) {
2725     row2proc_i = row2proc[i];
2726     subc      = (Mat_SeqAIJ*)submats[i]->data;
2727     imat_ilen = subc->ilen;
2728     imat_j    = subc->j;
2729     imat_i    = subc->i;
2730     imat_a    = subc->a;
2731 
2732     if (!allcolumns[i]) cmap_i = cmap[i];
2733     rmap_i = rmap[i];
2734     irow_i = irow[i];
2735     jmax   = nrow[i];
2736     for (j=0; j<jmax; j++) {
2737       row  = irow_i[j];
2738       proc = row2proc_i[j];
2739       if (proc == rank) {
2740         old_row = row;
2741 #if defined(PETSC_USE_CTABLE)
2742         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2743         row--;
2744 #else
2745         row = rmap_i[row];
2746 #endif
2747         ilen_row = imat_ilen[row];
2748         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2749         mat_i    = imat_i[row];
2750         mat_a    = imat_a + mat_i;
2751         mat_j    = imat_j + mat_i;
2752         if (!allcolumns[i]) {
2753           for (k=0; k<ncols; k++) {
2754 #if defined(PETSC_USE_CTABLE)
2755             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2756 #else
2757             tcol = cmap_i[cols[k]];
2758 #endif
2759             if (tcol) {
2760               *mat_j++ = tcol - 1;
2761               *mat_a++ = vals[k];
2762               ilen_row++;
2763             }
2764           }
2765         } else { /* allcolumns */
2766           for (k=0; k<ncols; k++) {
2767             *mat_j++ = cols[k];  /* global col index! */
2768             *mat_a++ = vals[k];
2769             ilen_row++;
2770           }
2771         }
2772         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2773 
2774         imat_ilen[row] = ilen_row;
2775       }
2776     }
2777   }
2778 
2779   /* Now assemble the off proc rows */
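       /* For each rank we sent requests to (ordered as in pa[]): sbuf1 recalls which submatrices and
          rows were requested, rbuf2 holds the number of columns per requested row, rbuf3 the global
          column indices, and rbuf4 the matching values just received. */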
2780   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr);
2781   for (tmp2=0; tmp2<nrqs; tmp2++) {
2782     sbuf1_i = sbuf1[pa[tmp2]];
2783     jmax    = sbuf1_i[0];
2784     ct1     = 2*jmax + 1;
2785     ct2     = 0;
2786     rbuf2_i = rbuf2[tmp2];
2787     rbuf3_i = rbuf3[tmp2];
2788     rbuf4_i = rbuf4[tmp2];
2789     for (j=1; j<=jmax; j++) {
2790       is_no     = sbuf1_i[2*j-1];
2791       rmap_i    = rmap[is_no];
2792       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2793       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2794       imat_ilen = subc->ilen;
2795       imat_j    = subc->j;
2796       imat_i    = subc->i;
2797       imat_a    = subc->a;
2798       max1      = sbuf1_i[2*j];
2799       for (k=0; k<max1; k++,ct1++) {
2800         row = sbuf1_i[ct1];
2801 #if defined(PETSC_USE_CTABLE)
2802         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2803         row--;
2804 #else
2805         row = rmap_i[row];
2806 #endif
2807         ilen  = imat_ilen[row];
2808         mat_i = imat_i[row];
2809         mat_a = imat_a + mat_i;
2810         mat_j = imat_j + mat_i;
2811         max2  = rbuf2_i[ct1];
2812         if (!allcolumns[is_no]) {
2813           for (l=0; l<max2; l++,ct2++) {
2814 #if defined(PETSC_USE_CTABLE)
2815             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2816 #else
2817             tcol = cmap_i[rbuf3_i[ct2]];
2818 #endif
2819             if (tcol) {
2820               *mat_j++ = tcol - 1;
2821               *mat_a++ = rbuf4_i[ct2];
2822               ilen++;
2823             }
2824           }
2825         } else { /* allcolumns */
2826           for (l=0; l<max2; l++,ct2++) {
2827             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2828             *mat_a++ = rbuf4_i[ct2];
2829             ilen++;
2830           }
2831         }
2832         imat_ilen[row] = ilen;
2833       }
2834     }
2835   }
2836 
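       /* If any original index set was unsorted, the column indices of a submatrix row may have been
          inserted out of order (local and remote contributions are interleaved above), so sort each
          row's columns together with its values. */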
2837   if (!iscsorted) { /* sort column indices of the rows */
2838     for (i=0; i<ismax; i++) {
2839       subc      = (Mat_SeqAIJ*)submats[i]->data;
2840       imat_j    = subc->j;
2841       imat_i    = subc->i;
2842       imat_a    = subc->a;
2843       imat_ilen = subc->ilen;
2844 
2845       if (allcolumns[i]) continue;
2846       jmax = nrow[i];
2847       for (j=0; j<jmax; j++) {
2848         mat_i = imat_i[j];
2849         mat_a = imat_a + mat_i;
2850         mat_j = imat_j + mat_i;
2851         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2852       }
2853     }
2854   }
2855 
2856   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2857   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2858   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);}
2859   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2860   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2861 
2862   /* Restore the indices */
2863   for (i=0; i<ismax; i++) {
2864     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2865     if (!allcolumns[i]) {
2866       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2867     }
2868   }
2869 
2870   for (i=0; i<ismax; i++) {
2871     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2872     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2873   }
2874 
2875   /* Destroy allocated memory */
2876   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2877   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2878   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2879 
2880   for (i=0; i<nrqs; ++i) {
2881     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2882   }
2883   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2884 
2885   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2886   PetscFunctionReturn(0);
2887 }
2888 
2889 /*
2890  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2891  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2892  of [0,m), the range of dcolemb lies in [0,n), and the range of ocolemb lies in [0,N-n).
2893  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2894  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2895  state, and needs to be "assembled" later by compressing B's column space.
2896 
2897  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2898  Following this call, C->A & C->B have been created, even if empty.
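
      A minimal usage sketch (mirroring MatCreateSubMatricesMPI_MPIXAIJ() below; comm, m, n, N and the
      sequential matrices Aseq (m x n) and Bseq (m x (N-n)) are hypothetical placeholders, not names
      used in this file):

        MatCreate(comm,&C);
        MatSetSizes(C,m,n,PETSC_DETERMINE,PETSC_DETERMINE);
        MatSetType(C,MATMPIAIJ);
        PetscLayoutSetUp(C->rmap);
        PetscLayoutSetUp(C->cmap);
        MatSetSeqMats_MPIAIJ(C,NULL,NULL,NULL,DIFFERENT_NONZERO_PATTERN,Aseq,Bseq);
        MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
        MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);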
2899  */
2900 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2901 {
2902   /* If making this function public, change the error returned in this function away from _PLIB. */
2903   PetscErrorCode ierr;
2904   Mat_MPIAIJ     *aij;
2905   Mat_SeqAIJ     *Baij;
2906   PetscBool      seqaij,Bdisassembled;
2907   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2908   PetscScalar    v;
2909   const PetscInt *rowindices,*colindices;
2910 
2911   PetscFunctionBegin;
2912   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2913   if (A) {
2914     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2915     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2916     if (rowemb) {
2917       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2918       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2919     } else {
2920       if (C->rmap->n != A->rmap->n) {
2921         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2922       }
2923     }
2924     if (dcolemb) {
2925       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2926       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2927     } else {
2928       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2929     }
2930   }
2931   if (B) {
2932     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2933     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2934     if (rowemb) {
2935       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2936       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2937     } else {
2938       if (C->rmap->n != B->rmap->n) {
2939         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2940       }
2941     }
2942     if (ocolemb) {
2943       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2944       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2945     } else {
2946       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2947     }
2948   }
2949 
2950   aij = (Mat_MPIAIJ*)C->data;
2951   if (!aij->A) {
2952     /* Mimic parts of MatMPIAIJSetPreallocation() */
2953     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2954     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2955     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2956     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2957     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2958   }
2959   if (A) {
2960     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2961   } else {
2962     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2963   }
2964   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2965     /*
2966       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate aij->B and
2967       need to "disassemble" it -- convert it to use C's global indices.
2968       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2969 
2970       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2971       we MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2972 
2973       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2974       At least avoid calling MatSetValues() and the implied searches?
2975     */
2976 
2977     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2978 #if defined(PETSC_USE_CTABLE)
2979       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2980 #else
2981       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2982       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2983       if (aij->B) {
2984         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2985       }
2986 #endif
2987       ngcol = 0;
2988       if (aij->lvec) {
2989         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2990       }
2991       if (aij->garray) {
2992         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2993         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2994       }
2995       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2996       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2997     }
2998     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2999       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3000     }
3001     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3002       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3003     }
3004   }
3005   Bdisassembled = PETSC_FALSE;
3006   if (!aij->B) {
3007     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3008     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3009     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3010     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3011     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3012     Bdisassembled = PETSC_TRUE;
3013   }
3014   if (B) {
3015     Baij = (Mat_SeqAIJ*)B->data;
3016     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3017       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3018       for (i=0; i<B->rmap->n; i++) {
3019         nz[i] = Baij->i[i+1] - Baij->i[i];
3020       }
3021       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3022       ierr = PetscFree(nz);CHKERRQ(ierr);
3023     }
3024 
3025     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3026     shift = rend-rstart;
3027     count = 0;
3028     rowindices = NULL;
3029     colindices = NULL;
3030     if (rowemb) {
3031       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3032     }
3033     if (ocolemb) {
3034       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3035     }
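         /* Copy B's entries one at a time: rowindices/colindices (if given) map B's local row/column
            indices into C's local numbering; when aij->B was created fresh above (Bdisassembled),
            column indices at or above rstart are shifted by rend-rstart so that aij->B ends up holding
            C's global column indices, i.e. the "disassembled" form described above. */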
3036     for (i=0; i<B->rmap->n; i++) {
3037       PetscInt row;
3038       row = i;
3039       if (rowindices) row = rowindices[i];
3040       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3041         col  = Baij->j[count];
3042         if (colindices) col = colindices[col];
3043         if (Bdisassembled && col>=rstart) col += shift;
3044         v    = Baij->a[count];
3045         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3046         ++count;
3047       }
3048     }
3049     /* No assembly for aij->B is necessary. */
3050     /* FIXME: set aij->B's nonzerostate correctly. */
3051   } else {
3052     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3053   }
3054   C->preallocated  = PETSC_TRUE;
3055   C->was_assembled = PETSC_FALSE;
3056   C->assembled     = PETSC_FALSE;
3057    /*
3058       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3059       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3060    */
3061   PetscFunctionReturn(0);
3062 }
3063 
3064 /*
3065   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray (e.g., the global column of B's local column j is garray[j]).
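       A borrowed-reference usage sketch (Ad and Ao are not increfed, so they must not be destroyed):
         Mat Ad,Ao;
         MatGetSeqMats_MPIAIJ(C,&Ad,&Ao);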
3066  */
3067 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3068 {
3069   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3070 
3071   PetscFunctionBegin;
3072   PetscValidPointer(A,2);
3073   PetscValidPointer(B,3);
3074   /* FIXME: make sure C is assembled */
3075   *A = aij->A;
3076   *B = aij->B;
3077   /* Note that we don't incref *A and *B, so be careful! */
3078   PetscFunctionReturn(0);
3079 }
3080 
3081 /*
3082   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3083   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3084 */
3085 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3086                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3087                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3088                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3089                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3090 {
3091   PetscErrorCode ierr;
3092   PetscMPIInt    size,flag;
3093   PetscInt       i,ii,cismax,ispar;
3094   Mat            *A,*B;
3095   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3096 
3097   PetscFunctionBegin;
3098   if (!ismax) PetscFunctionReturn(0);
3099 
3100   for (i = 0, cismax = 0; i < ismax; ++i) {
3101     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3102     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3103     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3104     if (size > 1) ++cismax;
3105   }
3106 
3107   /*
3108      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3109      ispar is the maximum of cismax over C's ranks; it is nonzero iff some rank holds a parallel IS.
3110   */
3111   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
3112   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3113     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3114     PetscFunctionReturn(0);
3115   }
3116 
3117   /* if (ispar) */
3118   /*
3119     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3120     These are used to extract the off-diag portion of the resulting parallel matrix.
3121     The row IS for the off-diag portion is the same as for the diag portion,
3122     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3123   */
3124   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3125   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3126   for (i = 0, ii = 0; i < ismax; ++i) {
3127     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3128     if (size > 1) {
3129       /*
3130          TODO: This is the part that's ***NOT SCALABLE***.
3131          To fix this we need to extract just the indices of C's nonzero columns
3132          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3133          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3134          to be done without serializing on the IS list, so, most likely, it is best
3135          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3136       */
3137       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3138       /* Now we have to
3139          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3140              were sorted on each rank, concatenated they might no longer be sorted;
3141          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3142              indices in the nondecreasing order to the original index positions.
3143          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3144       */
3145       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3146       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3147       ++ii;
3148     }
3149   }
3150   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3151   for (i = 0, ii = 0; i < ismax; ++i) {
3152     PetscInt       j,issize;
3153     const PetscInt *indices;
3154 
3155     /*
3156        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3157      */
3158     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3159     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3160     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3161     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3162     for (j = 1; j < issize; ++j) {
3163       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3164     }
3165     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3166     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3167     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3168     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3169     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3170     for (j = 1; j < issize; ++j) {
3171       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3172     }
3173     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3174     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3175     if (size > 1) {
3176       cisrow[ii] = isrow[i];
3177       ++ii;
3178     }
3179   }
3180   /*
3181     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3182     array of sequential matrices underlying the resulting parallel matrices.
3183     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3184     contain duplicates.
3185 
3186     There are as many diag matrices as there are original index sets. There are only as many parallel
3187     and off-diag matrices as there are parallel (comm size > 1) index sets.
3188 
3189     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3190     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3191       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3192       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3193       with A[i] and B[ii] extracted from the corresponding MPI submat.
3194     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3195       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3196       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3197       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3198       (or retrieve the composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
3199       their values into the A[i] and B[ii] sitting inside the corresponding submat.
3200       A[i], B[ii], AA[i] or BB[ii] matrices.
3201   */
3202   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3203   if (scall == MAT_INITIAL_MATRIX) {
3204     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3205   }
3206 
3207   /* Now obtain the sequential A and B submatrices separately. */
3208   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3209   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3210   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3211 
3212   /*
3213     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3214     matrices A & B have been extracted directly into the parallel matrices containing them, or
3215     simply into the sequential matrix identical with the corresponding A (if size == 1).
3216     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3217     to have the same sparsity pattern.
3218     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3219     must be constructed for C. This is done by setseqmat(s).
3220   */
3221   for (i = 0, ii = 0; i < ismax; ++i) {
3222     /*
3223        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3224        That way we can avoid sorting and computing permutations when reusing.
3225        To this end:
3226         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3227         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3228           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3229     */
3230     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3231 
3232     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3233     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3234     if (size > 1) {
3235       if (scall == MAT_INITIAL_MATRIX) {
3236         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3237         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3238         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3239         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3240         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3241       }
3242       /*
3243         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3244       */
3245       {
3246         Mat AA = A[i],BB = B[ii];
3247 
3248         if (AA || BB) {
3249           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3250           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3251           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3252         }
3253         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3254       }
3255       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3256       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3257       ++ii;
3258     } else { /* if (size == 1) */
3259       if (scall == MAT_REUSE_MATRIX) {
3260         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3261       }
3262       if (isrow_p[i] || iscol_p[i]) {
3263         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3264         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3265         /* Otherwise A is extracted straight into (*submat)[i]. */
3266         /* TODO: Compose A[i] on (*submat)[i] for future use, if (isrow_p[i] || iscol_p[i]) && scall == MAT_INITIAL_MATRIX. */
3267         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3268       } else (*submat)[i] = A[i];
3269     }
3270     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3271     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3272   }
3273   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3274   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3275   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3276   ierr = PetscFree(A);CHKERRQ(ierr);
3277   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3278   PetscFunctionReturn(0);
3279 }
3280 
3281 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3282 {
3283   PetscErrorCode ierr;
3284 
3285   PetscFunctionBegin;
3286   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3287   PetscFunctionReturn(0);
3288 }
3289