xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 16b643558ddb2850265bc9ef423683e46e446b85)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
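/*
   A minimal caller-side sketch (hedged: 'A' and 'row' are hypothetical and
   not part of this file). MatIncreaseOverlap() is the public entry point
   that dispatches to the routine above for MATMPIAIJ, one round of
   MatIncreaseOverlap_MPIAIJ_Once() per requested level of overlap:

     IS is[1];
     ierr = ISCreateGeneral(PETSC_COMM_SELF,1,&row,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);   (two rounds of ..._Once)
     ierr = ISDestroy(&is[0]);CHKERRQ(ierr);              (is[0] was enlarged in place)
*/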
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve the IS data and put it all together so that we
69    * can optimize communication
70    */
71   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++){
82     length_i     = length[i];
83     indices_i    = indices[i];
84     for (j=0; j<length_i; j++){
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank){
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
98   /* test whether we need to exchange messages;
99    * generally speaking, we do not need to exchange
100    * data when computing the first level of overlap
101    */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* no messages need to be exchanged;
104    * this usually corresponds to the first level of overlap
105    */
106   if (!reducednrrows){
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communication */
114   for (i=0; i<size; i++){
115    if (tosizes_temp[i]){
116      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117      tosizes_temp[i] = nto; /* a map from processor to index */
118      toranks[nto++]  = i; /* processor */
119    }
120   }
121   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   for (i=0; i<nto; i++){
123     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125   }
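  /* a worked example with hypothetical values: if this rank must send 2 rows
     to rank 1 and 3 rows to rank 4, the loops above produce
       toranks  = {1, 4}, nto = 2, tosizes_temp[1] = 0, tosizes_temp[4] = 1,
       tosizes  = {4, 0,  6, 4}   (pairs <2*nrows, send offset>)
       toffsets = {0, 4, 10}                                                  */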
126   /* send information to other processors */
127   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
128   nrecvrows = 0;
129   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
131   nrecvrows = 0;
132   for (i=0; i<nfrom; i++){
133     for (j=0; j<fromsizes[2*i]; j++){
134       remote[nrecvrows].rank    = fromranks[i];
135       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136     }
137   }
138   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
139   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
140   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
141   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
142   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
143   /* each message entry is a pair <is id, row> */
144   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
145   for (i=0; i<nrrows; i++){
146     owner = rrow_ranks[i]; /* processor */
147     j     = tosizes_temp[owner]; /* index */
148     todata[toffsets[j]++] = rrow_isids[i];
149     todata[toffsets[j]++] = remoterows[i];
150   }
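  /* continuing the example above: remote rows {7, 9} requested for is[0] and
     owned by rank 1 are packed as todata = {0,7, 0,9, ...}, i.e. as the
     <is id, row> pairs described above (hypothetical values) */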
151   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
152   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
153   ierr = PetscFree(toffsets);CHKERRQ(ierr);
154   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
155   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
157   /* send rows belonging to remote processors so that we can get the overlapping data back */
158   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
159   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
160   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
161   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
162   ierr = PetscFree(fromranks);CHKERRQ(ierr);
163   nrecvrows = 0;
164   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
166   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
167   nrecvrows = 0;
168   for (i=0; i<nto; i++){
169     for (j=0; j<tosizes[2*i]; j++){
170       remote[nrecvrows].rank    = toranks[i];
171       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172     }
173   }
174   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
175   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
176   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
177   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
178   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
179   /* overlap communication and computation */
180   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
181   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
182   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
183   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
184   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
185   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
186   ierr = PetscFree(toranks);CHKERRQ(ierr);
187   ierr = PetscFree(tosizes);CHKERRQ(ierr);
188   ierr = PetscFree(todata);CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++){
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++){
209     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
210     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
211     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
213   }
214   /* walk the received stream to recover each is id and its overlap columns */
215   for (i=0; i<nrecvs; ){
216     is_id      = recvdata[i++];
217     data_size  = recvdata[i++];
218     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
219     isz_i      = isz[is_id];
220     for (j=0; j< data_size; j++){
221       col = recvdata[i++];
222       indices_i[isz_i++] = col;
223     }
224     isz[is_id] = isz_i;
225   }
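  /* the loop above decodes streams such as (hypothetical values)
       recvdata = {1,3, 10,11,12, 0,2, 4,5}
     which appends columns {10, 11, 12} to is[1] and columns {4, 5} to is[0] */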
226   /* remove duplicate entries */
227   for (i=0; i<nidx; i++){
228     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
229     isz_i      = isz[i];
230     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
231     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
232   }
233   ierr = PetscFree(isz);CHKERRQ(ierr);
234   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
239 {
240   PetscLayout       rmap,cmap;
241   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
242   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
243   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
244   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
245   Mat               amat,bmat;
246   PetscMPIInt       rank;
247   PetscBool         done;
248   MPI_Comm          comm;
249   PetscErrorCode    ierr;
250 
251   PetscFunctionBegin;
252   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
253   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
254   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
255   /* even if the matrix is symmetric, we still treat it as nonsymmetric here */
256   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
257   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
258   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260   /* total number of nonzero values is used to estimate the memory usage in the next step */
261   tnz  = ai[an]+bi[bn];
262   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
263   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
264   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
265   /* find the longest message */
266   max_fszs = 0;
267   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
268   /* is there a better way to estimate the number of nonzeros in the matrix? */
269   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
270   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
271   rows_pos  = 0;
272   totalrows = 0;
273   for (i=0; i<nfrom; i++){
274     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
275     /* group data together */
276     for (j=0; j<fromsizes[2*i]; j+=2){
277       is_id                       = fromrows[rows_pos++];/* is id */
278       rows_i                      = rows_data[is_id];
279       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
280     }
281     /* estimate the space needed so as to avoid repeated allocations */
282     for (j=0; j<nidx; j++){
283       indvc_ij = 0;
284       rows_i   = rows_data[j];
285       for (l=0; l<rows_pos_i[j]; l++){
286         row    = rows_i[l]-rstart;
287         start  = ai[row];
288         end    = ai[row+1];
289         for (k=start; k<end; k++){ /* Amat */
290           col = aj[k] + cstart;
291           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
292         }
293         start = bi[row];
294         end   = bi[row+1];
295         for (k=start; k<end; k++) { /* Bmat */
296           col = gcols[bj[k]];
297           indices_tmp[indvc_ij++] = col;
298         }
299       }
300       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
301       indv_counts[i*nidx+j] = indvc_ij;
302       totalrows            += indvc_ij;
303     }
304   }
305   /* message triple <is id, number of indices, indices> */
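  /* e.g., with hypothetical values: a reply of 4 overlap columns {2,3,8,9}
     for is[0] is packed as sbdata = {0,4, 2,3,8,9}, and sbsizes[2*i]
     accumulates the header length (2) plus the data length (4) */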
306   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
307   totalrows = 0;
308   rows_pos  = 0;
309   /* reuse the grouping pass from above to fill in the actual data */
310   for (i=0;i<nfrom;i++){
311     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
312     for (j=0; j<fromsizes[2*i]; j+=2){
313       is_id                       = fromrows[rows_pos++];
314       rows_i                      = rows_data[is_id];
315       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
316     }
317     /* add data  */
318     for (j=0; j<nidx; j++){
319       if (!indv_counts[i*nidx+j]) continue;
320       indvc_ij = 0;
321       sbdata[totalrows++] = j;
322       sbdata[totalrows++] = indv_counts[i*nidx+j];
323       sbsizes[2*i]       += 2;
324       rows_i              = rows_data[j];
325       for (l=0; l<rows_pos_i[j]; l++){
326         row   = rows_i[l]-rstart;
327         start = ai[row];
328         end   = ai[row+1];
329         for (k=start; k<end; k++){ /* Amat */
330           col = aj[k] + cstart;
331           indices_tmp[indvc_ij++] = col;
332         }
333         start = bi[row];
334         end   = bi[row+1];
335         for (k=start; k<end; k++) { /* Bmat */
336           col = gcols[bj[k]];
337           indices_tmp[indvc_ij++] = col;
338         }
339       }
340       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
341       sbsizes[2*i]  += indvc_ij;
342       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
343       totalrows += indvc_ij;
344     }
345   }
346   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
347   for (i=0; i<nfrom; i++){
348     offsets[i+1]   = offsets[i] + sbsizes[2*i];
349     sbsizes[2*i+1] = offsets[i];
350   }
351   ierr = PetscFree(offsets);CHKERRQ(ierr);
352   if (sbrowsizes) *sbrowsizes = sbsizes;
353   if (sbrows) *sbrows = sbdata;
354   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
355   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
356   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
357   PetscFunctionReturn(0);
358 }
359 
360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
361 {
362   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
363   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
364   PetscInt          lsize,lsize_tmp,owner;
365   PetscMPIInt       rank;
366   Mat               amat,bmat;
367   PetscBool         done;
368   PetscLayout       cmap,rmap;
369   MPI_Comm          comm;
370   PetscErrorCode    ierr;
371 
372   PetscFunctionBegin;
373   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
374   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
375   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
376   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
377   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
378   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
379   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
380   /* is this a safe way to compute the number of nonzero values? */
381   tnz  = ai[an]+bi[bn];
382   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
383   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
384   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
385   /* this estimates memory better than the old implementation,
386    * which used the global size of the matrix
387    */
388   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
389   for (i=0; i<nidx; i++) {
390     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
391     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
392     lsize_tmp = 0;
393     for (j=0; j<lsize; j++) {
394       owner = -1;
395       row   = indices[j];
396       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
397       if (owner != rank) continue;
398       /* local number */
399       row  -= rstart;
400       start = ai[row];
401       end   = ai[row+1];
402       for (k=start; k<end; k++) { /* Amat */
403         col = aj[k] + cstart;
404         indices_temp[lsize_tmp++] = col;
405       }
406       start = bi[row];
407       end   = bi[row+1];
408       for (k=start; k<end; k++) { /* Bmat */
409         col = gcols[bj[k]];
410         indices_temp[lsize_tmp++] = col;
411       }
412     }
413    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
414    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
415    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
416    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
417   }
418   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
419   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
420   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
421   PetscFunctionReturn(0);
422 }
423 
424 
425 /*
426   Sample message format:
427   If processor A wants processor B to process some elements corresponding
428   to index sets is[1] and is[5]:
429   mesg[0] = 2              (no of index sets in the mesg)
430   -----------
431   mesg[1] = 1              => is[1]
432   mesg[2] = size of is[1]  (number of indices)
433   -----------
434   mesg[3] = 5              => is[5]
435   mesg[4] = size of is[5]  (number of indices)
436   -----------
437   mesg[5]
438   ...mesg[n]               indices for is[1]
439   -----------
440   mesg[n+1]
441   ...mesg[m]               indices for is[5]
442   -----------
443
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
446   nrqr - no of requests received (which have to be, or which have been, processed)
447 */
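/*
  A concrete (hypothetical) instance of the format above: if A asks B to
  process rows {3,9} for is[1] and rows {0,4,7} for is[5], then

    mesg = { 2,  1,2,  5,3,  3,9,  0,4,7 }
            hdr  is[1] is[5] data1 data5
*/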
448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
449 {
450   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
451   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
452   const PetscInt **idx,*idx_i;
453   PetscInt       *n,**data,len;
454 #if defined(PETSC_USE_CTABLE)
455   PetscTable     *table_data,table_data_i;
456   PetscInt       *tdata,tcount,tcount_max;
457 #else
458   PetscInt       *data_i,*d_p;
459 #endif
460   PetscErrorCode ierr;
461   PetscMPIInt    size,rank,tag1,tag2;
462   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
463   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
464   PetscBT        *table;
465   MPI_Comm       comm;
466   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
467   MPI_Status     *s_status,*recv_status;
468   char           *t_p;
469 
470   PetscFunctionBegin;
471   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
472   size = c->size;
473   rank = c->rank;
474   M    = C->rmap->N;
475 
476   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
477   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
478 
479   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
480 
481   for (i=0; i<imax; i++) {
482     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
483     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
484   }
485 
486   /* evaluate communication - message to whom, length of message, and buffer space
487      required. Based on this, buffers are allocated, and data copied into them */
488   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
489   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
490   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
491   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
492   for (i=0; i<imax; i++) {
493     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
494     idx_i = idx[i];
495     len   = n[i];
496     for (j=0; j<len; j++) {
497       row = idx_i[j];
498       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
499       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
500       w4[proc]++;
501     }
502     for (j=0; j<size; j++) {
503       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
504     }
505   }
506 
507   nrqs     = 0;              /* no of outgoing messages */
508   msz      = 0;              /* total mesg length (for all procs) */
509   w1[rank] = 0;              /* no mesg sent to itself */
510   w3[rank] = 0;
511   for (i=0; i<size; i++) {
512     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
513   }
514   /* pa - is list of processors to communicate with */
515   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
516   for (i=0,j=0; i<size; i++) {
517     if (w1[i]) {pa[j] = i; j++;}
518   }
519 
520   /* Each message would have a header = 1 + 2*(no of IS) + data */
521   for (i=0; i<nrqs; i++) {
522     j      = pa[i];
523     w1[j] += w2[j] + 2*w3[j];
524     msz   += w1[j];
525   }
526 
527   /* Determine the number of messages to expect, their lengths, and their source ranks */
528   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
529   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
530 
531   /* Now post the Irecvs corresponding to these messages */
532   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
533 
534   /* Allocate Memory for outgoing messages */
535   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
536   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
537   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
538 
539   {
540     PetscInt *iptr = tmp,ict  = 0;
541     for (i=0; i<nrqs; i++) {
542       j         = pa[i];
543       iptr     +=  ict;
544       outdat[j] = iptr;
545       ict       = w1[j];
546     }
547   }
548 
549   /* Form the outgoing messages */
550   /* plug in the headers */
551   for (i=0; i<nrqs; i++) {
552     j            = pa[i];
553     outdat[j][0] = 0;
554     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
555     ptr[j]       = outdat[j] + 2*w3[j] + 1;
556   }
557 
558   /* Memory for doing local proc's work */
559   {
560     PetscInt M_BPB_imax = 0;
561 #if defined(PETSC_USE_CTABLE)
562     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
563     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
564     for (i=0; i<imax; i++) {
565       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
566     }
567     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
570     }
571 #else
572     PetscInt Mimax = 0;
573     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
574     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
575     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578       data[i]  = d_p + M*i;
579     }
580 #endif
581   }
582 
583   /* Parse the IS and update local tables and the outgoing buf with the data */
584   {
585     PetscInt n_i,isz_i,*outdat_j,ctr_j;
586     PetscBT  table_i;
587 
588     for (i=0; i<imax; i++) {
589       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
590       n_i     = n[i];
591       table_i = table[i];
592       idx_i   = idx[i];
593 #if defined(PETSC_USE_CTABLE)
594       table_data_i = table_data[i];
595 #else
596       data_i  = data[i];
597 #endif
598       isz_i   = isz[i];
599       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
600         row  = idx_i[j];
601         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
602         if (proc != rank) { /* copy to the outgoing buffer */
603           ctr[proc]++;
604           *ptr[proc] = row;
605           ptr[proc]++;
606         } else if (!PetscBTLookupSet(table_i,row)) {
607 #if defined(PETSC_USE_CTABLE)
608           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
609 #else
610           data_i[isz_i] = row; /* Update the local table */
611 #endif
612           isz_i++;
613         }
614       }
615       /* Update the headers for the current IS */
616       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
617         if ((ctr_j = ctr[j])) {
618           outdat_j        = outdat[j];
619           k               = ++outdat_j[0];
620           outdat_j[2*k]   = ctr_j;
621           outdat_j[2*k-1] = i;
622         }
623       }
624       isz[i] = isz_i;
625     }
626   }
627 
628   /*  Now  post the sends */
629   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
630   for (i=0; i<nrqs; ++i) {
631     j    = pa[i];
632     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
633   }
634 
635   /* No longer need the original indices */
636   for (i=0; i<imax; ++i) {
637     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
638   }
639   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
640 
641   for (i=0; i<imax; ++i) {
642     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
643   }
644 
645   /* Do Local work */
646 #if defined(PETSC_USE_CTABLE)
647   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
648 #else
649   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
650 #endif
651 
652   /* Receive messages */
653   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
654   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
655 
656   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
657   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
658 
659   /* Phase 1 sends are complete - deallocate buffers */
660   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
661   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
662 
663   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
664   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
665   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
666   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
667   ierr = PetscFree(rbuf);CHKERRQ(ierr);
668 
669 
670   /* Send the data back */
671   /* Do a global reduction to know the buffer space req for incoming messages */
672   {
673     PetscMPIInt *rw1;
674 
675     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
676 
677     for (i=0; i<nrqr; ++i) {
678       proc = recv_status[i].MPI_SOURCE;
679 
680       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
681       rw1[proc] = isz1[i];
682     }
683     ierr = PetscFree(onodes1);CHKERRQ(ierr);
684     ierr = PetscFree(olengths1);CHKERRQ(ierr);
685 
686     /* Determine the number of messages to expect, their lengths, and their source ranks */
687     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
688     ierr = PetscFree(rw1);CHKERRQ(ierr);
689   }
690   /* Now post the Irecvs corresponding to these messages */
691   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
692 
693   /* Now  post the sends */
694   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
695   for (i=0; i<nrqr; ++i) {
696     j    = recv_status[i].MPI_SOURCE;
697     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
698   }
699 
700   /* receive work done on other processors */
701   {
702     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
703     PetscMPIInt idex;
704     PetscBT     table_i;
705     MPI_Status  *status2;
706 
707     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
708     for (i=0; i<nrqs; ++i) {
709       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
710       /* Process the message */
711       rbuf2_i = rbuf2[idex];
712       ct1     = 2*rbuf2_i[0]+1;
713       jmax    = rbuf2[idex][0];
714       for (j=1; j<=jmax; j++) {
715         max     = rbuf2_i[2*j];
716         is_no   = rbuf2_i[2*j-1];
717         isz_i   = isz[is_no];
718         table_i = table[is_no];
719 #if defined(PETSC_USE_CTABLE)
720         table_data_i = table_data[is_no];
721 #else
722         data_i  = data[is_no];
723 #endif
724         for (k=0; k<max; k++,ct1++) {
725           row = rbuf2_i[ct1];
726           if (!PetscBTLookupSet(table_i,row)) {
727 #if defined(PETSC_USE_CTABLE)
728             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
729 #else
730             data_i[isz_i] = row;
731 #endif
732             isz_i++;
733           }
734         }
735         isz[is_no] = isz_i;
736       }
737     }
738 
739     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
740     ierr = PetscFree(status2);CHKERRQ(ierr);
741   }
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767   }
768 
769   ierr = PetscFree(onodes2);CHKERRQ(ierr);
770   ierr = PetscFree(olengths2);CHKERRQ(ierr);
771 
772   ierr = PetscFree(pa);CHKERRQ(ierr);
773   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
774   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
775   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
776   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
779   ierr = PetscFree(s_status);CHKERRQ(ierr);
780   ierr = PetscFree(recv_status);CHKERRQ(ierr);
781   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
782   ierr = PetscFree(xdata);CHKERRQ(ierr);
783   ierr = PetscFree(isz1);CHKERRQ(ierr);
784 #if defined(PETSC_USE_CTABLE)
785   for (i=0; i<imax; i++) {
786     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
787   }
788   ierr = PetscFree(table_data);CHKERRQ(ierr);
789   ierr = PetscFree(tdata);CHKERRQ(ierr);
790   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
791 #else
792   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
793 #endif
794   PetscFunctionReturn(0);
795 }
796 
797 /*
798    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once, to do
799        the work on the local processor.
800
801      Inputs:
802       C      - the matrix (of type MATMPIAIJ);
803       imax   - total no of index sets processed at a time;
804       table  - an array of PetscBT bitmaps, one per index set, each M (global rows) bits wide.
805
806      Output:
807       isz    - array containing the count of the solution elements corresponding
808                to each index set;
809       data or table_data - pointers to the solutions
810 */
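/*
   Semantics sketch with hypothetical values: if is[0] currently holds row
   {5} and local row 5 has nonzeros in global columns {2,5,9}, then after
   this routine isz[0] = 3 and data[0] (or table_data[0]) describes {2,5,9},
   i.e. the one-level overlap of is[0] restricted to locally owned rows.
*/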
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %D != isz[%D] %D",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %D >= tcount %D",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
900 /*
901       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
902          and return the output
903
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
907            rbuf - an array of pointers to the received requests
908
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912
913   For better efficiency perhaps we should malloc each xdata[i] separately;
914 then if a realloc is required we would only need to copy the data for that one
915 message rather than for all previous ones, as now, where a single large chunk of
916 memory is used.
917
918 */
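/*
   Reply layout sketch with hypothetical values: for an incoming request
   rbuf[i] = {1, 0,2, 3,9} (one IS, namely is[0], asking for rows {3,9}),
   where local row 3 has global columns {2,3} and row 9 has {8,9}, the reply
   keeps the header shape but replaces each count by the number of entries
   found:
     xdata[i] = {1, 0,4, 3,9,2,8},  isz1[i] = 7
*/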
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (ct3 >= mem_estimate) { /* out of space: grow the reply buffer xdata */
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (ct3 >= mem_estimate) { /* out of space: grow the reply buffer xdata */
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (ct3 >= mem_estimate) { /* out of space: grow the reply buffer xdata */
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* update the header */
1024       xdata[i][2*j]   = ct2 - oct2; /* number of entries found for this IS */
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D integers, required %D, number of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
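/*
   Caller-side sketch (hypothetical): gather the distributed matrix A onto
   every rank as a single sequential matrix, then refresh only the numerical
   values on a later call:

     Mat *Bseq;
     ierr = MatGetSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);CHKERRQ(ierr);
     (use Bseq[0] ...)
     ierr = MatGetSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_REUSE_MATRIX,&Bseq);CHKERRQ(ierr);
*/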
1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1042 {
1043   Mat            B;
1044   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1045   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046   PetscErrorCode ierr;
1047   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1048   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1049   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1050   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1051 
1052   PetscFunctionBegin;
1053   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1054   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1055 
1056   if (scall == MAT_INITIAL_MATRIX) {
1057     /* ----------------------------------------------------------------
1058          Tell every processor the number of nonzeros per row
1059     */
1060     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1061     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1062       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1063     }
1064     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1065     for (i=0; i<size; i++) {
1066       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1067       displs[i]     = A->rmap->range[i];
1068     }
1069 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1070     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1071 #else
1072     sendcount = A->rmap->rend - A->rmap->rstart;
1073     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1074 #endif
1075     /* ---------------------------------------------------------------
1076          Create the sequential matrix of the same type as the local block diagonal
1077     */
1078     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1079     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1080     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1081     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1082     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1083     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1084     **Bin = B;
1085     b     = (Mat_SeqAIJ*)B->data;
1086 
1087     /*--------------------------------------------------------------------
1088        Copy my part of matrix column indices over
1089     */
1090     sendcount  = ad->nz + bd->nz;
1091     jsendbuf   = b->j + b->i[rstarts[rank]];
1092     a_jsendbuf = ad->j;
1093     b_jsendbuf = bd->j;
1094     n          = A->rmap->rend - A->rmap->rstart;
1095     cnt        = 0;
1096     for (i=0; i<n; i++) {
1097 
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
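    /* a worked example of the three passes above (hypothetical values): with
       A->rmap->rstart = 3, local row i = 0 (global row 3), diagonal-block
       columns {4,5} and off-diagonal global columns {1,8}, the passes emit
       {1} (lower), then {4,5} (diagonal), then {8} (upper), i.e. the globally
       sorted row {1,4,5,8} */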
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into usable form (note that the numerical values are not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     sendcount = ad->nz + bd->nz;
1161     sendbuf   = b->a + b->i[rstarts[rank]];
1162     a_sendbuf = ad->a;
1163     b_sendbuf = bd->a;
1164     b_sendj   = bd->j;
1165     n         = A->rmap->rend - A->rmap->rstart;
1166     cnt       = 0;
1167     for (i=0; i<n; i++) {
1168 
1169       /* put in lower diagonal portion */
1170       m = bd->i[i+1] - bd->i[i];
1171       while (m > 0) {
1172         /* is it above diagonal (in bd (compressed) numbering) */
1173         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1174         sendbuf[cnt++] = *b_sendbuf++;
1175         m--;
1176         b_sendj++;
1177       }
1178 
1179       /* put in diagonal portion */
1180       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1181         sendbuf[cnt++] = *a_sendbuf++;
1182       }
1183 
1184       /* put in upper diagonal portion */
1185       while (m-- > 0) {
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         b_sendj++;
1188       }
1189     }
1190     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191 
1192     /* -----------------------------------------------------------------
1193        Gather all numerical values to all processors
1194     */
1195     if (!recvcounts) {
1196       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1197     }
1198     for (i=0; i<size; i++) {
1199       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1200     }
1201     displs[0] = 0;
1202     for (i=1; i<size; i++) {
1203       displs[i] = displs[i-1] + recvcounts[i-1];
1204     }
1205     recvbuf = b->a;
1206 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1207     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1208 #else
1209     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1210 #endif
1211   }  /* end of (flag == MAT_GET_VALUES) */
1212   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1213 
1214   if (A->symmetric) {
1215     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1216   } else if (A->hermitian) {
1217     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1218   } else if (A->structurally_symmetric) {
1219     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1220   }
1221   PetscFunctionReturn(0);
1222 }
1223 
1224 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1225 {
1226   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1227   Mat            submat,A = c->A,B = c->B;
1228   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1229   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1230   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1231   const PetscInt *icol,*irow;
1232   PetscInt       nrow,ncol,start;
1233   PetscErrorCode ierr;
1234   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1235   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1236   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1237   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1238   PetscInt       *lens,rmax,ncols,*cols,Crow;
1239 #if defined(PETSC_USE_CTABLE)
1240   PetscTable     cmap,rmap;
1241   PetscInt       *cmap_loc,*rmap_loc;
1242 #else
1243   PetscInt       *cmap,*rmap;
1244 #endif
1245   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1246   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1247   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1248   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1249   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1250   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1251   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1252   MPI_Comm       comm;
1253   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1254   PetscMPIInt    *onodes1,*olengths1,idex,end;
1255   Mat_SubMat     *smatis1;
1256   PetscBool      isrowsorted;
1257 
1258   PetscFunctionBegin;
1259   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1260 
1261   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1262   size = c->size;
1263   rank = c->rank;
1264 
1265   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1266   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1267 
1268   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1269   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1270   if (allcolumns) {
1271     icol = NULL;
1272     ncol = C->cmap->N;
1273   } else {
1274     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1275     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1276   }
1277 
1278   if (scall == MAT_INITIAL_MATRIX) {
1279     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1280 
1281     /* Get some new tags to keep the communication clean */
1282     tag1 = ((PetscObject)C)->tag;
1283     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1284     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1285 
1286     /* evaluate communication - mesg to who, length of mesg, and buffer space
1287      required. Based on this, buffers are allocated, and data copied into them */
1288     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1289     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1290 
1291     /* w1[proc] = num of rows owned by proc -- to be requested */
1292     proc = 0;
1293     nrqs = 0; /* num of outgoing messages */
1294     for (j=0; j<nrow; j++) {
1295       row  = irow[j]; /* sorted! */
1296       while (row >= C->rmap->range[proc+1]) proc++;
1297       w1[proc]++;
1298       row2proc[j] = proc; /* map row index to proc */
1299 
1300       if (proc != rank && !w2[proc]) {
1301         w2[proc] = 1; nrqs++;
1302       }
1303     }
1304     w1[rank] = 0;  /* rows owned by self will not be requested */
1305 
1306     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
1307     for (proc=0,j=0; proc<size; proc++) {
1308       if (w1[proc]) { pa[j++] = proc;}
1309     }
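    /* a worked example with hypothetical values: with irow = {0,1,5,6,9} on
       rank 0 and ownership ranges rows 0-3 on rank 0, 4-7 on rank 1, 8-9 on
       rank 2, the loops above give
         row2proc = {0,0,1,1,2}, w1 = {0,2,1} (after zeroing self),
         w2 = {0,1,1}, nrqs = 2, pa = {1,2} */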
1310 
1311     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1312     msz = 0;              /* total mesg length (for all procs) */
1313     for (i=0; i<nrqs; i++) {
1314       proc      = pa[i];
1315       w1[proc] += 3;
1316       msz      += w1[proc];
1317     }
1318     ierr = PetscInfo2(0,"Number of outgoing messages %D, total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1319 
1320     /* Determine nrqr, the number of messages to expect, their lengths, and their source ranks */
1321     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1322     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1323 
1324     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1325        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1326     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1327 
1328     /* Now post the Irecvs corresponding to these messages */
1329     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1330 
1331     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1332     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1333 
1334     /* Allocate Memory for outgoing messages */
1335     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1336     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1337     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1338 
1339     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1340     iptr = tmp;
1341     for (i=0; i<nrqs; i++) {
1342       proc        = pa[i];
1343       sbuf1[proc] = iptr;
1344       iptr       += w1[proc];
1345     }
1346 
1347     /* Form the outgoing messages */
1348     /* Initialize the header space */
1349     for (i=0; i<nrqs; i++) {
1350       proc      = pa[i];
1351       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1352       ptr[proc] = sbuf1[proc] + 3;
1353     }
1354 
1355     /* Parse the isrow and copy data into outbuf */
1356     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1357     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1358       proc = row2proc[j];
1359       if (proc != rank) { /* copy to the outgoing buf */
1360         *ptr[proc] = irow[j];
1361         ctr[proc]++; ptr[proc]++;
1362       }
1363     }
1364 
1365     /* Update the headers for the current IS */
1366     for (j=0; j<size; j++) { /* Could optimize this loop too */
1367       if ((ctr_j = ctr[j])) {
1368         sbuf1_j        = sbuf1[j];
1369         k              = ++sbuf1_j[0];
1370         sbuf1_j[2*k]   = ctr_j;
1371         sbuf1_j[2*k-1] = 0;
1372       }
1373     }
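    /* Layout sketch of a completed request sbuf1[proc] in this single-IS case:
         sbuf1[proc][0] = 1          number of index sets packed (always 1 here)
         sbuf1[proc][1] = 0          is_no of that set
         sbuf1[proc][2] = ctr[proc]  number of row indices that follow
         sbuf1[proc][3..]            the requested global row indices
       matching the "1 + 2*(num of IS)" header formula noted above. */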
1374 
1375     /* Now post the sends */
1376     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1377     for (i=0; i<nrqs; ++i) {
1378       proc = pa[i];
1379       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1380     }
1381 
1382     /* Post Receives to capture the buffer size */
1383     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1384     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1385 
1386     rbuf2[0] = tmp + msz;
1387     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1388 
1389     for (i=0; i<nrqs; ++i) {
1390       proc = pa[i];
1391       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1392     }
1393 
1394     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1395 
1396     /* Send to other procs the buf size they should allocate */
1397     /* Receive messages */
1398     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1399     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1400 
1401     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1402     for (i=0; i<nrqr; ++i) {
1403       req_size[i] = 0;
1404       rbuf1_i        = rbuf1[i];
1405       start          = 2*rbuf1_i[0] + 1;
1406       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1407       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1408       sbuf2_i        = sbuf2[i];
1409       for (j=start; j<end; j++) {
1410         k            = rbuf1_i[j] - rstart;
1411         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1412         sbuf2_i[j]   = ncols;
1413         req_size[i] += ncols;
1414       }
1415       req_source1[i] = r_status1[i].MPI_SOURCE;
1416 
1417       /* form the header */
1418       sbuf2_i[0] = req_size[i];
1419       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1420 
1421       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1422     }
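    /* Reply layout sketch: sbuf2_i[0] carries the total number of nonzeros the
       requester must allocate for, sbuf2_i[1..start-1] echoes the request
       header, and sbuf2_i[start..end-1] gives the nonzero count of each
       requested row (diagonal part A plus off-diagonal part B). */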
1423 
1424     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1425     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1426 
1427     /* rbuf2 is received; post Irecvs for the column indices a->j */
1428     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1429 
1430     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1431     for (i=0; i<nrqs; ++i) {
1432       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1433       req_source2[i] = r_status2[i].MPI_SOURCE;
1434       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1435     }
1436 
1437     /* Wait on sends1 and sends2 */
1438     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1439     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1440     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1441     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1442 
1443     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1444     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1445 
1446     /* Now allocate sending buffers for a->j, and send them off */
1447     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1448     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1449     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1450     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1451 
1452     for (i=0; i<nrqr; i++) { /* for each requested message */
1453       rbuf1_i   = rbuf1[i];
1454       sbuf_aj_i = sbuf_aj[i];
1455       ct1       = 2*rbuf1_i[0] + 1;
1456       ct2       = 0;
1457       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1458 
1459       kmax = rbuf1[i][2];
1460       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1461         row    = rbuf1_i[ct1] - rstart;
1462         nzA    = ai[row+1] - ai[row];
1463         nzB    = bi[row+1] - bi[row];
1464         ncols  = nzA + nzB;
1465         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1466 
1467         /* load the column indices for this row into cols*/
1468         cols = sbuf_aj_i + ct2;
1469 
1470         lwrite = 0;
1471         for (l=0; l<nzB; l++) {
1472           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1473         }
1474         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1475         for (l=0; l<nzB; l++) {
1476           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1477         }
1478 
1479         ct2 += ncols;
1480       }
1481       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1482     }
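    /* The three passes above emit each row's columns in ascending global order:
       B-part columns left of the diagonal block (< cstart), then A-part columns
       shifted by cstart, then B-part columns at or beyond cend. This relies on
       bmap (= c->garray) being sorted, so the receiver never has to sort. */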
1483 
1484     /* create column map (cmap): global col of C -> local col of submat */
1485 #if defined(PETSC_USE_CTABLE)
1486     if (!allcolumns) {
1487       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1488       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1489       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1490         if (icol[j] >= cstart && icol[j] <cend) {
1491           cmap_loc[icol[j] - cstart] = j+1;
1492         } else { /* use PetscTable for non-local col indices */
1493           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1494         }
1495       }
1496     } else {
1497       cmap     = NULL;
1498       cmap_loc = NULL;
1499     }
1500     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1501 #else
1502     if (!allcolumns) {
1503       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1504       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1505     } else {
1506       cmap = NULL;
1507     }
1508 #endif
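    /* Note the shift-by-one convention: cmap (and cmap_loc) store j+1 for
       submat column j, so a lookup returning 0 means "this global column is not
       in the submatrix"; users of the map undo the shift with --tcol. */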
1509 
1510     /* Create lens for MatSeqAIJSetPreallocation() */
1511     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1512 
1513     /* Compute lens from local part of C */
1514     for (j=0; j<nrow; j++) {
1515       row  = irow[j];
1516       proc = row2proc[j];
1517       if (proc == rank) {
1518         /* diagonal part A = c->A */
1519         ncols = ai[row-rstart+1] - ai[row-rstart];
1520         cols  = aj + ai[row-rstart];
1521         if (!allcolumns) {
1522           for (k=0; k<ncols; k++) {
1523 #if defined(PETSC_USE_CTABLE)
1524             tcol = cmap_loc[cols[k]];
1525 #else
1526             tcol = cmap[cols[k]+cstart];
1527 #endif
1528             if (tcol) lens[j]++;
1529           }
1530         } else { /* allcolumns */
1531           lens[j] = ncols;
1532         }
1533 
1534         /* off-diagonal part B = c->B */
1535         ncols = bi[row-rstart+1] - bi[row-rstart];
1536         cols  = bj + bi[row-rstart];
1537         if (!allcolumns) {
1538           for (k=0; k<ncols; k++) {
1539 #if defined(PETSC_USE_CTABLE)
1540             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1541 #else
1542             tcol = cmap[bmap[cols[k]]];
1543 #endif
1544             if (tcol) lens[j]++;
1545           }
1546         } else { /* allcolumns */
1547           lens[j] += ncols;
1548         }
1549       }
1550     }
1551 
1552     /* Create row map (rmap): global row of C -> local row of submat */
1553 #if defined(PETSC_USE_CTABLE)
1554     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1555     for (j=0; j<nrow; j++) {
1556       row  = irow[j];
1557       proc = row2proc[j];
1558       if (proc == rank) { /* a local row */
1559         rmap_loc[row - rstart] = j;
1560       } else {
1561         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1562       }
1563     }
1564 #else
1565     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1566     for (j=0; j<nrow; j++) {
1567       rmap[irow[j]] = j;
1568     }
1569 #endif
1570 
1571     /* Update lens from offproc data */
1572     /* recv a->j is done */
1573     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1574     for (i=0; i<nrqs; i++) {
1575       proc    = pa[i];
1576       sbuf1_i = sbuf1[proc];
1577       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1578       ct1     = 2 + 1;
1579       ct2     = 0;
1580       rbuf2_i = rbuf2[i]; /* received length of C->j */
1581       rbuf3_i = rbuf3[i]; /* received C->j */
1582 
1583       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1584       max1   = sbuf1_i[2];
1585       for (k=0; k<max1; k++,ct1++) {
1586 #if defined(PETSC_USE_CTABLE)
1587         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1588         row--;
1589         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1590 #else
1591         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1592 #endif
1593         /* Now, store row index of submat in sbuf1_i[ct1] */
1594         sbuf1_i[ct1] = row;
1595 
1596         nnz = rbuf2_i[ct1];
1597         if (!allcolumns) {
1598           for (l=0; l<nnz; l++,ct2++) {
1599 #if defined(PETSC_USE_CTABLE)
1600             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1601               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1602             } else {
1603               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1604             }
1605 #else
1606             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1607 #endif
1608             if (tcol) lens[row]++;
1609           }
1610         } else { /* allcolumns */
1611           lens[row] += nnz;
1612         }
1613       }
1614     }
1615     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1616     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1617 
1618     /* Create the submatrices */
1619     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1620     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1621 
1622     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1623     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1624     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1625     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1626     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1627 
1628     /* create struct Mat_SubMat and attach it to submat */
1629     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1630     subc = (Mat_SeqAIJ*)submat->data;
1631     subc->submatis1 = smatis1;
1632 
1633     smatis1->id          = 0;
1634     smatis1->nrqs        = nrqs;
1635     smatis1->nrqr        = nrqr;
1636     smatis1->rbuf1       = rbuf1;
1637     smatis1->rbuf2       = rbuf2;
1638     smatis1->rbuf3       = rbuf3;
1639     smatis1->sbuf2       = sbuf2;
1640     smatis1->req_source2 = req_source2;
1641 
1642     smatis1->sbuf1       = sbuf1;
1643     smatis1->ptr         = ptr;
1644     smatis1->tmp         = tmp;
1645     smatis1->ctr         = ctr;
1646 
1647     smatis1->pa           = pa;
1648     smatis1->req_size     = req_size;
1649     smatis1->req_source1  = req_source1;
1650 
1651     smatis1->allcolumns  = allcolumns;
1652     smatis1->singleis    = PETSC_TRUE;
1653     smatis1->row2proc    = row2proc;
1654     smatis1->rmap        = rmap;
1655     smatis1->cmap        = cmap;
1656 #if defined(PETSC_USE_CTABLE)
1657     smatis1->rmap_loc    = rmap_loc;
1658     smatis1->cmap_loc    = cmap_loc;
1659 #endif
1660 
1661     smatis1->destroy     = submat->ops->destroy;
1662     submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
1663     submat->factortype   = C->factortype;
1664 
1665     /* compute rmax */
1666     rmax = 0;
1667     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1668 
1669   } else { /* scall == MAT_REUSE_MATRIX */
1670     submat = submats[0];
1671     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1672 
1673     subc    = (Mat_SeqAIJ*)submat->data;
1674     rmax    = subc->rmax;
1675     smatis1 = subc->submatis1;
1676     nrqs        = smatis1->nrqs;
1677     nrqr        = smatis1->nrqr;
1678     rbuf1       = smatis1->rbuf1;
1679     rbuf2       = smatis1->rbuf2;
1680     rbuf3       = smatis1->rbuf3;
1681     req_source2 = smatis1->req_source2;
1682 
1683     sbuf1     = smatis1->sbuf1;
1684     sbuf2     = smatis1->sbuf2;
1685     ptr       = smatis1->ptr;
1686     tmp       = smatis1->tmp;
1687     ctr       = smatis1->ctr;
1688 
1689     pa         = smatis1->pa;
1690     req_size   = smatis1->req_size;
1691     req_source1 = smatis1->req_source1;
1692 
1693     allcolumns = smatis1->allcolumns;
1694     row2proc   = smatis1->row2proc;
1695     rmap       = smatis1->rmap;
1696     cmap       = smatis1->cmap;
1697 #if defined(PETSC_USE_CTABLE)
1698     rmap_loc   = smatis1->rmap_loc;
1699     cmap_loc   = smatis1->cmap_loc;
1700 #endif
1701   }
1702 
1703   /* Post recv matrix values */
1704   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1705   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1706   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1707   for (i=0; i<nrqs; ++i) {
1708     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1709     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1710   }
1711 
1712   /* Allocate sending buffers for a->a, and send them off */
1713   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1714   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1715   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1716   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1717 
1718   for (i=0; i<nrqr; i++) {
1719     rbuf1_i   = rbuf1[i];
1720     sbuf_aa_i = sbuf_aa[i];
1721     ct1       = 2*rbuf1_i[0]+1;
1722     ct2       = 0;
1723     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1724 
1725     kmax = rbuf1_i[2];
1726     for (k=0; k<kmax; k++,ct1++) {
1727       row = rbuf1_i[ct1] - rstart;
1728       nzA = ai[row+1] - ai[row];
1729       nzB = bi[row+1] - bi[row];
1730       ncols  = nzA + nzB;
1731       cworkB = bj + bi[row];
1732       vworkA = a_a + ai[row];
1733       vworkB = b_a + bi[row];
1734 
1735       /* load the column values for this row into vals */
1736       vals = sbuf_aa_i + ct2;
1737 
1738       lwrite = 0;
1739       for (l=0; l<nzB; l++) {
1740         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1741       }
1742       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1743       for (l=0; l<nzB; l++) {
1744         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1745       }
1746 
1747       ct2 += ncols;
1748     }
1749     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1750   }
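  /* The values are packed in exactly the same three-pass order as the column
     indices were during the MAT_INITIAL_MATRIX phase, so on the requesting
     side rbuf3[i][k] (a column index) and rbuf4[i][k] (its value) remain
     aligned entry by entry. */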
1751 
1752   /* Assemble submat */
1753   /* First assemble the local rows */
1754   for (j=0; j<nrow; j++) {
1755     row  = irow[j];
1756     proc = row2proc[j];
1757     if (proc == rank) {
1758       Crow = row - rstart;  /* local row index of C */
1759 #if defined(PETSC_USE_CTABLE)
1760       row = rmap_loc[Crow]; /* row index of submat */
1761 #else
1762       row = rmap[row];
1763 #endif
1764 
1765       if (allcolumns) {
1766         /* diagonal part A = c->A */
1767         ncols = ai[Crow+1] - ai[Crow];
1768         cols  = aj + ai[Crow];
1769         vals  = a->a + ai[Crow];
1770         i     = 0;
1771         for (k=0; k<ncols; k++) {
1772           subcols[i]   = cols[k] + cstart;
1773           subvals[i++] = vals[k];
1774         }
1775 
1776         /* off-diagonal part B = c->B */
1777         ncols = bi[Crow+1] - bi[Crow];
1778         cols  = bj + bi[Crow];
1779         vals  = b->a + bi[Crow];
1780         for (k=0; k<ncols; k++) {
1781           subcols[i]   = bmap[cols[k]];
1782           subvals[i++] = vals[k];
1783         }
1784 
1785         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1786 
1787       } else { /* !allcolumns */
1788 #if defined(PETSC_USE_CTABLE)
1789         /* diagonal part A = c->A */
1790         ncols = ai[Crow+1] - ai[Crow];
1791         cols  = aj + ai[Crow];
1792         vals  = a->a + ai[Crow];
1793         i     = 0;
1794         for (k=0; k<ncols; k++) {
1795           tcol = cmap_loc[cols[k]];
1796           if (tcol) {
1797             subcols[i]   = --tcol;
1798             subvals[i++] = vals[k];
1799           }
1800         }
1801 
1802         /* off-diagonal part B = c->B */
1803         ncols = bi[Crow+1] - bi[Crow];
1804         cols  = bj + bi[Crow];
1805         vals  = b->a + bi[Crow];
1806         for (k=0; k<ncols; k++) {
1807           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1808           if (tcol) {
1809             subcols[i]   = --tcol;
1810             subvals[i++] = vals[k];
1811           }
1812         }
1813 #else
1814         /* diagonal part A = c->A */
1815         ncols = ai[Crow+1] - ai[Crow];
1816         cols  = aj + ai[Crow];
1817         vals  = a->a + ai[Crow];
1818         i     = 0;
1819         for (k=0; k<ncols; k++) {
1820           tcol = cmap[cols[k]+cstart];
1821           if (tcol) {
1822             subcols[i]   = --tcol;
1823             subvals[i++] = vals[k];
1824           }
1825         }
1826 
1827         /* off-diagonal part B = c->B */
1828         ncols = bi[Crow+1] - bi[Crow];
1829         cols  = bj + bi[Crow];
1830         vals  = b->a + bi[Crow];
1831         for (k=0; k<ncols; k++) {
1832           tcol = cmap[bmap[cols[k]]];
1833           if (tcol) {
1834             subcols[i]   = --tcol;
1835             subvals[i++] = vals[k];
1836           }
1837         }
1838 #endif
1839         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1840       }
1841     }
1842   }
1843 
1844   /* Now assemble the off-proc rows */
1845   for (i=0; i<nrqs; i++) { /* for each requested message */
1846     /* recv values from other processes */
1847     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1848     proc    = pa[idex];
1849     sbuf1_i = sbuf1[proc];
1850     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1851     ct1     = 2 + 1;
1852     ct2     = 0; /* count of received C->j */
1853     ct3     = 0; /* count of received C->j that will be inserted into submat */
1854     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1855     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1856     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1857 
1858     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1859     max1 = sbuf1_i[2];             /* num of rows */
1860     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1861       row = sbuf1_i[ct1]; /* row index of submat */
1862       if (!allcolumns) {
1863         idex = 0;
1864         if (scall == MAT_INITIAL_MATRIX) {
1865           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1866           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1867 #if defined(PETSC_USE_CTABLE)
1868             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1869               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1870             } else {
1871               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1872             }
1873 #else
1874             tcol = cmap[rbuf3_i[ct2]];
1875 #endif
1876             if (tcol) {
1877               subcols[idex]   = --tcol;
1878               subvals[idex++] = rbuf4_i[ct2];
1879 
1880               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1881                  For reuse, we overwrite the received C->j in place with the indices that get inserted into submat */
1882               rbuf3_i[ct3++] = ct2;
1883             }
1884           }
1885           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1886 
1887         } else { /* scall == MAT_REUSE_MATRIX */
1888           submat = submats[0];
1889           subc   = (Mat_SeqAIJ*)submat->data;
1890 
1891           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1892           for (l=0; l<nnz; l++) {
1893             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1894             subvals[idex++] = rbuf4_i[ct2];
1895           }
1896 
1897           bj = subc->j + subc->i[row];
1898           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1899         }
1900       } else { /* allcolumns */
1901         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1902         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1903         ct2 += nnz;
1904       }
1905     }
1906   }
1907 
1908   /* sends of a->a are done */
1909   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1910   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1911 
1912   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1913   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1914   submats[0] = submat;
1915 
1916   /* Restore the indices */
1917   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1918   if (!allcolumns) {
1919     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1920   }
1921 
1922   /* Destroy allocated memory */
1923   for (i=0; i<nrqs; ++i) {
1924     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); /* each rbuf4[i] came from its own PetscMalloc1(); rbuf4, subcols and subvals are freed together below */
1925   }
1926   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1927   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1928   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1929 
1930   if (scall == MAT_INITIAL_MATRIX) {
1931     ierr = PetscFree(lens);CHKERRQ(ierr);
1932     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1933     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1934   }
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1939 {
1940   PetscErrorCode ierr;
1941   PetscInt       ncol;
1942   PetscBool      colflag,allcolumns=PETSC_FALSE;
1943 
1944   PetscFunctionBegin;
1945   /* Allocate memory to hold all the submatrices */
1946   if (scall == MAT_INITIAL_MATRIX) {
1947     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1948   }
1949 
1950   /* Check for special case: each processor gets entire matrix columns */
1951   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1952   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1953   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1954 
1955   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1956   PetscFunctionReturn(0);
1957 }
1958 
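/*
   A minimal usage sketch of the single-IS path (illustrative only; the stride
   bounds and the direct flag assignment below are assumptions, not taken from
   this file -- the flag is normally set by PCSetUp_ASM()):

     IS  isrow,iscol;
     Mat *submat;

     ierr = ISCreateStride(PETSC_COMM_SELF,m,rstart,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);CHKERRQ(ierr);
     C->submat_singleis = PETSC_TRUE;
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   Because iscol is the identity over all N columns, allcolumns becomes
   PETSC_TRUE and the column maps are skipped entirely.
*/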
1959 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1960 {
1961   PetscErrorCode ierr;
1962   PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
1963   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1964   Mat_SeqAIJ     *subc;
1965   Mat_SubMat     *smat;
1966 
1967   PetscFunctionBegin;
1968   /* Check for special case: each processor has a single IS */
1969   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1970     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1971     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with multiple index sets */
1972     PetscFunctionReturn(0);
1973   }
1974 
1975   /* Determine nmax, the max number of submatrices handled per stage */
1976   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1977   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1978   if (!nmax) nmax = 1;
1979 
1980   if (scall == MAT_INITIAL_MATRIX) {
1981     /* Collect global wantallmatrix and nstages */
1982     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1983       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1984       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1985       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1986       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1987       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1988         wantallmatrix = PETSC_TRUE;
1989 
1990         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1991       }
1992     }
1993 
1994     /* Determine the number of stages through which submatrices are done
1995        Each stage will extract nmax submatrices.
1996        nmax is determined by the matrix column dimension.
1997        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1998     */
1999     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
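    /* Worked example (illustrative): with 8-byte PetscInts and C->cmap->N =
       1000000, nmax = 20*1000000/(1000000*8) = 2 submatrices per stage, so a
       request for ismax = 5 submatrices runs in nstages = 5/2 + 1 = 3 stages. */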
2000 
2001     in[0] = -1*(PetscInt)wantallmatrix;
2002     in[1] = nstages;
2003     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2004     wantallmatrix = (PetscBool)(-out[0]);
2005     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
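    /* Packing both values into one MPIU_Allreduce with MPI_MAX avoids a second
       reduction: wantallmatrix is negated so the maximum is -1 only when every
       rank set it (a logical AND), while nstages simply takes the global max. */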
2006 
2007   } else { /* MAT_REUSE_MATRIX */
2008     if (ismax) {
2009       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2010       smat = subc->submatis1;
2011     } else { /* (*submat)[0] is a dummy matrix */
2012       smat = (Mat_SubMat*)(*submat)[0]->data;
2013     }
2014     if (!smat) {
2015       /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2016       wantallmatrix = PETSC_TRUE;
2017     } else if (smat->singleis) {
2018       ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2019       PetscFunctionReturn(0);
2020     } else {
2021       nstages = smat->nstages;
2022     }
2023   }
2024 
2025   if (wantallmatrix) {
2026     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2027     PetscFunctionReturn(0);
2028   }
2029 
2030   /* Allocate memory to hold all the submatrices and dummy submatrices */
2031   if (scall == MAT_INITIAL_MATRIX) {
2032     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2033   }
2034 
2035 #if 0
2036   PetscMPIInt rank;
2037   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr);
2038   printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax);
2039 #endif
2040   for (i=0,pos=0; i<nstages; i++) {
2041     if (pos+nmax <= ismax) max_no = nmax;
2042     else if (pos == ismax) max_no = 0;
2043     else                   max_no = ismax-pos;
2044     /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */
2045     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2046     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2047       smat = (Mat_SubMat*)(*submat)[pos]->data;
2048       smat->nstages = nstages;
2049     }
2050     pos += max_no;
2051   }
2052 
2053   if (ismax && scall == MAT_INITIAL_MATRIX) {
2054     /* save nstages for reuse */
2055     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2056     smat = subc->submatis1;
2057     smat->nstages = nstages;
2058   }
2059   PetscFunctionReturn(0);
2060 }
2061 
2062 /* -------------------------------------------------------------------------*/
2063 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2064 {
2065   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2066   Mat            A  = c->A;
2067   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2068   const PetscInt **icol,**irow;
2069   PetscInt       *nrow,*ncol,start;
2070   PetscErrorCode ierr;
2071   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2072   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2073   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2074   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2075   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2076 #if defined(PETSC_USE_CTABLE)
2077   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2078 #else
2079   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2080 #endif
2081   const PetscInt *irow_i;
2082   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2083   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2084   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2085   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2086   MPI_Status     *r_status3,*r_status4,*s_status4;
2087   MPI_Comm       comm;
2088   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2089   PetscMPIInt    *onodes1,*olengths1,end;
2090   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2091   Mat_SubMat     *smat_i;
2092   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2093   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2094 
2095   PetscFunctionBegin;
2096   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2097   size = c->size;
2098   rank = c->rank;
2099 
2100   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2101   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2102 
2103   for (i=0; i<ismax; i++) {
2104     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2105     if (!issorted[i]) iscsorted = issorted[i];
2106 
2107     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2108 
2109     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2110     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2111 
2112     /* Check for special case: allcolumn */
2113     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2114     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2115     if (colflag && ncol[i] == C->cmap->N) {
2116       allcolumns[i] = PETSC_TRUE;
2117       icol[i] = NULL;
2118     } else {
2119       allcolumns[i] = PETSC_FALSE;
2120       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2121     }
2122   }
2123 
2124   if (scall == MAT_REUSE_MATRIX) {
2125     /* Assumes the new rows have the same lengths as the old rows */
2126     for (i=0; i<ismax; i++) {
2127       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2128       subc = (Mat_SeqAIJ*)submats[i]->data;
2129       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2130 
2131       /* Reset the matrix so it is as if empty */
2132       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2133 
2134       smat_i   = subc->submatis1;
2135 
2136       nrqs        = smat_i->nrqs;
2137       nrqr        = smat_i->nrqr;
2138       rbuf1       = smat_i->rbuf1;
2139       rbuf2       = smat_i->rbuf2;
2140       rbuf3       = smat_i->rbuf3;
2141       req_source2 = smat_i->req_source2;
2142 
2143       sbuf1     = smat_i->sbuf1;
2144       sbuf2     = smat_i->sbuf2;
2145       ptr       = smat_i->ptr;
2146       tmp       = smat_i->tmp;
2147       ctr       = smat_i->ctr;
2148 
2149       pa          = smat_i->pa;
2150       req_size    = smat_i->req_size;
2151       req_source1 = smat_i->req_source1;
2152 
2153       allcolumns[i] = smat_i->allcolumns;
2154       row2proc[i]   = smat_i->row2proc;
2155       rmap[i]       = smat_i->rmap;
2156       cmap[i]       = smat_i->cmap;
2157     }
2158 
2159     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2160       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2161       smat_i = (Mat_SubMat*)submats[0]->data;
2162 
2163       nrqs        = smat_i->nrqs;
2164       nrqr        = smat_i->nrqr;
2165       rbuf1       = smat_i->rbuf1;
2166       rbuf2       = smat_i->rbuf2;
2167       rbuf3       = smat_i->rbuf3;
2168       req_source2 = smat_i->req_source2;
2169 
2170       sbuf1       = smat_i->sbuf1;
2171       sbuf2       = smat_i->sbuf2;
2172       ptr         = smat_i->ptr;
2173       tmp         = smat_i->tmp;
2174       ctr         = smat_i->ctr;
2175 
2176       pa          = smat_i->pa;
2177       req_size    = smat_i->req_size;
2178       req_source1 = smat_i->req_source1;
2179 
2180       allcolumns[0] = PETSC_TRUE;
2181     }
2182   } else { /* scall == MAT_INITIAL_MATRIX */
2183     /* Get some new tags to keep the communication clean */
2184     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2185     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2186 
2187     /* evaluate communication - mesg to who, length of mesg, and buffer space
2188      required. Based on this, buffers are allocated, and data copied into them */
2189     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2190 
2191     for (i=0; i<ismax; i++) {
2192       jmax   = nrow[i];
2193       irow_i = irow[i];
2194 
2195       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2196       row2proc[i] = row2proc_i;
2197 
2198       if (issorted[i]) proc = 0;
2199       for (j=0; j<jmax; j++) {
2200         if (!issorted[i]) proc = 0;
2201         row = irow_i[j];
2202         while (row >= C->rmap->range[proc+1]) proc++;
2203         w4[proc]++;
2204         row2proc_i[j] = proc; /* map row index to proc */
2205       }
2206       for (j=0; j<size; j++) {
2207         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2208       }
2209     }
2210 
2211     nrqs     = 0;              /* no of outgoing messages */
2212     msz      = 0;              /* total mesg length (for all procs) */
2213     w1[rank] = 0;              /* no mesg sent to self */
2214     w3[rank] = 0;
2215     for (i=0; i<size; i++) {
2216       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2217     }
2218     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2219     for (i=0,j=0; i<size; i++) {
2220       if (w1[i]) { pa[j] = i; j++; }
2221     }
2222 
2223     /* Each message would have a header = 1 + 2*(no of IS) + data */
2224     for (i=0; i<nrqs; i++) {
2225       j      = pa[i];
2226       w1[j] += w2[j] + 2*w3[j];
2227       msz   += w1[j];
2228     }
2229     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2230 
2231     /* Determine the number of messages to expect, their lengths, and their from-ids */
2232     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2233     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2234 
2235     /* Now post the Irecvs corresponding to these messages */
2236     tag0 = ((PetscObject)C)->tag;
2237     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2238 
2239     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2240     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2241 
2242     /* Allocate Memory for outgoing messages */
2243     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2244     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2245     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2246 
2247     {
2248       PetscInt *iptr = tmp;
2249       k    = 0;
2250       for (i=0; i<nrqs; i++) {
2251         j        = pa[i];
2252         iptr    += k;
2253         sbuf1[j] = iptr;
2254         k        = w1[j];
2255       }
2256     }
2257 
2258     /* Form the outgoing messages. Initialize the header space */
2259     for (i=0; i<nrqs; i++) {
2260       j           = pa[i];
2261       sbuf1[j][0] = 0;
2262       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2263       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2264     }
2265 
2266     /* Parse the isrow and copy data into outbuf */
2267     for (i=0; i<ismax; i++) {
2268       row2proc_i = row2proc[i];
2269       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2270       irow_i = irow[i];
2271       jmax   = nrow[i];
2272       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2273         proc = row2proc_i[j];
2274         if (proc != rank) { /* copy to the outgoing buf */
2275           ctr[proc]++;
2276           *ptr[proc] = irow_i[j];
2277           ptr[proc]++;
2278         }
2279       }
2280       /* Update the headers for the current IS */
2281       for (j=0; j<size; j++) { /* Could optimize this loop too */
2282         if ((ctr_j = ctr[j])) {
2283           sbuf1_j        = sbuf1[j];
2284           k              = ++sbuf1_j[0];
2285           sbuf1_j[2*k]   = ctr_j;
2286           sbuf1_j[2*k-1] = i;
2287         }
2288       }
2289     }
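    /* Layout sketch of an outgoing request sbuf1[j] in this multi-IS case:
         sbuf1[j][0]                     number of index sets with rows on proc j
         sbuf1[j][2*k-1], sbuf1[j][2*k]  is_no and row count of the k-th such set
         sbuf1[j][2*w3[j]+1 ..]          the row indices, grouped by index set
       which is the "1 + 2*(no of IS) + data" format noted above. */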
2290 
2291     /* Now post the sends */
2292     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2293     for (i=0; i<nrqs; ++i) {
2294       j    = pa[i];
2295       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2296     }
2297 
2298     /* Post Receives to capture the buffer size */
2299     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2300     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2301     rbuf2[0] = tmp + msz;
2302     for (i=1; i<nrqs; ++i) {
2303       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2304     }
2305     for (i=0; i<nrqs; ++i) {
2306       j    = pa[i];
2307       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2308     }
2309 
2310     /* Send to other procs the buf size they should allocate */
2311     /* Receive messages */
2312     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2313     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2314     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2315     {
2316       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2317       PetscInt   *sbuf2_i;
2318 
2319       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2320       for (i=0; i<nrqr; ++i) {
2321         req_size[i] = 0;
2322         rbuf1_i        = rbuf1[i];
2323         start          = 2*rbuf1_i[0] + 1;
2324         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2325         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2326         sbuf2_i        = sbuf2[i];
2327         for (j=start; j<end; j++) {
2328           id              = rbuf1_i[j] - rstart;
2329           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2330           sbuf2_i[j]      = ncols;
2331           req_size[i] += ncols;
2332         }
2333         req_source1[i] = r_status1[i].MPI_SOURCE;
2334         /* form the header */
2335         sbuf2_i[0] = req_size[i];
2336         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2337 
2338         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2339       }
2340     }
2341     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2342     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2343     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2344 
2345     /* Receive messages */
2346     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2347     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2348 
2349     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2350     for (i=0; i<nrqs; ++i) {
2351       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2352       req_source2[i] = r_status2[i].MPI_SOURCE;
2353       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2354     }
2355     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2356     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2357 
2358     /* Wait on sends1 and sends2 */
2359     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2360     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2361 
2362     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2363     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2364     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2365     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2366     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2367     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2368 
2369     /* Now allocate sending buffers for a->j, and send them off */
2370     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2371     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2372     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2373     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2374 
2375     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2376     {
2377       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2378       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2379       PetscInt cend = C->cmap->rend;
2380       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2381 
2382       for (i=0; i<nrqr; i++) {
2383         rbuf1_i   = rbuf1[i];
2384         sbuf_aj_i = sbuf_aj[i];
2385         ct1       = 2*rbuf1_i[0] + 1;
2386         ct2       = 0;
2387         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2388           kmax = rbuf1[i][2*j];
2389           for (k=0; k<kmax; k++,ct1++) {
2390             row    = rbuf1_i[ct1] - rstart;
2391             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2392             ncols  = nzA + nzB;
2393             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2394 
2395             /* load the column indices for this row into cols */
2396             cols = sbuf_aj_i + ct2;
2397 
2398             lwrite = 0;
2399             for (l=0; l<nzB; l++) {
2400               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2401             }
2402             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2403             for (l=0; l<nzB; l++) {
2404               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2405             }
2406 
2407             ct2 += ncols;
2408           }
2409         }
2410         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2411       }
2412     }
2413     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2414 
2415     /* create col map: global col of C -> local col of submatrices */
2416     {
2417       const PetscInt *icol_i;
2418 #if defined(PETSC_USE_CTABLE)
2419       for (i=0; i<ismax; i++) {
2420         if (!allcolumns[i]) {
2421           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2422 
2423           jmax   = ncol[i];
2424           icol_i = icol[i];
2425           cmap_i = cmap[i];
2426           for (j=0; j<jmax; j++) {
2427             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2428           }
2429         } else cmap[i] = NULL;
2430       }
2431 #else
2432       for (i=0; i<ismax; i++) {
2433         if (!allcolumns[i]) {
2434           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2435           jmax   = ncol[i];
2436           icol_i = icol[i];
2437           cmap_i = cmap[i];
2438           for (j=0; j<jmax; j++) {
2439             cmap_i[icol_i[j]] = j+1;
2440           }
2441         } else cmap[i] = NULL;
2442       }
2443 #endif
2444     }
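    /* Design note: with PETSC_USE_CTABLE each column map is a hash table sized
       by ncol[i], so memory scales with the submatrix; the fallback allocates a
       dense array of length C->cmap->N per index set, which is simpler and
       faster to index but prohibitive when C has very many columns. */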
2445 
2446     /* Create lens, needed for MatSeqAIJSetPreallocation() below */
2447     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2448     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2449 
2450     if (ismax) {
2451       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2452     }
2453     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
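    /* lens is a single contiguous block: lens[0] holds all sum(nrow[i])
       counters and each lens[i] is an interior pointer into it, so one
       PetscFree(lens[0]) below releases every per-IS length array at once. */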
2454 
2455     /* Update lens from local data */
2456     for (i=0; i<ismax; i++) {
2457       row2proc_i = row2proc[i];
2458       jmax = nrow[i];
2459       if (!allcolumns[i]) cmap_i = cmap[i];
2460       irow_i = irow[i];
2461       lens_i = lens[i];
2462       for (j=0; j<jmax; j++) {
2463         row = irow_i[j];
2464         proc = row2proc_i[j];
2465         if (proc == rank) {
2466           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2467           if (!allcolumns[i]) {
2468             for (k=0; k<ncols; k++) {
2469 #if defined(PETSC_USE_CTABLE)
2470               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2471 #else
2472               tcol = cmap_i[cols[k]];
2473 #endif
2474               if (tcol) lens_i[j]++;
2475             }
2476           } else { /* allcolumns */
2477             lens_i[j] = ncols;
2478           }
2479           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2480         }
2481       }
2482     }
2483 
2484     /* Create row map: global row of C -> local row of submatrices */
2485 #if defined(PETSC_USE_CTABLE)
2486     for (i=0; i<ismax; i++) {
2487       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2488       irow_i = irow[i];
2489       jmax   = nrow[i];
2490       for (j=0; j<jmax; j++) {
2491         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2492       }
2493     }
2494 #else
2495     for (i=0; i<ismax; i++) {
2496       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2497       rmap_i = rmap[i];
2498       irow_i = irow[i];
2499       jmax   = nrow[i];
2500       for (j=0; j<jmax; j++) {
2501         rmap_i[irow_i[j]] = j;
2502       }
2503     }
2504 #endif
2505 
2506     /* Update lens from offproc data */
2507     {
2508       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2509 
2510       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2511       for (tmp2=0; tmp2<nrqs; tmp2++) {
2512         sbuf1_i = sbuf1[pa[tmp2]];
2513         jmax    = sbuf1_i[0];
2514         ct1     = 2*jmax+1;
2515         ct2     = 0;
2516         rbuf2_i = rbuf2[tmp2];
2517         rbuf3_i = rbuf3[tmp2];
2518         for (j=1; j<=jmax; j++) {
2519           is_no  = sbuf1_i[2*j-1];
2520           max1   = sbuf1_i[2*j];
2521           lens_i = lens[is_no];
2522           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2523           rmap_i = rmap[is_no];
2524           for (k=0; k<max1; k++,ct1++) {
2525 #if defined(PETSC_USE_CTABLE)
2526             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2527             row--;
2528             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2529 #else
2530             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new matrix */
2531 #endif
2532             max2 = rbuf2_i[ct1];
2533             for (l=0; l<max2; l++,ct2++) {
2534               if (!allcolumns[is_no]) {
2535 #if defined(PETSC_USE_CTABLE)
2536                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2537 #else
2538                 tcol = cmap_i[rbuf3_i[ct2]];
2539 #endif
2540                 if (tcol) lens_i[row]++;
2541               } else { /* allcolumns */
2542                 lens_i[row]++; /* lens_i[row] += max2 ? */
2543               }
2544             }
2545           }
2546         }
2547       }
2548     }
2549     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2550     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2551     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2552     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2553 
2554     /* Create the submatrices */
2555     for (i=0; i<ismax; i++) {
2556       PetscInt    rbs,cbs;
2557 
2558       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2559       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2560 
2561       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2562       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2563 
2564       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2565       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2566       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2567 
2568       /* create struct Mat_SubMat and attach it to submat */
2569       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2570       subc = (Mat_SeqAIJ*)submats[i]->data;
2571       subc->submatis1 = smat_i;
2572 
2573       smat_i->destroy          = submats[i]->ops->destroy;
2574       submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
2575       submats[i]->factortype   = C->factortype;
2576 
2577       smat_i->id          = i;
2578       smat_i->nrqs        = nrqs;
2579       smat_i->nrqr        = nrqr;
2580       smat_i->rbuf1       = rbuf1;
2581       smat_i->rbuf2       = rbuf2;
2582       smat_i->rbuf3       = rbuf3;
2583       smat_i->sbuf2       = sbuf2;
2584       smat_i->req_source2 = req_source2;
2585 
2586       smat_i->sbuf1       = sbuf1;
2587       smat_i->ptr         = ptr;
2588       smat_i->tmp         = tmp;
2589       smat_i->ctr         = ctr;
2590 
2591       smat_i->pa           = pa;
2592       smat_i->req_size     = req_size;
2593       smat_i->req_source1  = req_source1;
2594 
2595       smat_i->allcolumns  = allcolumns[i];
2596       smat_i->singleis    = PETSC_FALSE;
2597       smat_i->row2proc    = row2proc[i];
2598       smat_i->rmap        = rmap[i];
2599       smat_i->cmap        = cmap[i];
2600     }
2601 
2602     if (!ismax) { /* Create a dummy submats[0] to carry the struct needed for reuse */
2603       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2604       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2605       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2606 
2607       /* create struct Mat_SubMat and attach it to submat */
2608       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2609       submats[0]->data = (void*)smat_i;
2610 
2611       smat_i->destroy          = submats[0]->ops->destroy;
2612       submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
2613       submats[0]->factortype   = C->factortype;
2614 
2615       smat_i->id          = i;
2616       smat_i->nrqs        = nrqs;
2617       smat_i->nrqr        = nrqr;
2618       smat_i->rbuf1       = rbuf1;
2619       smat_i->rbuf2       = rbuf2;
2620       smat_i->rbuf3       = rbuf3;
2621       smat_i->sbuf2       = sbuf2;
2622       smat_i->req_source2 = req_source2;
2623 
2624       smat_i->sbuf1       = sbuf1;
2625       smat_i->ptr         = ptr;
2626       smat_i->tmp         = tmp;
2627       smat_i->ctr         = ctr;
2628 
2629       smat_i->pa           = pa;
2630       smat_i->req_size     = req_size;
2631       smat_i->req_source1  = req_source1;
2632 
2633       smat_i->allcolumns  = PETSC_TRUE;
2634       smat_i->singleis    = PETSC_FALSE;
2635       smat_i->row2proc    = NULL;
2636       smat_i->rmap        = NULL;
2637       smat_i->cmap        = NULL;
2638     }
2639 
2640     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2641     ierr = PetscFree(lens);CHKERRQ(ierr);
2642     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2643     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2644 
2645   } /* endof scall == MAT_INITIAL_MATRIX */
2646 
2647   /* Post recv matrix values */
2648   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2649   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2650   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2651   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2652   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2653   for (i=0; i<nrqs; ++i) {
2654     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2655     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2656   }
2657 
2658   /* Allocate sending buffers for a->a, and send them off */
2659   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2660   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2661   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2662   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2663 
2664   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2665   {
2666     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2667     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2668     PetscInt    cend   = C->cmap->rend;
2669     PetscInt    *b_j   = b->j;
2670     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2671 
2672     for (i=0; i<nrqr; i++) {
2673       rbuf1_i   = rbuf1[i];
2674       sbuf_aa_i = sbuf_aa[i];
2675       ct1       = 2*rbuf1_i[0]+1;
2676       ct2       = 0;
2677       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2678         kmax = rbuf1_i[2*j];
2679         for (k=0; k<kmax; k++,ct1++) {
2680           row    = rbuf1_i[ct1] - rstart;
2681           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2682           ncols  = nzA + nzB;
2683           cworkB = b_j + b_i[row];
2684           vworkA = a_a + a_i[row];
2685           vworkB = b_a + b_i[row];
2686 
2687           /* load the column values for this row into vals */
2688           vals = sbuf_aa_i+ct2;
2689 
2690           lwrite = 0;
2691           for (l=0; l<nzB; l++) {
2692             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2693           }
2694           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2695           for (l=0; l<nzB; l++) {
2696             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2697           }
2698 
2699           ct2 += ncols;
2700         }
2701       }
2702       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2703     }
2704   }
2705 
2706   /* Assemble the matrices */
2707   /* First assemble the local rows */
2708   for (i=0; i<ismax; i++) {
2709     row2proc_i = row2proc[i];
2710     subc      = (Mat_SeqAIJ*)submats[i]->data;
2711     imat_ilen = subc->ilen;
2712     imat_j    = subc->j;
2713     imat_i    = subc->i;
2714     imat_a    = subc->a;
2715 
2716     if (!allcolumns[i]) cmap_i = cmap[i];
2717     rmap_i = rmap[i];
2718     irow_i = irow[i];
2719     jmax   = nrow[i];
2720     for (j=0; j<jmax; j++) {
2721       row  = irow_i[j];
2722       proc = row2proc_i[j];
2723       if (proc == rank) {
2724         old_row = row;
2725 #if defined(PETSC_USE_CTABLE)
2726         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2727         row--;
2728 #else
2729         row = rmap_i[row];
2730 #endif
2731         ilen_row = imat_ilen[row];
2732         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2733         mat_i    = imat_i[row];
2734         mat_a    = imat_a + mat_i;
2735         mat_j    = imat_j + mat_i;
2736         if (!allcolumns[i]) {
2737           for (k=0; k<ncols; k++) {
2738 #if defined(PETSC_USE_CTABLE)
2739             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2740 #else
2741             tcol = cmap_i[cols[k]];
2742 #endif
2743             if (tcol) {
2744               *mat_j++ = tcol - 1;
2745               *mat_a++ = vals[k];
2746               ilen_row++;
2747             }
2748           }
2749         } else { /* allcolumns */
2750           for (k=0; k<ncols; k++) {
2751             *mat_j++ = cols[k];  /* global col index! */
2752             *mat_a++ = vals[k];
2753             ilen_row++;
2754           }
2755         }
2756         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2757 
2758         imat_ilen[row] = ilen_row;
2759       }
2760     }
2761   }
2762 
2763   /* Now assemble the off proc rows */
2764   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2765   for (tmp2=0; tmp2<nrqs; tmp2++) {
2766     sbuf1_i = sbuf1[pa[tmp2]];
2767     jmax    = sbuf1_i[0];
2768     ct1     = 2*jmax + 1;
2769     ct2     = 0;
2770     rbuf2_i = rbuf2[tmp2];
2771     rbuf3_i = rbuf3[tmp2];
2772     rbuf4_i = rbuf4[tmp2];
2773     for (j=1; j<=jmax; j++) {
2774       is_no     = sbuf1_i[2*j-1];
2775       rmap_i    = rmap[is_no];
2776       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2777       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2778       imat_ilen = subc->ilen;
2779       imat_j    = subc->j;
2780       imat_i    = subc->i;
2781       imat_a    = subc->a;
2782       max1      = sbuf1_i[2*j];
2783       for (k=0; k<max1; k++,ct1++) {
2784         row = sbuf1_i[ct1];
2785 #if defined(PETSC_USE_CTABLE)
2786         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2787         row--;
2788 #else
2789         row = rmap_i[row];
2790 #endif
2791         ilen  = imat_ilen[row];
2792         mat_i = imat_i[row];
2793         mat_a = imat_a + mat_i;
2794         mat_j = imat_j + mat_i;
2795         max2  = rbuf2_i[ct1];
2796         if (!allcolumns[is_no]) {
2797           for (l=0; l<max2; l++,ct2++) {
2798 #if defined(PETSC_USE_CTABLE)
2799             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2800 #else
2801             tcol = cmap_i[rbuf3_i[ct2]];
2802 #endif
2803             if (tcol) {
2804               *mat_j++ = tcol - 1;
2805               *mat_a++ = rbuf4_i[ct2];
2806               ilen++;
2807             }
2808           }
2809         } else { /* allcolumns */
2810           for (l=0; l<max2; l++,ct2++) {
2811             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2812             *mat_a++ = rbuf4_i[ct2];
2813             ilen++;
2814           }
2815         }
2816         imat_ilen[row] = ilen;
2817       }
2818     }
2819   }
2820 
2821   if (!iscsorted) { /* sort column indices of the rows */
2822     for (i=0; i<ismax; i++) {
2823       subc      = (Mat_SeqAIJ*)submats[i]->data;
2824       imat_j    = subc->j;
2825       imat_i    = subc->i;
2826       imat_a    = subc->a;
2827       imat_ilen = subc->ilen;
2828 
2829       if (allcolumns[i]) continue;
2830       jmax = nrow[i];
2831       for (j=0; j<jmax; j++) {
2832         mat_i = imat_i[j];
2833         mat_a = imat_a + mat_i;
2834         mat_j = imat_j + mat_i;
2835         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2836       }
2837     }
2838   }
2839 
2840   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2841   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2842   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2843   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2844   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2845 
2846   /* Restore the indices */
2847   for (i=0; i<ismax; i++) {
2848     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2849     if (!allcolumns[i]) {
2850       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2851     }
2852   }
2853 
2854   for (i=0; i<ismax; i++) {
2855     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2856     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2857   }
2858 
2859   /* Destroy allocated memory */
2860   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2861   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2862   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2863 
2864   for (i=0; i<nrqs; ++i) {
2865     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2866   }
2867   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2868 
2869   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2870   PetscFunctionReturn(0);
2871 }
2872 
2873 /*
2874  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2875  Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2876  of [0,m), that of dcolemb is in [0,n), and that of ocolemb is in [0,N-n).
2877  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2878  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2879  state, and needs to be "assembled" later by compressing B's column space.
2880 
2881  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2882  Following this call, C->A & C->B have been created, even if empty.
2883  */
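/*
 A hedged usage sketch (not compiled here; Aseq, Bseq and the embedding ISs are hypothetical names a
 caller would have prepared as described above):

   Mat C;                        // MPIAIJ matrix being filled
   Mat Aseq,Bseq;                // SEQAIJ diag and off-diag pieces
   IS  rowemb,dcolemb,ocolemb;   // injective embeddings into C's local index spaces

   ierr = MatSetSeqMats_MPIAIJ(C,rowemb,dcolemb,ocolemb,DIFFERENT_NONZERO_PATTERN,Aseq,Bseq);CHKERRQ(ierr);
   // C must subsequently be assembled so that aij->B is compressed into local form:
   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/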
2884 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2885 {
2886   /* If making this function public, change the error returned in this function away from _PLIB. */
2887   PetscErrorCode ierr;
2888   Mat_MPIAIJ     *aij;
2889   Mat_SeqAIJ     *Baij;
2890   PetscBool      seqaij,Bdisassembled;
2891   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2892   PetscScalar    v;
2893   const PetscInt *rowindices,*colindices;
2894 
2895   PetscFunctionBegin;
2896   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2897   if (A) {
2898     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2899     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2900     if (rowemb) {
2901       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2902       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2903     } else {
2904       if (C->rmap->n != A->rmap->n) {
2905 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2906       }
2907     }
2908     if (dcolemb) {
2909       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2910       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2911     } else {
2912       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2913     }
2914   }
2915   if (B) {
2916     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2917     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2918     if (rowemb) {
2919       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2920       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2921     } else {
2922       if (C->rmap->n != B->rmap->n) {
2923 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2924       }
2925     }
2926     if (ocolemb) {
2927       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2928       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2929     } else {
2930       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2931     }
2932   }
2933 
2934   aij = (Mat_MPIAIJ*)C->data;
2935   if (!aij->A) {
2936     /* Mimic parts of MatMPIAIJSetPreallocation() */
2937     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2938     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2939     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2940     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2941     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2942   }
2943   if (A) {
2944     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2945   } else {
2946     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2947   }
2948   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2949     /*
2950       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2951       need to "disassemble" B -- convert it to using C's global indices.
2952       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2953 
2954       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2955       we call MatZeroEntries() on it first, so the result may retain explicit zeros that could, in principle, be compacted out.
2956 
2957       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2958       At least avoid calling MatSetValues() and the implied searches?
2959     */
2960 
2961     if (pattern == DIFFERENT_NONZERO_PATTERN) { /* B is non-NULL inside this block */
2962 #if defined(PETSC_USE_CTABLE)
2963       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2964 #else
2965       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2966       /* A bit of a HACK: ideally the aij->B case should be handled all in one code block below. */
2967       if (aij->B) {
2968         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2969       }
2970 #endif
2971       ngcol = 0;
2972       if (aij->lvec) {
2973 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2974       }
2975       if (aij->garray) {
2976 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2977 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2978       }
2979       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2980       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2981     }
2982     if (aij->B && pattern == DIFFERENT_NONZERO_PATTERN) {
2983       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2984     }
2985     if (aij->B && pattern == SUBSET_NONZERO_PATTERN) {
2986       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2987     }
2988   }
2989   Bdisassembled = PETSC_FALSE;
2990   if (!aij->B) {
2991     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2992     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2993     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2994     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2995     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2996     Bdisassembled = PETSC_TRUE;
2997   }
2998   if (B) {
2999     Baij = (Mat_SeqAIJ*)B->data;
3000     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3001       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3002       for (i=0; i<B->rmap->n; i++) {
3003 	nz[i] = Baij->i[i+1] - Baij->i[i];
3004       }
3005       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3006       ierr = PetscFree(nz);CHKERRQ(ierr);
3007     }
3008 
3009     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3010     shift = rend-rstart;
3011     count = 0;
3012     rowindices = NULL;
3013     colindices = NULL;
3014     if (rowemb) {
3015       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3016     }
3017     if (ocolemb) {
3018       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3019     }
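    /*
       A worked instance of the shift below (illustrative numbers): with an ownership range
       [rstart,rend) = [10,20), shift = 10; an incoming column index 12 (>= rstart) becomes 22,
       jumping over the local diagonal block, while 7 (< rstart) is left unchanged. This is the
       "disassembled" placement of B's columns in C's global column space.
    */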
3020     for (i=0; i<B->rmap->n; i++) {
3021       PetscInt row;
3022       row = i;
3023       if (rowindices) row = rowindices[i];
3024       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3025 	col  = Baij->j[count];
3026 	if (colindices) col = colindices[col];
3027 	if (Bdisassembled && col>=rstart) col += shift;
3028 	v    = Baij->a[count];
3029 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3030 	++count;
3031       }
3032     }
3033     /* No assembly for aij->B is necessary. */
3034     /* FIXME: set aij->B's nonzerostate correctly. */
3035   } else {
3036     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3037   }
3038   C->preallocated  = PETSC_TRUE;
3039   C->was_assembled = PETSC_FALSE;
3040   C->assembled     = PETSC_FALSE;
3041   /*
3042      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3043      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3044   */
3045   PetscFunctionReturn(0);
3046 }
3047 
3048 /*
3049   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
3050  */
3051 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3052 {
3053   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3054 
3055   PetscFunctionBegin;
3056   PetscValidPointer(A,2);
3057   PetscValidPointer(B,3);
3058   /* FIXME: make sure C is assembled */
3059   *A = aij->A;
3060   *B = aij->B;
3061   /* Note that we don't incref *A and *B, so be careful! */
3062   PetscFunctionReturn(0);
3063 }
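/*
 A small decoding sketch (lcol/gcol are hypothetical names, not part of this file): for the B obtained
 above, the global column of C corresponding to an entry stored at B's local column lcol is

   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
   PetscInt   gcol = aij->garray[lcol];
*/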
3064 
3065 /*
3066   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3067   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3068 */
3069 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3070                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3071 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3072 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3073 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3074 {
3075   PetscErrorCode ierr;
3076   PetscMPIInt    isize,flag;
3077   PetscInt       i,ii,cismax,ispar;
3078   Mat            *A,*B;
3079   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3080 
3081   PetscFunctionBegin;
3082   if (!ismax) PetscFunctionReturn(0);
3083 
3084   for (i = 0, cismax = 0; i < ismax; ++i) {
3086     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3087     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3088     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3089     if (isize > 1) ++cismax;
3090   }
3091 
3092   /*
3093      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3094      ispar is the maximum of cismax over C's ranks, so it is nonzero iff at least one rank holds a parallel (comm size > 1) IS.
3095   */
3096   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3097   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3098     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3099     PetscFunctionReturn(0);
3100   }
3101 
3102   /* if (ispar) */
3103   /*
3104     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3105     These are used to extract the off-diag portion of the resulting parallel matrix.
3106     The row IS for the off-diag portion is the same as for the diag portion,
3107     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3108   */
3109   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3110   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3111   for (i = 0, ii = 0; i < ismax; ++i) {
3112     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3113     if (isize > 1) {
3114       /*
3115 	 TODO: This is the part that's ***NOT SCALABLE***.
3116 	 To fix this we need to extract just the indices of C's nonzero columns
3117 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3118 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3119 	 to be done without serializing on the IS list, so, most likely, it is best
3120 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3121       */
3122       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3123       /* Now we have to
3124 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3125 	     were sorted on each rank, concatenated they might no longer be sorted;
3126 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3127 	     indices in the nondecreasing order to the original index positions.
3128 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3129       */
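      /*
	 For instance (an illustrative reading of the above): if ciscol[ii] = {30,10,20}, the sorted IS
	 is {10,20,30} and ciscol_p = {1,2,0}; sorted position k holds the entry originally at position
	 ciscol_p[k]. For a strictly increasing ciscol[ii], ciscol_p comes back NULL.
      */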
3130       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3131       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3132       ++ii;
3133     }
3134   }
3135   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3136   for (i = 0, ii = 0; i < ismax; ++i) {
3137     PetscInt       j,issize;
3138     const PetscInt *indices;
3139 
3140     /*
3141        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3142      */
3143     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3144     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3145     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3146     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3147     for (j = 1; j < issize; ++j) {
3148       if (indices[j] == indices[j-1]) {
3149 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3150       }
3151     }
3152     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3153 
3154 
3155     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3156     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3157     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3158     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3159     for (j = 1; j < issize; ++j) {
3160       if (indices[j-1] == indices[j]) {
3161 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3162       }
3163     }
3164     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3165     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3166     if (isize > 1) {
3167       cisrow[ii] = isrow[i];
3168       ++ii;
3169     }
3170   }
3171   /*
3172     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3173     array of sequential matrices underlying the resulting parallel matrices.
3174     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3175     contain duplicates.
3176 
3177     There are as many diag matrices as there are original index sets. There are only as many parallel
3178     and off-diag matrices as there are parallel (comm size > 1) index sets.
3179 
3180     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3181     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3182       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3183       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3184       with A[i] and B[ii] extracted from the corresponding MPI submat.
3185     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3186       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3187       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3188       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3189       values into A[i] and B[ii] sitting inside the corresponding submat.
3190     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3191       A[i], B[ii], AA[i] or BB[ii] matrices.
3192   */
3193   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3194   if (scall == MAT_INITIAL_MATRIX) {
3195     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3196   }
3197 
3198   /* Now obtain the sequential A and B submatrices separately. */
3199   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3200   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3201   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3202 
3203   /*
3204     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3205     matrices A & B have been extracted directly into the parallel matrices containing them, or
3206     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3207     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3208     to have the same sparsity pattern.
3209     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3210     must be constructed for C. This is done by setseqmat(s).
3211   */
3212   for (i = 0, ii = 0; i < ismax; ++i) {
3213     /*
3214        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3215        That way we can avoid sorting and computing permutations when reusing.
3216        To this end:
3217         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3218 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3219 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3220     */
3221     MatStructure pattern;
3222     pattern = DIFFERENT_NONZERO_PATTERN;
3223 
3224     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3225     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3226     if (isize > 1) {
3227       if (scall == MAT_INITIAL_MATRIX) {
3228 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3229 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3230 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3231 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3232 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3233       }
3234       /*
3235 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3236       */
3237       {
3238 	Mat AA,BB;
3239         AA = A[i];
3240         BB = B[ii];
3241 	if (AA || BB) {
3242 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3243 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3244 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3245 	}
3246 
3247         ierr = MatDestroy(&AA);CHKERRQ(ierr);
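        /* BB (= B[ii]) is not destroyed here; the B array as a whole is released below via MatDestroySubMatrices(). */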
3248       }
3249       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3250       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3251       ++ii;
3252     } else { /* if (isize == 1) */
3253       if (scall == MAT_REUSE_MATRIX) {
3254         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3255       }
3256       if (isrow_p[i] || iscol_p[i]) {
3257         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3258         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3259 	/* Otherwise A is extracted straight into (*submat)[i]. */
3260 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3261 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3262       } else (*submat)[i] = A[i];
3263     }
3264     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3265     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3266   }
3267   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3268   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3269   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3270   ierr = PetscFree(A);CHKERRQ(ierr);
3271   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3272   PetscFunctionReturn(0);
3273 }
3274 
3275 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3276 {
3277   PetscErrorCode ierr;
3278 
3279   PetscFunctionBegin;
3280   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3281   PetscFunctionReturn(0);
3282 }
3283