xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 6b5c38ab690a1cd2002726c2fdb90dd77cd82dff)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap increase,
   memory-scalable version.

   Collects rows of is[] that are owned by remote ranks, ships them to the
   owners via PetscCommBuildTwoSided + PetscSF, receives back the column
   indices of those rows, and merges them into is[].  Local (on-rank) rows
   are handled by MatIncreaseOverlap_MPIAIJ_Local_Scalable.

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - the index sets, replaced in place with the enlarged sets
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: remember owner, originating IS, and the row itself */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* no rank has any remote rows, so no communication is needed;
   * it usually corresponds to overlap 1 — do local work only and return
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row takes a <is id, row> pair */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* build the SF leaf->root graph: one leaf per incoming entry, addressed by
     the sender's offset (fromsizes[2*i+1]) plus position */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes travel in the reverse direction */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: the local overlap pass runs while
     the reply data is in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received remote column indices into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges received overlap data
   into the index sets.

   recvdata is a packed stream of triples <is id, count, count rows>; each
   group is appended to the matching index set, after which duplicates are
   removed and each IS is rebuilt.

   Input Parameters:
+  mat      - the MPIAIJ matrix (unused here; kept for interface symmetry)
.  nidx     - number of index sets
.  is       - the index sets, destroyed and recreated with merged indices
.  nrecvs   - total number of PetscInt entries in recvdata
-  recvdata - the packed received data
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one stride of (max_lsize+nrecvs) per IS: enough for the current indices
     plus, worst case, every received entry landing in the same IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap;
     i advances through the packed stream inside the loop body */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and rebuild each IS */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Builds the reply data for rows
   requested by remote ranks.

   For every requested (is id, row) pair owned locally, gathers the column
   indices of that row from both the diagonal (Amat) and off-diagonal (Bmat)
   parts, deduplicates them per IS, and packs them as triples
   <is id, count, count columns> into a newly allocated send buffer.

   Input Parameters:
+  mat       - the MPIAIJ matrix
.  nidx      - number of index sets
.  nfrom     - number of requesting ranks
.  fromranks - the requesting ranks
.  fromsizes - per-rank <size, offset> pairs of the incoming request data
-  fromrows  - the packed incoming <is id, row> pairs

   Output Parameters:
+  sbrowsizes - per-rank <size, offset> pairs of the reply (caller frees)
-  sbrows     - the packed reply data (caller frees)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count columns per (rank, IS) so the reply buffer can be
     allocated exactly once */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* global row -> local row */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat: local column + cstart gives global column */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat: gcols maps to global columns */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: replay the same traversal, this time packing the data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the per-rank sizes into send offsets */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Local part of the scalable
   overlap increase.

   For each locally-owned row in is[i], collects the global column indices
   from the diagonal (Amat) and off-diagonal (Bmat) parts, deduplicates,
   and recreates is[i] from the result.  Rows owned by other ranks are
   skipped here (they are handled by the send/receive path).

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - the index sets, destroyed and recreated with the enlarged sets
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows handled elsewhere */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: shift local column by cstart to get global column */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: gcols maps compressed columns to global */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
447 */
/*
   MatIncreaseOverlap_MPIAIJ_Once - One level of overlap increase,
   non-scalable (all-to-all style) version.

   Each rank sends the rows of its index sets that are owned elsewhere to
   the owning ranks (phase 1, tag1), does the local overlap work while the
   messages are in flight, processes the received requests, and sends the
   resulting overlap data back (phase 2, tag2).  The replies are merged
   into local tables and the index sets are recreated from them.

   Input Parameters:
+  C    - the MPIAIJ matrix
.  imax - number of index sets
-  is   - the index sets, destroyed and recreated with the enlarged sets
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* separate tags for the request phase and the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w1[p]: total rows destined for rank p */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w2[p]: 1 if a message goes to rank p */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w3[p]: number of ISs contributing to rank p */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w4[p]: rows of is[i] owned by rank p */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into one outgoing message per target rank */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the phase-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests: xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header is <count, 2*count (is_no,size) pairs>, then data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* only first occurrence of a row is kept */
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array tdata by the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated table data */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores key+1 -> position+1; shift both back */
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total number of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - number of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc each xdata[i] separately;
then if a realloc is required we would need only copy the data for that one
message rather than all previous messages, as happens now where a single
large chunk of memory is used.

*/
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (!(ct3 < mem_estimate)) {
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (!(ct3 < mem_estimate)) {
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* Update the header*/
1024       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1042 {
1043   Mat            B;
1044   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1045   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046   PetscErrorCode ierr;
1047   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1048   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1049   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1050   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1051 
1052   PetscFunctionBegin;
1053   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1054   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1055 
1056   if (scall == MAT_INITIAL_MATRIX) {
1057     /* ----------------------------------------------------------------
1058          Tell every processor the number of nonzeros per row
1059     */
1060     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1061     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1062       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1063     }
1064     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1065     for (i=0; i<size; i++) {
1066       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1067       displs[i]     = A->rmap->range[i];
1068     }
1069 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1070     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1071 #else
1072     sendcount = A->rmap->rend - A->rmap->rstart;
1073     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1074 #endif
1075     /* ---------------------------------------------------------------
1076          Create the sequential matrix of the same type as the local block diagonal
1077     */
1078     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1079     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1080     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1081     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1082     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1083     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1084     **Bin = B;
1085     b     = (Mat_SeqAIJ*)B->data;
1086 
1087     /*--------------------------------------------------------------------
1088        Copy my part of matrix column indices over
1089     */
1090     sendcount  = ad->nz + bd->nz;
1091     jsendbuf   = b->j + b->i[rstarts[rank]];
1092     a_jsendbuf = ad->j;
1093     b_jsendbuf = bd->j;
1094     n          = A->rmap->rend - A->rmap->rstart;
1095     cnt        = 0;
1096     for (i=0; i<n; i++) {
1097 
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into useable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     sendcount = ad->nz + bd->nz;
1161     sendbuf   = b->a + b->i[rstarts[rank]];
1162     a_sendbuf = ad->a;
1163     b_sendbuf = bd->a;
1164     b_sendj   = bd->j;
1165     n         = A->rmap->rend - A->rmap->rstart;
1166     cnt       = 0;
1167     for (i=0; i<n; i++) {
1168 
1169       /* put in lower diagonal portion */
1170       m = bd->i[i+1] - bd->i[i];
1171       while (m > 0) {
1172         /* is it above diagonal (in bd (compressed) numbering) */
1173         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1174         sendbuf[cnt++] = *b_sendbuf++;
1175         m--;
1176         b_sendj++;
1177       }
1178 
1179       /* put in diagonal portion */
1180       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1181         sendbuf[cnt++] = *a_sendbuf++;
1182       }
1183 
1184       /* put in upper diagonal portion */
1185       while (m-- > 0) {
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         b_sendj++;
1188       }
1189     }
1190     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191 
1192     /* -----------------------------------------------------------------
1193        Gather all numerical values to all processors
1194     */
1195     if (!recvcounts) {
1196       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1197     }
1198     for (i=0; i<size; i++) {
1199       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1200     }
1201     displs[0] = 0;
1202     for (i=1; i<size; i++) {
1203       displs[i] = displs[i-1] + recvcounts[i-1];
1204     }
1205     recvbuf = b->a;
1206 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1207     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1208 #else
1209     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1210 #endif
1211   }  /* endof (flag == MAT_GET_VALUES) */
1212   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1213 
1214   if (A->symmetric) {
1215     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1216   } else if (A->hermitian) {
1217     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1218   } else if (A->structurally_symmetric) {
1219     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1220   }
1221   PetscFunctionReturn(0);
1222 }
1223 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1224 {
1225   PetscErrorCode ierr;
1226   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1227   Mat_SubMat     *submatj = c->submatis1;
1228   PetscInt       i;
1229 
1230   PetscFunctionBegin;
1231   ierr = submatj->destroy(C);CHKERRQ(ierr);
1232 
1233   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1234 
1235   for (i=0; i<submatj->nrqr; ++i) {
1236     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1237   }
1238   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1239 
1240   if (submatj->rbuf1) {
1241     ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1242     ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1243   }
1244 
1245   for (i=0; i<submatj->nrqs; ++i) {
1246     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1247   }
1248   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1249   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1250 
1251   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1252 
1253 #if defined(PETSC_USE_CTABLE)
1254   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1255   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1256   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1257 #else
1258   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1259 #endif
1260 
1261   if (!submatj->allcolumns) {
1262 #if defined(PETSC_USE_CTABLE)
1263     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1264 #else
1265     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1266 #endif
1267   }
1268 
1269   ierr = PetscFree(submatj);CHKERRQ(ierr);
1270   PetscFunctionReturn(0);
1271 }
1272 
1273 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1274 {
1275   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1276   Mat            submat,A = c->A,B = c->B;
1277   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1278   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1279   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1280   const PetscInt *icol,*irow;
1281   PetscInt       nrow,ncol,start;
1282   PetscErrorCode ierr;
1283   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1284   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1285   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1286   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1287   PetscInt       *lens,rmax,ncols,*cols,Crow;
1288 #if defined(PETSC_USE_CTABLE)
1289   PetscTable     cmap,rmap;
1290   PetscInt       *cmap_loc,*rmap_loc;
1291 #else
1292   PetscInt       *cmap,*rmap;
1293 #endif
1294   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1295   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1296   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1297   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1298   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1299   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1300   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1301   MPI_Comm       comm;
1302   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1303   PetscMPIInt    *onodes1,*olengths1,idex,end;
1304   Mat_SubMat     *smatis1;
1305   PetscBool      isrowsorted;
1306 
1307   PetscFunctionBegin;
1308   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1309 
1310   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1311   size = c->size;
1312   rank = c->rank;
1313 
1314   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1315   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1316 
1317   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1318   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1319   if (allcolumns) {
1320     icol = NULL;
1321     ncol = C->cmap->N;
1322   } else {
1323     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1324     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1325   }
1326 
1327   if (scall == MAT_INITIAL_MATRIX) {
1328     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1329 
1330     /* Get some new tags to keep the communication clean */
1331     tag1 = ((PetscObject)C)->tag;
1332     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1333     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1334 
1335     /* evaluate communication - mesg to who, length of mesg, and buffer space
1336      required. Based on this, buffers are allocated, and data copied into them */
1337     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1338     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1339 
1340     /* w1[proc] = num of rows owned by proc -- to be requested */
1341     proc = 0;
1342     nrqs = 0; /* num of outgoing messages */
1343     for (j=0; j<nrow; j++) {
1344       row  = irow[j]; /* sorted! */
1345       while (row >= C->rmap->range[proc+1]) proc++;
1346       w1[proc]++;
1347       row2proc[j] = proc; /* map row index to proc */
1348 
1349       if (proc != rank && !w2[proc]) {
1350         w2[proc] = 1; nrqs++;
1351       }
1352     }
1353     w1[rank] = 0;  /* rows owned by self will not be requested */
1354 
1355     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1356     for (proc=0,j=0; proc<size; proc++) {
1357       if (w1[proc]) { pa[j++] = proc;}
1358     }
1359 
1360     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1361     msz = 0;              /* total mesg length (for all procs) */
1362     for (i=0; i<nrqs; i++) {
1363       proc      = pa[i];
1364       w1[proc] += 3;
1365       msz      += w1[proc];
1366     }
1367     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1368 
1369     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1370     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1371     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1372 
1373     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1374        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1375     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1376 
1377     /* Now post the Irecvs corresponding to these messages */
1378     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1379 
1380     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1381     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1382 
1383     /* Allocate Memory for outgoing messages */
1384     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1385     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1386     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1387 
1388     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1389     iptr = tmp;
1390     for (i=0; i<nrqs; i++) {
1391       proc        = pa[i];
1392       sbuf1[proc] = iptr;
1393       iptr       += w1[proc];
1394     }
1395 
1396     /* Form the outgoing messages */
1397     /* Initialize the header space */
1398     for (i=0; i<nrqs; i++) {
1399       proc      = pa[i];
1400       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1401       ptr[proc] = sbuf1[proc] + 3;
1402     }
1403 
1404     /* Parse the isrow and copy data into outbuf */
1405     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1406     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1407       proc = row2proc[j];
1408       if (proc != rank) { /* copy to the outgoing buf*/
1409         *ptr[proc] = irow[j];
1410         ctr[proc]++; ptr[proc]++;
1411       }
1412     }
1413 
1414     /* Update the headers for the current IS */
1415     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1416       if ((ctr_j = ctr[j])) {
1417         sbuf1_j        = sbuf1[j];
1418         k              = ++sbuf1_j[0];
1419         sbuf1_j[2*k]   = ctr_j;
1420         sbuf1_j[2*k-1] = 0;
1421       }
1422     }
1423 
1424     /* Now post the sends */
1425     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1426     for (i=0; i<nrqs; ++i) {
1427       proc = pa[i];
1428       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1429     }
1430 
1431     /* Post Receives to capture the buffer size */
1432     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1433     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1434 
1435     rbuf2[0] = tmp + msz;
1436     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1437 
1438     for (i=0; i<nrqs; ++i) {
1439       proc = pa[i];
1440       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1441     }
1442 
1443     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1444 
1445     /* Send to other procs the buf size they should allocate */
1446     /* Receive messages*/
1447     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1448     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1449 
1450     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1451     for (i=0; i<nrqr; ++i) {
1452       req_size[i] = 0;
1453       rbuf1_i        = rbuf1[i];
1454       start          = 2*rbuf1_i[0] + 1;
1455       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1456       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1457       sbuf2_i        = sbuf2[i];
1458       for (j=start; j<end; j++) {
1459         k            = rbuf1_i[j] - rstart;
1460         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1461         sbuf2_i[j]   = ncols;
1462         req_size[i] += ncols;
1463       }
1464       req_source1[i] = r_status1[i].MPI_SOURCE;
1465 
1466       /* form the header */
1467       sbuf2_i[0] = req_size[i];
1468       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1469 
1470       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1471     }
1472 
1473     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1474     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1475 
1476     /* rbuf2 is received, Post recv column indices a->j */
1477     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1478 
1479     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1480     for (i=0; i<nrqs; ++i) {
1481       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1482       req_source2[i] = r_status2[i].MPI_SOURCE;
1483       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1484     }
1485 
1486     /* Wait on sends1 and sends2 */
1487     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1488     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1489     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1490     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1491 
1492     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1493     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1494 
1495     /* Now allocate sending buffers for a->j, and send them off */
1496     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1497     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1498     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1499     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1500 
1501     for (i=0; i<nrqr; i++) { /* for each requested message */
1502       rbuf1_i   = rbuf1[i];
1503       sbuf_aj_i = sbuf_aj[i];
1504       ct1       = 2*rbuf1_i[0] + 1;
1505       ct2       = 0;
1506       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1507 
1508       kmax = rbuf1[i][2];
1509       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1510         row    = rbuf1_i[ct1] - rstart;
1511         nzA    = ai[row+1] - ai[row];
1512         nzB    = bi[row+1] - bi[row];
1513         ncols  = nzA + nzB;
1514         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1515 
1516         /* load the column indices for this row into cols*/
1517         cols = sbuf_aj_i + ct2;
1518 
1519         lwrite = 0;
1520         for (l=0; l<nzB; l++) {
1521           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1522         }
1523         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1524         for (l=0; l<nzB; l++) {
1525           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1526         }
1527 
1528         ct2 += ncols;
1529       }
1530       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1531     }
1532 
1533     /* create column map (cmap): global col of C -> local col of submat */
1534 #if defined(PETSC_USE_CTABLE)
1535     if (!allcolumns) {
1536       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1537       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1538       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1539         if (icol[j] >= cstart && icol[j] <cend) {
1540           cmap_loc[icol[j] - cstart] = j+1;
1541         } else { /* use PetscTable for non-local col indices */
1542           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1543         }
1544       }
1545     } else {
1546       cmap     = NULL;
1547       cmap_loc = NULL;
1548     }
1549     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1550 #else
1551     if (!allcolumns) {
1552       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1553       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1554     } else {
1555       cmap = NULL;
1556     }
1557 #endif
1558 
1559     /* Create lens for MatSeqAIJSetPreallocation() */
1560     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1561 
1562     /* Compute lens from local part of C */
1563     for (j=0; j<nrow; j++) {
1564       row  = irow[j];
1565       proc = row2proc[j];
1566       if (proc == rank) {
1567         /* diagonal part A = c->A */
1568         ncols = ai[row-rstart+1] - ai[row-rstart];
1569         cols  = aj + ai[row-rstart];
1570         if (!allcolumns) {
1571           for (k=0; k<ncols; k++) {
1572 #if defined(PETSC_USE_CTABLE)
1573             tcol = cmap_loc[cols[k]];
1574 #else
1575             tcol = cmap[cols[k]+cstart];
1576 #endif
1577             if (tcol) lens[j]++;
1578           }
1579         } else { /* allcolumns */
1580           lens[j] = ncols;
1581         }
1582 
1583         /* off-diagonal part B = c->B */
1584         ncols = bi[row-rstart+1] - bi[row-rstart];
1585         cols  = bj + bi[row-rstart];
1586         if (!allcolumns) {
1587           for (k=0; k<ncols; k++) {
1588 #if defined(PETSC_USE_CTABLE)
1589             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1590 #else
1591             tcol = cmap[bmap[cols[k]]];
1592 #endif
1593             if (tcol) lens[j]++;
1594           }
1595         } else { /* allcolumns */
1596           lens[j] += ncols;
1597         }
1598       }
1599     }
1600 
1601     /* Create row map (rmap): global row of C -> local row of submat */
1602 #if defined(PETSC_USE_CTABLE)
1603     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1604     for (j=0; j<nrow; j++) {
1605       row  = irow[j];
1606       proc = row2proc[j];
1607       if (proc == rank) { /* a local row */
1608         rmap_loc[row - rstart] = j;
1609       } else {
1610         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1611       }
1612     }
1613 #else
1614     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1615     for (j=0; j<nrow; j++) {
1616       rmap[irow[j]] = j;
1617     }
1618 #endif
1619 
1620     /* Update lens from offproc data */
1621     /* recv a->j is done */
1622     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1623     for (i=0; i<nrqs; i++) {
1624       proc    = pa[i];
1625       sbuf1_i = sbuf1[proc];
1626       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1627       ct1     = 2 + 1;
1628       ct2     = 0;
1629       rbuf2_i = rbuf2[i]; /* received length of C->j */
1630       rbuf3_i = rbuf3[i]; /* received C->j */
1631 
1632       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1633       max1   = sbuf1_i[2];
1634       for (k=0; k<max1; k++,ct1++) {
1635 #if defined(PETSC_USE_CTABLE)
1636         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1637         row--;
1638         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1639 #else
1640         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1641 #endif
1642         /* Now, store row index of submat in sbuf1_i[ct1] */
1643         sbuf1_i[ct1] = row;
1644 
1645         nnz = rbuf2_i[ct1];
1646         if (!allcolumns) {
1647           for (l=0; l<nnz; l++,ct2++) {
1648 #if defined(PETSC_USE_CTABLE)
1649             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1650               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1651             } else {
1652               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1653             }
1654 #else
1655             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1656 #endif
1657             if (tcol) lens[row]++;
1658           }
1659         } else { /* allcolumns */
1660           lens[row] += nnz;
1661         }
1662       }
1663     }
1664     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1665     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1666 
1667     /* Create the submatrices */
1668     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1669     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1670 
1671     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1672     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1673     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1674     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1675     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1676 
1677     /* create struct Mat_SubMat and attached it to submat */
1678     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1679     subc = (Mat_SeqAIJ*)submat->data;
1680     subc->submatis1 = smatis1;
1681 
1682     smatis1->nrqs        = nrqs;
1683     smatis1->nrqr        = nrqr;
1684     smatis1->rbuf1       = rbuf1;
1685     smatis1->rbuf2       = rbuf2;
1686     smatis1->rbuf3       = rbuf3;
1687     smatis1->sbuf2       = sbuf2;
1688     smatis1->req_source2 = req_source2;
1689 
1690     smatis1->sbuf1       = sbuf1;
1691     smatis1->ptr         = ptr;
1692     smatis1->tmp         = tmp;
1693     smatis1->ctr         = ctr;
1694 
1695     smatis1->pa           = pa;
1696     smatis1->req_size     = req_size;
1697     smatis1->req_source1  = req_source1;
1698 
1699     smatis1->allcolumns  = allcolumns;
1700     smatis1->row2proc    = row2proc;
1701     smatis1->rmap        = rmap;
1702     smatis1->cmap        = cmap;
1703 #if defined(PETSC_USE_CTABLE)
1704     smatis1->rmap_loc    = rmap_loc;
1705     smatis1->cmap_loc    = cmap_loc;
1706 #endif
1707 
1708     smatis1->destroy     = submat->ops->destroy;
1709     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1710     submat->factortype   = C->factortype;
1711 
1712     /* compute rmax */
1713     rmax = 0;
1714     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1715 
1716   } else { /* scall == MAT_REUSE_MATRIX */
1717     submat = submats[0];
1718     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1719 
1720     subc    = (Mat_SeqAIJ*)submat->data;
1721     rmax    = subc->rmax;
1722     smatis1 = subc->submatis1;
1723     nrqs        = smatis1->nrqs;
1724     nrqr        = smatis1->nrqr;
1725     rbuf1       = smatis1->rbuf1;
1726     rbuf2       = smatis1->rbuf2;
1727     rbuf3       = smatis1->rbuf3;
1728     req_source2 = smatis1->req_source2;
1729 
1730     sbuf1     = smatis1->sbuf1;
1731     sbuf2     = smatis1->sbuf2;
1732     ptr       = smatis1->ptr;
1733     tmp       = smatis1->tmp;
1734     ctr       = smatis1->ctr;
1735 
1736     pa         = smatis1->pa;
1737     req_size   = smatis1->req_size;
1738     req_source1 = smatis1->req_source1;
1739 
1740     allcolumns = smatis1->allcolumns;
1741     row2proc   = smatis1->row2proc;
1742     rmap       = smatis1->rmap;
1743     cmap       = smatis1->cmap;
1744 #if defined(PETSC_USE_CTABLE)
1745     rmap_loc   = smatis1->rmap_loc;
1746     cmap_loc   = smatis1->cmap_loc;
1747 #endif
1748   }
1749 
1750   /* Post recv matrix values */
1751   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1752   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1753   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1754   for (i=0; i<nrqs; ++i) {
1755     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1756     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1757   }
1758 
1759   /* Allocate sending buffers for a->a, and send them off */
1760   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1761   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1762   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1763   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1764 
1765   for (i=0; i<nrqr; i++) {
1766     rbuf1_i   = rbuf1[i];
1767     sbuf_aa_i = sbuf_aa[i];
1768     ct1       = 2*rbuf1_i[0]+1;
1769     ct2       = 0;
1770     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1771 
1772     kmax = rbuf1_i[2];
1773     for (k=0; k<kmax; k++,ct1++) {
1774       row = rbuf1_i[ct1] - rstart;
1775       nzA = ai[row+1] - ai[row];
1776       nzB = bi[row+1] - bi[row];
1777       ncols  = nzA + nzB;
1778       cworkB = bj + bi[row];
1779       vworkA = a_a + ai[row];
1780       vworkB = b_a + bi[row];
1781 
1782       /* load the column values for this row into vals*/
1783       vals = sbuf_aa_i + ct2;
1784 
1785       lwrite = 0;
1786       for (l=0; l<nzB; l++) {
1787         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1788       }
1789       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1790       for (l=0; l<nzB; l++) {
1791         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1792       }
1793 
1794       ct2 += ncols;
1795     }
1796     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1797   }
1798 
1799   /* Assemble submat */
1800   /* First assemble the local rows */
1801   for (j=0; j<nrow; j++) {
1802     row  = irow[j];
1803     proc = row2proc[j];
1804     if (proc == rank) {
1805       Crow = row - rstart;  /* local row index of C */
1806 #if defined(PETSC_USE_CTABLE)
1807       row = rmap_loc[Crow]; /* row index of submat */
1808 #else
1809       row = rmap[row];
1810 #endif
1811 
1812       if (allcolumns) {
1813         /* diagonal part A = c->A */
1814         ncols = ai[Crow+1] - ai[Crow];
1815         cols  = aj + ai[Crow];
1816         vals  = a->a + ai[Crow];
1817         i     = 0;
1818         for (k=0; k<ncols; k++) {
1819           subcols[i]   = cols[k] + cstart;
1820           subvals[i++] = vals[k];
1821         }
1822 
1823         /* off-diagonal part B = c->B */
1824         ncols = bi[Crow+1] - bi[Crow];
1825         cols  = bj + bi[Crow];
1826         vals  = b->a + bi[Crow];
1827         for (k=0; k<ncols; k++) {
1828           subcols[i]   = bmap[cols[k]];
1829           subvals[i++] = vals[k];
1830         }
1831 
1832         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1833 
1834       } else { /* !allcolumns */
1835 #if defined(PETSC_USE_CTABLE)
1836         /* diagonal part A = c->A */
1837         ncols = ai[Crow+1] - ai[Crow];
1838         cols  = aj + ai[Crow];
1839         vals  = a->a + ai[Crow];
1840         i     = 0;
1841         for (k=0; k<ncols; k++) {
1842           tcol = cmap_loc[cols[k]];
1843           if (tcol) {
1844             subcols[i]   = --tcol;
1845             subvals[i++] = vals[k];
1846           }
1847         }
1848 
1849         /* off-diagonal part B = c->B */
1850         ncols = bi[Crow+1] - bi[Crow];
1851         cols  = bj + bi[Crow];
1852         vals  = b->a + bi[Crow];
1853         for (k=0; k<ncols; k++) {
1854           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1855           if (tcol) {
1856             subcols[i]   = --tcol;
1857             subvals[i++] = vals[k];
1858           }
1859         }
1860 #else
1861         /* diagonal part A = c->A */
1862         ncols = ai[Crow+1] - ai[Crow];
1863         cols  = aj + ai[Crow];
1864         vals  = a->a + ai[Crow];
1865         i     = 0;
1866         for (k=0; k<ncols; k++) {
1867           tcol = cmap[cols[k]+cstart];
1868           if (tcol) {
1869             subcols[i]   = --tcol;
1870             subvals[i++] = vals[k];
1871           }
1872         }
1873 
1874         /* off-diagonal part B = c->B */
1875         ncols = bi[Crow+1] - bi[Crow];
1876         cols  = bj + bi[Crow];
1877         vals  = b->a + bi[Crow];
1878         for (k=0; k<ncols; k++) {
1879           tcol = cmap[bmap[cols[k]]];
1880           if (tcol) {
1881             subcols[i]   = --tcol;
1882             subvals[i++] = vals[k];
1883           }
1884         }
1885 #endif
1886         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1887       }
1888     }
1889   }
1890 
1891   /* Now assemble the off-proc rows */
1892   for (i=0; i<nrqs; i++) { /* for each requested message */
1893     /* recv values from other processes */
1894     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1895     proc    = pa[idex];
1896     sbuf1_i = sbuf1[proc];
1897     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1898     ct1     = 2 + 1;
1899     ct2     = 0; /* count of received C->j */
1900     ct3     = 0; /* count of received C->j that will be inserted into submat */
1901     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1902     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1903     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1904 
1905     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1906     max1 = sbuf1_i[2];             /* num of rows */
1907     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1908       row = sbuf1_i[ct1]; /* row index of submat */
1909       if (!allcolumns) {
1910         idex = 0;
1911         if (scall == MAT_INITIAL_MATRIX) {
1912           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1913           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1914 #if defined(PETSC_USE_CTABLE)
1915             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1916               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1917             } else {
1918               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1919             }
1920 #else
1921             tcol = cmap[rbuf3_i[ct2]];
1922 #endif
1923             if (tcol) {
1924               subcols[idex]   = --tcol;
1925               subvals[idex++] = rbuf4_i[ct2];
1926 
1927               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1928                For reuse, we replace received C->j with index that should be inserted to submat */
1929               rbuf3_i[ct3++] = ct2;
1930             }
1931           }
1932           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1933 
1934         } else { /* scall == MAT_REUSE_MATRIX */
1935           submat = submats[0];
1936           subc   = (Mat_SeqAIJ*)submat->data;
1937 
1938           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1939           for (l=0; l<nnz; l++) {
1940             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1941             subvals[idex++] = rbuf4_i[ct2];
1942           }
1943 
1944           bj = subc->j + subc->i[row];
1945           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1946         }
1947       } else { /* allcolumns */
1948         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1949         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1950         ct2 += nnz;
1951       }
1952     }
1953   }
1954 
1955   /* sending a->a are done */
1956   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1957   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1958 
1959   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1960   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1961   submats[0] = submat;
1962 
1963   /* Restore the indices */
1964   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1965   if (!allcolumns) {
1966     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1967   }
1968 
1969   /* Destroy allocated memory */
1970   for (i=0; i<nrqs; ++i) {
1971     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1972   }
1973   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1974   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1975   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1976 
1977   if (scall == MAT_INITIAL_MATRIX) {
1978     ierr = PetscFree(lens);CHKERRQ(ierr);
1979     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1980     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1981   }
1982   PetscFunctionReturn(0);
1983 }
1984 
1985 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1986 {
1987   PetscErrorCode ierr;
1988   PetscInt       ncol;
1989   PetscBool      colflag,allcolumns=PETSC_FALSE;
1990 
1991   PetscFunctionBegin;
1992   /* Allocate memory to hold all the submatrices */
1993   if (scall != MAT_REUSE_MATRIX) {
1994     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
1995   }
1996 
1997   /* Check for special case: each processor gets entire matrix columns */
1998   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1999   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2000   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2001 
2002   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2003   PetscFunctionReturn(0);
2004 }
2005 
2006 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2007 {
2008   PetscErrorCode ierr;
2009   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2010   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2011 
2012   PetscFunctionBegin;
2013 #if 0
2014   /* Check for special case: each processor gets entire matrix */
2015   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2016     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2017     PetscFunctionReturn(0);
2018   }
2019 #endif
2020 
2021   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2022     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2023     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2024     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2025     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2026     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2027       wantallmatrix = PETSC_TRUE;
2028 
2029       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2030     }
2031   }
2032   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2033   if (twantallmatrix) {
2034     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2035     PetscFunctionReturn(0);
2036   }
2037 
2038   /* Allocate memory to hold all the submatrices */
2039   if (scall != MAT_REUSE_MATRIX) {
2040     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2041   }
2042 
2043   /* Check for special case: each processor gets entire matrix columns */
2044   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2045   for (i=0; i<ismax; i++) {
2046     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2047     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2048     if (colflag && ncol == C->cmap->N) {
2049       allcolumns[i] = PETSC_TRUE;
2050     } else {
2051       allcolumns[i] = PETSC_FALSE;
2052     }
2053   }
2054 
2055   /* Determine the number of stages through which submatrices are done */
2056   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2057 
2058   /*
2059      Each stage will extract nmax submatrices.
2060      nmax is determined by the matrix column dimension.
2061      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2062   */
2063   if (!nmax) nmax = 1;
2064   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2065 
2066   /* Make sure every processor loops through the nstages */
2067   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2068 
2069   for (i=0,pos=0; i<nstages; i++) {
2070     if (pos+nmax <= ismax) max_no = nmax;
2071     else if (pos == ismax) max_no = 0;
2072     else                   max_no = ismax-pos;
2073     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2074     pos += max_no;
2075   }
2076 
2077   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2078   PetscFunctionReturn(0);
2079 }
2080 
2081 /* -------------------------------------------------------------------------*/
2082 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2083 {
2084   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2085   Mat            A  = c->A;
2086   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2087   const PetscInt **icol,**irow;
2088   PetscInt       *nrow,*ncol,start;
2089   PetscErrorCode ierr;
2090   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2091   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2092   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2093   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2094   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2095 #if defined(PETSC_USE_CTABLE)
2096   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2097 #else
2098   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2099 #endif
2100   const PetscInt *irow_i;
2101   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2102   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2103   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2104   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2105   MPI_Status     *r_status3,*r_status4,*s_status4;
2106   MPI_Comm       comm;
2107   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2108   PetscMPIInt    *onodes1,*olengths1;
2109   PetscMPIInt    idex,idex2,end;
2110   PetscInt       **row2proc,*row2proc_i;
2111   Mat_SubMat     **smats,*smat_i;
2112   PetscBool      *issorted;
2113 
2114   PetscFunctionBegin;
2115   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2116   tag0 = ((PetscObject)C)->tag;
2117   size = c->size;
2118   rank = c->rank;
2119   //printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d\n",rank,scall);
2120 
2121   /* Get some new tags to keep the communication clean */
2122   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2123   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2124   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2125 
2126   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2127   ierr = PetscMalloc1(ismax,&issorted);CHKERRQ(ierr);
2128 
2129   for (i=0; i<ismax; i++) {
2130     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2131 
2132     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2133     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2134     if (allcolumns[i]) {
2135       icol[i] = NULL;
2136       ncol[i] = C->cmap->N;
2137     } else {
2138       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2139       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2140     }
2141   }
2142 
2143 
2144   ierr = PetscMalloc1(ismax,&smats);CHKERRQ(ierr);
2145   ierr = PetscMalloc1(ismax,&row2proc);CHKERRQ(ierr);
2146 
2147   if (scall == MAT_REUSE_MATRIX) {
2148     //PetscInt  *req_source1,*req_source2;
2149 
2150     /* Assumes new rows are same length as the old rows,hence bug! */
2151     for (i=0; i<ismax; i++) {
2152       mat = (Mat_SeqAIJ*)(submats[i]->data);
2153       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2154 
2155       /* Initial matrix as if empty -- rm later!!! */
2156       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2157 
2158       /* Initial matrix as if empty */
2159       submats[i]->factortype = C->factortype;
2160 
2161       smat_i   = mat->submatis1;
2162       smats[i] = smat_i;
2163 
2164       nrqs        = smat_i->nrqs;
2165       nrqr        = smat_i->nrqr;
2166       rbuf1       = smat_i->rbuf1;
2167       rbuf2       = smat_i->rbuf2;
2168       rbuf3       = smat_i->rbuf3;
2169       //req_source2 = smat_i->req_source2;
2170 
2171       sbuf1     = smat_i->sbuf1;
2172       sbuf2     = smat_i->sbuf2;
2173       ptr       = smat_i->ptr;
2174       tmp       = smat_i->tmp;
2175       ctr       = smat_i->ctr;
2176 
2177       pa          = smat_i->pa;
2178       req_size    = smat_i->req_size;
2179       //req_source1 = smat_i->req_source1;
2180 
2181       allcolumns[i] = smat_i->allcolumns;
2182       row2proc[i]   = smat_i->row2proc;
2183       //rmap       = smat_i->rmap;
2184       //cmap       = smat_i->cmap;
2185 #if defined(PETSC_USE_CTABLE)
2186       //rmap_loc   = smat_i->rmap_loc;
2187       //cmap_loc   = smat_i->cmap_loc;
2188 #endif
2189     }
2190   }
2191 
2192   /* evaluate communication - mesg to who, length of mesg, and buffer space
2193      required. Based on this, buffers are allocated, and data copied into them*/
2194   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2195   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2196   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2197   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2198 
2199   for (i=0; i<ismax; i++) {
2200     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2201     jmax   = nrow[i];
2202     irow_i = irow[i];
2203 
2204     if (scall == MAT_INITIAL_MATRIX) {
2205       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2206       row2proc[i] = row2proc_i;
2207     } else {
2208       row2proc_i = row2proc[i];
2209     }
2210 
2211     if (issorted[i]) proc = 0;
2212     for (j=0; j<jmax; j++) {
2213       if (!issorted[i]) proc = 0;
2214       row = irow_i[j];
2215       while (row >= C->rmap->range[proc+1]) proc++;
2216       w4[proc]++;
2217       row2proc_i[j] = proc; /* map row index to proc */
2218     }
2219     for (j=0; j<size; j++) {
2220       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2221     }
2222   }
2223 
2224   nrqs     = 0;              /* no of outgoing messages */
2225   msz      = 0;              /* total mesg length (for all procs) */
2226   w1[rank] = 0;              /* no mesg sent to self */
2227   w3[rank] = 0;
2228   for (i=0; i<size; i++) {
2229     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2230   }
2231   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2232   for (i=0,j=0; i<size; i++) {
2233     if (w1[i]) { pa[j] = i; j++; }
2234   }
2235 
2236   /* Each message would have a header = 1 + 2*(no of IS) + data */
2237   for (i=0; i<nrqs; i++) {
2238     j      = pa[i];
2239     w1[j] += w2[j] + 2* w3[j];
2240     msz   += w1[j];
2241   }
2242   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2243 
2244   /* Determine the number of messages to expect, their lengths, from from-ids */
2245   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2246   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2247 
2248   /* Now post the Irecvs corresponding to these messages */
2249   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2250 
2251   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2252   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2253 
2254   /* Allocate Memory for outgoing messages */
2255   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2256   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2257   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2258 
2259   {
2260     PetscInt *iptr = tmp;
2261     k    = 0;
2262     for (i=0; i<nrqs; i++) {
2263       j        = pa[i];
2264       iptr    += k;
2265       sbuf1[j] = iptr;
2266       k        = w1[j];
2267     }
2268   }
2269 
2270   /* Form the outgoing messages */
2271   /* Initialize the header space */
2272   for (i=0; i<nrqs; i++) {
2273     j           = pa[i];
2274     sbuf1[j][0] = 0;
2275     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2276     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2277   }
2278 
2279   /* Parse the isrow and copy data into outbuf */
2280   for (i=0; i<ismax; i++) {
2281     row2proc_i = row2proc[i];
2282     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2283     irow_i = irow[i];
2284     jmax   = nrow[i];
2285     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2286       proc = row2proc_i[j];
2287       if (proc != rank) { /* copy to the outgoing buf*/
2288         ctr[proc]++;
2289         *ptr[proc] = irow_i[j];
2290         ptr[proc]++;
2291       }
2292     }
2293     /* Update the headers for the current IS */
2294     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2295       if ((ctr_j = ctr[j])) {
2296         sbuf1_j        = sbuf1[j];
2297         k              = ++sbuf1_j[0];
2298         sbuf1_j[2*k]   = ctr_j;
2299         sbuf1_j[2*k-1] = i;
2300       }
2301     }
2302   }
2303 
2304   /*  Now  post the sends */
2305   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2306   for (i=0; i<nrqs; ++i) {
2307     j    = pa[i];
2308     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2309   }
2310 
2311   /* Post Receives to capture the buffer size */
2312   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2313   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2314   rbuf2[0] = tmp + msz;
2315   for (i=1; i<nrqs; ++i) {
2316     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2317   }
2318   for (i=0; i<nrqs; ++i) {
2319     j    = pa[i];
2320     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2321   }
2322 
2323   /* Send to other procs the buf size they should allocate */
2324 
2325 
2326   /* Receive messages*/
2327   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2328   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2329   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2330   {
2331     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2332     PetscInt   *sbuf2_i;
2333 
2334     for (i=0; i<nrqr; ++i) {
2335       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2336 
2337       req_size[idex] = 0;
2338       rbuf1_i        = rbuf1[idex];
2339       start          = 2*rbuf1_i[0] + 1;
2340       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2341       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2342       sbuf2_i        = sbuf2[idex];
2343       for (j=start; j<end; j++) {
2344         id              = rbuf1_i[j] - rstart;
2345         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2346         sbuf2_i[j]      = ncols;
2347         req_size[idex] += ncols;
2348       }
2349       req_source[idex] = r_status1[i].MPI_SOURCE;
2350       /* form the header */
2351       sbuf2_i[0] = req_size[idex];
2352       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2353 
2354       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2355     }
2356   }
2357   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2358   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2359 
2360   /*  recv buffer sizes */
2361   /* Receive messages*/
2362 
2363   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2364   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2365   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2366   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2367   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2368 
2369   for (i=0; i<nrqs; ++i) {
2370     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2371     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2372     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2373     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2374     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2375   }
2376   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2377   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2378 
2379   /* Wait on sends1 and sends2 */
2380   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2381   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2382 
2383   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2384   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2385   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2386   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2387   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2388   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2389 
2390   /* Now allocate buffers for a->j, and send them off */
2391   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2392   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2393   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2394   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2395 
2396   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2397   {
2398     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2399     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2400     PetscInt cend = C->cmap->rend;
2401     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2402 
2403     for (i=0; i<nrqr; i++) {
2404       rbuf1_i   = rbuf1[i];
2405       sbuf_aj_i = sbuf_aj[i];
2406       ct1       = 2*rbuf1_i[0] + 1;
2407       ct2       = 0;
2408       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2409         kmax = rbuf1[i][2*j];
2410         for (k=0; k<kmax; k++,ct1++) {
2411           row    = rbuf1_i[ct1] - rstart;
2412           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2413           ncols  = nzA + nzB;
2414           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2415 
2416           /* load the column indices for this row into cols*/
2417           cols = sbuf_aj_i + ct2;
2418 
2419           lwrite = 0;
2420           for (l=0; l<nzB; l++) {
2421             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2422           }
2423           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2424           for (l=0; l<nzB; l++) {
2425             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2426           }
2427 
2428           ct2 += ncols;
2429         }
2430       }
2431       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2432     }
2433   }
2434   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2435   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2436 
2437   /* Allocate buffers for a->a, and send them off */
2438   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2439   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2440   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2441   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2442 
2443   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2444   {
2445     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2446     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2447     PetscInt    cend   = C->cmap->rend;
2448     PetscInt    *b_j   = b->j;
2449     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2450 
2451     for (i=0; i<nrqr; i++) {
2452       rbuf1_i   = rbuf1[i];
2453       sbuf_aa_i = sbuf_aa[i];
2454       ct1       = 2*rbuf1_i[0]+1;
2455       ct2       = 0;
2456       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2457         kmax = rbuf1_i[2*j];
2458         for (k=0; k<kmax; k++,ct1++) {
2459           row    = rbuf1_i[ct1] - rstart;
2460           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2461           ncols  = nzA + nzB;
2462           cworkB = b_j + b_i[row];
2463           vworkA = a_a + a_i[row];
2464           vworkB = b_a + b_i[row];
2465 
2466           /* load the column values for this row into vals*/
2467           vals = sbuf_aa_i+ct2;
2468 
2469           lwrite = 0;
2470           for (l=0; l<nzB; l++) {
2471             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2472           }
2473           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2474           for (l=0; l<nzB; l++) {
2475             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2476           }
2477 
2478           ct2 += ncols;
2479         }
2480       }
2481       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2482     }
2483   }
2484   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2485   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2486   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2487   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2488 
2489   /* Form the matrix */
2490   /* create col map: global col of C -> local col of submatrices */
2491   {
2492     const PetscInt *icol_i;
2493 #if defined(PETSC_USE_CTABLE)
2494     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2495     for (i=0; i<ismax; i++) {
2496       if (!allcolumns[i]) {
2497         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2498 
2499         jmax   = ncol[i];
2500         icol_i = icol[i];
2501         cmap_i = cmap[i];
2502         for (j=0; j<jmax; j++) {
2503           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2504         }
2505       } else {
2506         cmap[i] = NULL;
2507       }
2508     }
2509 #else
2510     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2511     for (i=0; i<ismax; i++) {
2512       if (!allcolumns[i]) {
2513         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2514         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2515         jmax   = ncol[i];
2516         icol_i = icol[i];
2517         cmap_i = cmap[i];
2518         for (j=0; j<jmax; j++) {
2519           cmap_i[icol_i[j]] = j+1;
2520         }
2521       } else {
2522         cmap[i] = NULL;
2523       }
2524     }
2525 #endif
2526   }
2527 
2528   /* Create lens which is required for MatCreate... */
2529   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2530   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2531   if (ismax) {
2532     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2533     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2534   }
2535   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2536 
2537   /* Update lens from local data */
2538   for (i=0; i<ismax; i++) {
2539     row2proc_i = row2proc[i];
2540     jmax = nrow[i];
2541     if (!allcolumns[i]) cmap_i = cmap[i];
2542     irow_i = irow[i];
2543     lens_i = lens[i];
2544     for (j=0; j<jmax; j++) {
2545       row = irow_i[j];
2546       proc = row2proc_i[j];
2547       if (proc == rank) {
2548         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2549         if (!allcolumns[i]) {
2550           for (k=0; k<ncols; k++) {
2551 #if defined(PETSC_USE_CTABLE)
2552             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2553 #else
2554             tcol = cmap_i[cols[k]];
2555 #endif
2556             if (tcol) lens_i[j]++;
2557           }
2558         } else { /* allcolumns */
2559           lens_i[j] = ncols;
2560         }
2561         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2562       }
2563     }
2564   }
2565 
2566   /* Create row map: global row of C -> local row of submatrices */
2567 #if defined(PETSC_USE_CTABLE)
2568   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2569   for (i=0; i<ismax; i++) {
2570     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2571     irow_i = irow[i];
2572     jmax   = nrow[i];
2573     for (j=0; j<jmax; j++) {
2574       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2575     }
2576   }
2577 #else
2578   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2579   if (ismax) {
2580     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2581     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2582   }
2583   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2584   for (i=0; i<ismax; i++) {
2585     rmap_i = rmap[i];
2586     irow_i = irow[i];
2587     jmax   = nrow[i];
2588     for (j=0; j<jmax; j++) {
2589       rmap_i[irow_i[j]] = j;
2590     }
2591   }
2592 #endif
2593 
2594   /* Update lens from offproc data */
2595   {
2596     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2597 
2598     for (tmp2=0; tmp2<nrqs; tmp2++) {
2599       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2600       idex    = pa[idex2];
2601       sbuf1_i = sbuf1[idex];
2602       jmax    = sbuf1_i[0];
2603       ct1     = 2*jmax+1;
2604       ct2     = 0;
2605       rbuf2_i = rbuf2[idex2];
2606       rbuf3_i = rbuf3[idex2];
2607       for (j=1; j<=jmax; j++) {
2608         is_no  = sbuf1_i[2*j-1];
2609         max1   = sbuf1_i[2*j];
2610         lens_i = lens[is_no];
2611         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2612         rmap_i = rmap[is_no];
2613         for (k=0; k<max1; k++,ct1++) {
2614 #if defined(PETSC_USE_CTABLE)
2615           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2616           row--;
2617           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2618 #else
2619           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2620 #endif
2621           max2 = rbuf2_i[ct1];
2622           for (l=0; l<max2; l++,ct2++) {
2623             if (!allcolumns[is_no]) {
2624 #if defined(PETSC_USE_CTABLE)
2625               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2626 #else
2627               tcol = cmap_i[rbuf3_i[ct2]];
2628 #endif
2629               if (tcol) lens_i[row]++;
2630             } else { /* allcolumns */
2631               lens_i[row]++; /* lens_i[row] += max2 ? */
2632             }
2633           }
2634         }
2635       }
2636     }
2637   }
2638   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2639   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2640   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2641   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2642   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2643 
2644   /* Create the submatrices */
2645   if (scall == MAT_INITIAL_MATRIX) {
2646     for (i=0; i<ismax; i++) {
2647       PetscInt    rbs,cbs;
2648       Mat_SeqAIJ* subc;
2649 
2650       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2651       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2652 
2653       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2654       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2655 
2656       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2657       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2658       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2659 
2660       /* create struct Mat_SubMat and attached it to submat */
2661       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2662       subc = (Mat_SeqAIJ*)submats[i]->data;
2663       subc->submatis1 = smat_i;
2664       smats[i]        = smat_i;
2665 
2666       smat_i->destroy          = submats[i]->ops->destroy;
2667       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2668       submats[i]->factortype   = C->factortype;
2669 
2670       smat_i->nrqs        = 0;//nrqs;
2671       smat_i->nrqr        = 0;//nrqr;
2672       smat_i->rbuf1       = NULL;//rbuf1;
2673       smat_i->rbuf2       = NULL;//rbuf2;
2674       smat_i->rbuf3       = NULL;//rbuf3;
2675       smat_i->sbuf2       = NULL;//sbuf2;
2676       smat_i->req_source2 = NULL;//req_source2;
2677 
2678       smat_i->sbuf1       = NULL;//sbuf1;
2679       smat_i->ptr         = NULL;//ptr;
2680       smat_i->tmp         = NULL;//tmp;
2681       smat_i->ctr         = NULL;//ctr;
2682 
2683       smat_i->pa           = NULL;//pa;
2684       smat_i->req_size     = NULL;//req_size;
2685       smat_i->req_source1  = NULL;//req_source1;
2686 
2687       smat_i->allcolumns  = allcolumns[i];
2688       smat_i->row2proc    = row2proc[i];
2689       smat_i->rmap        = NULL;//rmap;
2690       smat_i->cmap        = NULL;//cmap;
2691 #if defined(PETSC_USE_CTABLE)
2692       smat_i->rmap_loc    = NULL;//rmap_loc;
2693       smat_i->cmap_loc    = NULL;//cmap_loc;
2694 #endif
2695 
2696     }
2697   }
2698 
2699   /* Assemble the matrices */
2700   /* First assemble the local rows */
2701   {
2702     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2703     PetscScalar *imat_a;
2704 
2705     for (i=0; i<ismax; i++) {
2706       row2proc_i = row2proc[i];
2707       mat       = (Mat_SeqAIJ*)submats[i]->data;
2708       imat_ilen = mat->ilen;
2709       imat_j    = mat->j;
2710       imat_i    = mat->i;
2711       imat_a    = mat->a;
2712 
2713       if (!allcolumns[i]) cmap_i = cmap[i];
2714       rmap_i = rmap[i];
2715       irow_i = irow[i];
2716       jmax   = nrow[i];
2717       for (j=0; j<jmax; j++) {
2718         row  = irow_i[j];
2719         proc = row2proc_i[j];
2720         if (proc == rank) {
2721           old_row = row;
2722 #if defined(PETSC_USE_CTABLE)
2723           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2724           row--;
2725 #else
2726           row = rmap_i[row];
2727 #endif
2728           ilen_row = imat_ilen[row];
2729           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2730           mat_i    = imat_i[row];
2731           mat_a    = imat_a + mat_i;
2732           mat_j    = imat_j + mat_i;
2733           if (!allcolumns[i]) {
2734             for (k=0; k<ncols; k++) {
2735 #if defined(PETSC_USE_CTABLE)
2736               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2737 #else
2738               tcol = cmap_i[cols[k]];
2739 #endif
2740               if (tcol) {
2741                 *mat_j++ = tcol - 1;
2742                 *mat_a++ = vals[k];
2743                 ilen_row++;
2744               }
2745             }
2746           } else { /* allcolumns */
2747             for (k=0; k<ncols; k++) {
2748               *mat_j++ = cols[k];  /* global col index! */
2749               *mat_a++ = vals[k];
2750               ilen_row++;
2751             }
2752           }
2753           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2754 
2755           imat_ilen[row] = ilen_row;
2756         }
2757       }
2758     }
2759   }
2760 
2761   /*   Now assemble the off proc rows*/
2762   {
2763     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2764     PetscInt    *imat_j,*imat_i;
2765     PetscScalar *imat_a,*rbuf4_i;
2766 
2767     for (tmp2=0; tmp2<nrqs; tmp2++) {
2768       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2769       idex    = pa[idex2];
2770       sbuf1_i = sbuf1[idex];
2771       jmax    = sbuf1_i[0];
2772       ct1     = 2*jmax + 1;
2773       ct2     = 0;
2774       rbuf2_i = rbuf2[idex2];
2775       rbuf3_i = rbuf3[idex2];
2776       rbuf4_i = rbuf4[idex2];
2777       for (j=1; j<=jmax; j++) {
2778         is_no     = sbuf1_i[2*j-1];
2779         rmap_i    = rmap[is_no];
2780         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2781         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2782         imat_ilen = mat->ilen;
2783         imat_j    = mat->j;
2784         imat_i    = mat->i;
2785         imat_a    = mat->a;
2786         max1      = sbuf1_i[2*j];
2787         for (k=0; k<max1; k++,ct1++) {
2788           row = sbuf1_i[ct1];
2789 #if defined(PETSC_USE_CTABLE)
2790           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2791           row--;
2792 #else
2793           row = rmap_i[row];
2794 #endif
2795           ilen  = imat_ilen[row];
2796           mat_i = imat_i[row];
2797           mat_a = imat_a + mat_i;
2798           mat_j = imat_j + mat_i;
2799           max2  = rbuf2_i[ct1];
2800           if (!allcolumns[is_no]) {
2801             for (l=0; l<max2; l++,ct2++) {
2802 
2803 #if defined(PETSC_USE_CTABLE)
2804               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2805 #else
2806               tcol = cmap_i[rbuf3_i[ct2]];
2807 #endif
2808               if (tcol) {
2809                 *mat_j++ = tcol - 1;
2810                 *mat_a++ = rbuf4_i[ct2];
2811                 ilen++;
2812               }
2813             }
2814           } else { /* allcolumns */
2815             for (l=0; l<max2; l++,ct2++) {
2816               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2817               *mat_a++ = rbuf4_i[ct2];
2818               ilen++;
2819             }
2820           }
2821           imat_ilen[row] = ilen;
2822         }
2823       }
2824     }
2825   }
2826 
2827   /* sort the rows */
2828   {
2829     PetscInt    *imat_ilen,*imat_j,*imat_i;
2830     PetscScalar *imat_a;
2831 
2832     for (i=0; i<ismax; i++) {
2833       mat       = (Mat_SeqAIJ*)submats[i]->data;
2834       imat_j    = mat->j;
2835       imat_i    = mat->i;
2836       imat_a    = mat->a;
2837       imat_ilen = mat->ilen;
2838 
2839       if (allcolumns[i]) continue;
2840       jmax = nrow[i];
2841       for (j=0; j<jmax; j++) {
2842         PetscInt ilen;
2843 
2844         mat_i = imat_i[j];
2845         mat_a = imat_a + mat_i;
2846         mat_j = imat_j + mat_i;
2847         ilen  = imat_ilen[j];
2848         ierr  = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2849       }
2850     }
2851   }
2852 
2853   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2854   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2855   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2856   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2857   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2858 
2859   /* Restore the indices */
2860   for (i=0; i<ismax; i++) {
2861     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2862     if (!allcolumns[i]) {
2863       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2864     }
2865   }
2866 
2867   /* Destroy allocated memory */
2868   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2869   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2870   ierr = PetscFree(pa);CHKERRQ(ierr);
2871 
2872   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2873   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2874   for (i=0; i<nrqr; ++i) {
2875     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2876   }
2877   for (i=0; i<nrqs; ++i) {
2878     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2879     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2880   }
2881 
2882   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2883   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2884   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2885   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2886   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2887   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2888   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2889 
2890 #if defined(PETSC_USE_CTABLE)
2891   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2892 #else
2893   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2894 #endif
2895   ierr = PetscFree(rmap);CHKERRQ(ierr);
2896 
2897   for (i=0; i<ismax; i++) {
2898     if (!allcolumns[i]) {
2899 #if defined(PETSC_USE_CTABLE)
2900       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2901 #else
2902       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2903 #endif
2904     }
2905   }
2906   ierr = PetscFree(cmap);CHKERRQ(ierr);
2907   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2908   ierr = PetscFree(lens);CHKERRQ(ierr);
2909 
2910   for (i=0; i<ismax; i++) {
2911     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2912     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2913   }
2914 
2915   ierr = PetscFree(issorted);CHKERRQ(ierr);
2916   ierr = PetscFree(row2proc);CHKERRQ(ierr);
2917   ierr = PetscFree(smats);CHKERRQ(ierr);
2918   PetscFunctionReturn(0);
2919 }
2920 
2921 /*
2922  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2923  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2925  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2926  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2927  state, and needs to be "assembled" later by compressing B's column space.
2928 
2929  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2930  Following this call, C->A & C->B have been created, even if empty.
2931  */
2932 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2933 {
2934   /* If making this function public, change the error returned in this function away from _PLIB. */
2935   PetscErrorCode ierr;
2936   Mat_MPIAIJ     *aij;
2937   Mat_SeqAIJ     *Baij;
2938   PetscBool      seqaij,Bdisassembled;
2939   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2940   PetscScalar    v;
2941   const PetscInt *rowindices,*colindices;
2942 
2943   PetscFunctionBegin;
2944   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2945   if (A) {
2946     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2947     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2948     if (rowemb) {
2949       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2950       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2951     } else {
2952       if (C->rmap->n != A->rmap->n) {
2953 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2954       }
2955     }
2956     if (dcolemb) {
2957       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2958       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2959     } else {
2960       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2961     }
2962   }
2963   if (B) {
2964     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2965     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2966     if (rowemb) {
2967       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2968       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2969     } else {
2970       if (C->rmap->n != B->rmap->n) {
2971 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2972       }
2973     }
2974     if (ocolemb) {
2975       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2976       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2977     } else {
2978       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2979     }
2980   }
2981 
2982   aij    = (Mat_MPIAIJ*)(C->data);
2983   if (!aij->A) {
2984     /* Mimic parts of MatMPIAIJSetPreallocation() */
2985     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2986     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2987     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2988     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2989     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2990   }
2991   if (A) {
2992     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2993   } else {
2994     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2995   }
2996   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2997     /*
2998       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2999       need to "disassemble" B -- convert it to using C's global indices.
3000       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
3001 
3002       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
3003       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
3004 
3005       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
3006       At least avoid calling MatSetValues() and the implied searches?
3007     */
3008 
3009     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
3010 #if defined(PETSC_USE_CTABLE)
3011       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
3012 #else
3013       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
3014       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
3015       if (aij->B) {
3016         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
3017       }
3018 #endif
3019       ngcol = 0;
3020       if (aij->lvec) {
3021 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
3022       }
3023       if (aij->garray) {
3024 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
3025 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
3026       }
3027       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3028       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3029     }
3030     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3031       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3032     }
3033     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3034       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3035     }
3036   }
3037   Bdisassembled = PETSC_FALSE;
3038   if (!aij->B) {
3039     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3040     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3041     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3042     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3043     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3044     Bdisassembled = PETSC_TRUE;
3045   }
3046   if (B) {
3047     Baij = (Mat_SeqAIJ*)(B->data);
3048     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3049       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3050       for (i=0; i<B->rmap->n; i++) {
3051 	nz[i] = Baij->i[i+1] - Baij->i[i];
3052       }
3053       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3054       ierr = PetscFree(nz);CHKERRQ(ierr);
3055     }
3056 
3057     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3058     shift = rend-rstart;
3059     count = 0;
3060     rowindices = NULL;
3061     colindices = NULL;
3062     if (rowemb) {
3063       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3064     }
3065     if (ocolemb) {
3066       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3067     }
3068     for (i=0; i<B->rmap->n; i++) {
3069       PetscInt row;
3070       row = i;
3071       if (rowindices) row = rowindices[i];
3072       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3073 	col  = Baij->j[count];
3074 	if (colindices) col = colindices[col];
3075 	if (Bdisassembled && col>=rstart) col += shift;
3076 	v    = Baij->a[count];
3077 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3078 	++count;
3079       }
3080     }
3081     /* No assembly for aij->B is necessary. */
3082     /* FIXME: set aij->B's nonzerostate correctly. */
3083   } else {
3084     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3085   }
3086   C->preallocated  = PETSC_TRUE;
3087   C->was_assembled = PETSC_FALSE;
3088   C->assembled     = PETSC_FALSE;
3089    /*
3090       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3091       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3092    */
3093   PetscFunctionReturn(0);
3094 }
3095 
3096 /*
3097   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3098  */
3099 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3100 {
3101   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3102 
3103   PetscFunctionBegin;
3104   PetscValidPointer(A,2);
3105   PetscValidPointer(B,3);
3106   /* FIXME: make sure C is assembled */
3107   *A = aij->A;
3108   *B = aij->B;
3109   /* Note that we don't incref *A and *B, so be careful! */
3110   PetscFunctionReturn(0);
3111 }
3112 
3113 /*
3114   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3115   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3116 */
3117 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3118                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3119 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3120 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3121 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3122 {
3123   PetscErrorCode ierr;
3124   PetscMPIInt    isize,flag;
3125   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3126   Mat            *A,*B;
3127   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3128 
3129   PetscFunctionBegin;
3130   if (!ismax) PetscFunctionReturn(0);
3131 
3132   for (i = 0, cismax = 0; i < ismax; ++i) {
3133     PetscMPIInt isize;
3134     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3135     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3136     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3137     if (isize > 1) ++cismax;
3138   }
3139   /*
3140      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3141      ispar counts the number of parallel ISs across C's comm.
3142   */
3143   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3144   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3145     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3146     PetscFunctionReturn(0);
3147   }
3148 
3149   /* if (ispar) */
3150   /*
3151     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3152     These are used to extract the off-diag portion of the resulting parallel matrix.
3153     The row IS for the off-diag portion is the same as for the diag portion,
3154     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3155   */
3156   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3157   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3158   for (i = 0, ii = 0; i < ismax; ++i) {
3159     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3160     if (isize > 1) {
3161       /*
3162 	 TODO: This is the part that's ***NOT SCALABLE***.
3163 	 To fix this we need to extract just the indices of C's nonzero columns
3164 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3165 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3166 	 to be done without serializing on the IS list, so, most likely, it is best
3167 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3168       */
3169       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3170       /* Now we have to
3171 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3172 	     were sorted on each rank, concatenated they might no longer be sorted;
3173 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3174 	     indices in the nondecreasing order to the original index positions.
3175 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3176       */
3177       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3178       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3179       ++ii;
3180     }
3181   }
3182   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3183   for (i = 0, ii = 0; i < ismax; ++i) {
3184     PetscInt       j,issize;
3185     const PetscInt *indices;
3186 
3187     /*
3188        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3189      */
3190     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3191     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3192     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3193     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3194     for (j = 1; j < issize; ++j) {
3195       if (indices[j] == indices[j-1]) {
3196 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3197       }
3198     }
3199     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3200 
3201 
3202     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3203     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3204     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3205     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3206     for (j = 1; j < issize; ++j) {
3207       if (indices[j-1] == indices[j]) {
3208 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3209       }
3210     }
3211     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3212     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3213     if (isize > 1) {
3214       cisrow[ii] = isrow[i];
3215       ++ii;
3216     }
3217   }
3218   /*
3219     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3220     array of sequential matrices underlying the resulting parallel matrices.
3221     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3222     contain duplicates.
3223 
3224     There are as many diag matrices as there are original index sets. There are only as many parallel
3225     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3226 
3227     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3228     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3229       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3230       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3231       with A[i] and B[ii] extracted from the corresponding MPI submat.
3232     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3233       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3234       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3235       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3236       values into A[i] and B[ii] sitting inside the corresponding submat.
3237     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3238       A[i], B[ii], AA[i] or BB[ii] matrices.
3239   */
3240   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3241   if (scall == MAT_INITIAL_MATRIX) {
3242     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3243     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3244   } else {
3245     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3246     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3247     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
3248     for (i = 0, ii = 0; i < ismax; ++i) {
3249       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3250       if (isize > 1) {
3251 	Mat AA,BB;
3252 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3253 	if (!isrow_p[i] && !iscol_p[i]) {
3254 	  A[i] = AA;
3255 	} else {
3256 	  /* TODO: extract A[i] composed on AA. */
3257 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3258 	}
3259 	if (!isrow_p[i] && !ciscol_p[ii]) {
3260 	  B[ii] = BB;
3261 	} else {
3262 	  /* TODO: extract B[ii] composed on BB. */
3263 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3264 	}
3265 	++ii;
3266       } else {
3267 	if (!isrow_p[i] && !iscol_p[i]) {
3268 	  A[i] = (*submat)[i];
3269 	} else {
3270 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3271 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3272 	}
3273       }
3274     }
3275   }
3276   /* Now obtain the sequential A and B submatrices separately. */
3277   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3278   /* I did not figure out a good way to fix it right now.
3279    * Local column size of B[i] is different from the size of ciscol[i].
3280    * B[i]'s size is finally determined by MatAssembly, while
3281    * ciscol[i] is computed as the complement of iscol[i].
3282    * It is better to keep only nonzero indices when computing
3283    * the complement ciscol[i].
3284    * */
3285   if(scall==MAT_REUSE_MATRIX){
3286 	for(i=0; i<cismax; i++){
3287 	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3288 	  B[i]->cmap->n = ciscol_localsize;
3289 	}
3290   }
3291   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3292   /*
3293     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3294     matrices A & B have been extracted directly into the parallel matrices containing them, or
3295     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3296     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3297     to have the same sparsity pattern.
3298     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3299     must be constructed for C. This is done by setseqmat(s).
3300   */
3301   for (i = 0, ii = 0; i < ismax; ++i) {
3302     /*
3303        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3304        That way we can avoid sorting and computing permutations when reusing.
3305        To this end:
3306         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3307 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3308 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3309     */
3310     MatStructure pattern;
3311     if (scall == MAT_INITIAL_MATRIX) {
3312       pattern = DIFFERENT_NONZERO_PATTERN;
3313     } else {
3314       pattern = SAME_NONZERO_PATTERN;
3315     }
3316     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3317     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3318     if (isize > 1) {
3319       if (scall == MAT_INITIAL_MATRIX) {
3320 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3321 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3322 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3323 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3324 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3325       }
3326       /*
3327 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3328       */
3329       {
3330 	Mat AA,BB;
3331 	AA = NULL;
3332 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3333 	BB = NULL;
3334 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3335 	if (AA || BB) {
3336 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3337 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3338 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3339 	}
3340 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3341 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3342 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3343 	}
3344 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3345 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
3346 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3347 	}
3348       }
3349       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3350       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3351       ++ii;
3352     } else { /* if (isize == 1) */
3353       if (scall == MAT_INITIAL_MATRIX) {
3354 	if (isrow_p[i] || iscol_p[i]) {
3355 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3356 	} else (*submat)[i] = A[i];
3357       }
3358       if (isrow_p[i] || iscol_p[i]) {
3359 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3360 	/* Otherwise A is extracted straight into (*submats)[i]. */
3361 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3362 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3363       }
3364     }
3365     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3366     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3367   }
3368   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3369   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3370   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3371   ierr = PetscFree(A);CHKERRQ(ierr);
3372   ierr = PetscFree(B);CHKERRQ(ierr);
3373   PetscFunctionReturn(0);
3374 }
3375 
3376 
3377 
3378 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3379 {
3380   PetscErrorCode ierr;
3381 
3382   PetscFunctionBegin;
3383   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3384   PetscFunctionReturn(0);
3385 }
3386