xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 580bdb303e1ee3b1222b2042810b4c26340259c6)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
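
/*
   Usage sketch (illustrative only, not part of this file's implementation): the routines
   below provide the MATMPIAIJ back end of the public MatIncreaseOverlap() interface,
   which enlarges each index set by ov levels of overlap.  Assuming A is an assembled
   MATMPIAIJ matrix and is[] holds the caller's index sets of locally selected rows,
   a typical call looks like

       PetscErrorCode ierr;
       ierr = MatIncreaseOverlap(A,2,is,1);CHKERRQ(ierr);   (grow is[0] and is[1] by one level)

   The *_Scalable routines below are a more memory-scalable implementation of the same
   operation.
*/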
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put it all together so that we
69    * can optimize communication
70    */
71   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++) {
82     length_i  = length[i];
83     indices_i = indices[i];
84     for (j=0; j<length_i; j++) {
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank) {
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages
99    * generally speaking, we do not need to exchange
100    * data when overlap is 1
101    */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages;
104    * this usually corresponds to overlap 1
105    */
106   if (!reducednrrows) {
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communication */
114   for (i=0; i<size; i++) {
115     if (tosizes_temp[i]) {
116       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117       tosizes_temp[i] = nto; /* a map from processor to index */
118       toranks[nto++]  = i; /* processor */
119     }
120   }
121   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   for (i=0; i<nto; i++) {
123     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125   }
126   /* send information to other processors */
127   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
128   nrecvrows = 0;
129   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
131   nrecvrows = 0;
132   for (i=0; i<nfrom; i++) {
133     for (j=0; j<fromsizes[2*i]; j++) {
134       remote[nrecvrows].rank    = fromranks[i];
135       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136     }
137   }
138   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
139   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
140   /* use two-sided communication by default since Open MPI has some bugs in its one-sided support */
141   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
142   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
143   /* message pair <IS id, row> */
144   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
145   for (i=0; i<nrrows; i++) {
146     owner = rrow_ranks[i]; /* processor */
147     j     = tosizes_temp[owner]; /* index */
148     todata[toffsets[j]++] = rrow_isids[i];
149     todata[toffsets[j]++] = remoterows[i];
150   }
151   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
152   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
153   ierr = PetscFree(toffsets);CHKERRQ(ierr);
154   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
155   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
157   /* send rows that belong to remote processors so that we can get the overlapping data back */
158   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
159   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
160   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
161   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
162   ierr = PetscFree(fromranks);CHKERRQ(ierr);
163   nrecvrows = 0;
164   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
166   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
167   nrecvrows = 0;
168   for (i=0; i<nto; i++) {
169     for (j=0; j<tosizes[2*i]; j++) {
170       remote[nrecvrows].rank    = toranks[i];
171       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172     }
173   }
174   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
175   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
176   /* use two-sided communication by default since Open MPI has some bugs in its one-sided support */
177   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
178   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
179   /* overlap communication and computation */
180   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
181   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
182   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
183   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
184   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
185   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
186   ierr = PetscFree(toranks);CHKERRQ(ierr);
187   ierr = PetscFree(tosizes);CHKERRQ(ierr);
188   ierr = PetscFree(todata);CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   MPI_Comm         *iscomms;
198   PetscErrorCode    ierr;
199 
200   PetscFunctionBegin;
201   max_lsize = 0;
202   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
203   for (i=0; i<nidx; i++){
204     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
205     max_lsize = lsize>max_lsize ? lsize:max_lsize;
206     isz[i]    = lsize;
207   }
208   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
209   for (i=0; i<nidx; i++){
210     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
211     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
213     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
214     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
215   }
216   /* retrieve information to get row id and its overlap */
217   for (i=0; i<nrecvs; ){
218     is_id     = recvdata[i++];
219     data_size = recvdata[i++];
220     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
221     isz_i     = isz[is_id];
222     for (j=0; j< data_size; j++){
223       col = recvdata[i++];
224       indices_i[isz_i++] = col;
225     }
226     isz[is_id] = isz_i;
227   }
228   /* remove duplicate entries */
229   for (i=0; i<nidx; i++){
230     indices_i = indices_temp+(max_lsize+nrecvs)*i;
231     isz_i     = isz[i];
232     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
233     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
234     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
235   }
236   ierr = PetscFree(isz);CHKERRQ(ierr);
237   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
241 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
242 {
243   PetscLayout       rmap,cmap;
244   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
245   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
246   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
247   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
248   Mat               amat,bmat;
249   PetscMPIInt       rank;
250   PetscBool         done;
251   MPI_Comm          comm;
252   PetscErrorCode    ierr;
253 
254   PetscFunctionBegin;
255   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
256   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
257   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
258   /* Even if the matrix is symmetric, we treat it as if it were not symmetric */
259   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
260   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
261   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
262   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
263   /* total number of nonzero values is used to estimate the memory usage in the next step */
264   tnz  = ai[an]+bi[bn];
265   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
267   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
268   /* to find the longest message */
269   max_fszs = 0;
270   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
271   /* is there a better way to estimate the number of nonzeros in the matrix? */
272   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
273   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
274   rows_pos  = 0;
275   totalrows = 0;
276   for (i=0; i<nfrom; i++){
277     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
278     /* group data together */
279     for (j=0; j<fromsizes[2*i]; j+=2){
280       is_id                       = fromrows[rows_pos++];/* no of is */
281       rows_i                      = rows_data[is_id];
282       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
283     }
284     /* estimate space to avoid multiple allocations */
285     for (j=0; j<nidx; j++){
286       indvc_ij = 0;
287       rows_i   = rows_data[j];
288       for (l=0; l<rows_pos_i[j]; l++){
289         row    = rows_i[l]-rstart;
290         start  = ai[row];
291         end    = ai[row+1];
292         for (k=start; k<end; k++){ /* Amat */
293           col = aj[k] + cstart;
294           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
295         }
296         start = bi[row];
297         end   = bi[row+1];
298         for (k=start; k<end; k++) { /* Bmat */
299           col = gcols[bj[k]];
300           indices_tmp[indvc_ij++] = col;
301         }
302       }
303       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
304       indv_counts[i*nidx+j] = indvc_ij;
305       totalrows            += indvc_ij;
306     }
307   }
308   /* message triple <IS id, number of rows, rows> */
309   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
310   totalrows = 0;
311   rows_pos  = 0;
312   /* regroup the data, reusing the same code as above */
313   for (i=0;i<nfrom;i++){
314     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
315     for (j=0; j<fromsizes[2*i]; j+=2){
316       is_id                       = fromrows[rows_pos++];
317       rows_i                      = rows_data[is_id];
318       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
319     }
320     /* add data  */
321     for (j=0; j<nidx; j++){
322       if (!indv_counts[i*nidx+j]) continue;
323       indvc_ij = 0;
324       sbdata[totalrows++] = j;
325       sbdata[totalrows++] = indv_counts[i*nidx+j];
326       sbsizes[2*i]       += 2;
327       rows_i              = rows_data[j];
328       for (l=0; l<rows_pos_i[j]; l++){
329         row   = rows_i[l]-rstart;
330         start = ai[row];
331         end   = ai[row+1];
332         for (k=start; k<end; k++){ /* Amat */
333           col = aj[k] + cstart;
334           indices_tmp[indvc_ij++] = col;
335         }
336         start = bi[row];
337         end   = bi[row+1];
338         for (k=start; k<end; k++) { /* Bmat */
339           col = gcols[bj[k]];
340           indices_tmp[indvc_ij++] = col;
341         }
342       }
343       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
344       sbsizes[2*i]  += indvc_ij;
345       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
346       totalrows += indvc_ij;
347     }
348   }
349   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
350   for (i=0; i<nfrom; i++){
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp,owner;
368   PetscMPIInt       rank;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
383   /* is this a safe way to compute the number of nonzero values? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* this estimates memory better than the old implementation,
389    * which used the global size of the matrix
390    */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 
432 /*
433   Sample message format:
434   If processor A wants processor B to process some elements corresponding
435   to index sets is[1] and is[5], then
436   mesg [0] = 2   (no of index sets in the mesg)
437   -----------
438   mesg [1] = 1 => is[1]
439   mesg [2] = size of is[1];
440   -----------
441   mesg [3] = 5  => is[5]
442   mesg [4] = size of is[5];
443   -----------
444   mesg [5]
445   mesg [n]  data(is[1])
446   -----------
447   mesg[n+1]
448   mesg[m]  data(is[5])
449   -----------
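
  For example (illustrative values only): if A asks B to handle 3 rows of is[1],
  say {10,11,12}, and 2 rows of is[5], say {40,41}, the message is

     2, 1, 3, 5, 2, 10, 11, 12, 40, 41

  i.e. a header of 1 + 2*(no of index sets) entries followed by the row data of each IS.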
450 
451   Notes:
452   nrqs - no of requests sent (or to be sent out)
453   nrqr - no of requests received (which have to be, or have been, processed)
454 */
455 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
456 {
457   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
458   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
459   const PetscInt **idx,*idx_i;
460   PetscInt       *n,**data,len;
461 #if defined(PETSC_USE_CTABLE)
462   PetscTable     *table_data,table_data_i;
463   PetscInt       *tdata,tcount,tcount_max;
464 #else
465   PetscInt       *data_i,*d_p;
466 #endif
467   PetscErrorCode ierr;
468   PetscMPIInt    size,rank,tag1,tag2;
469   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
470   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
471   PetscBT        *table;
472   MPI_Comm       comm;
473   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
474   MPI_Status     *s_status,*recv_status;
475   MPI_Comm       *iscomms;
476   char           *t_p;
477 
478   PetscFunctionBegin;
479   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
480   size = c->size;
481   rank = c->rank;
482   M    = C->rmap->N;
483 
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
485   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
486 
487   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
488 
489   for (i=0; i<imax; i++) {
490     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
491     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
492   }
493 
494   /* evaluate communication - mesg to who, length of mesg, and buffer space
495      required. Based on this, buffers are allocated, and data copied into them  */
496   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
497   for (i=0; i<imax; i++) {
498     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
499     idx_i = idx[i];
500     len   = n[i];
501     for (j=0; j<len; j++) {
502       row = idx_i[j];
503       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
504       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
505       w4[proc]++;
506     }
507     for (j=0; j<size; j++) {
508       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
509     }
510   }
511 
512   nrqs     = 0;              /* no of outgoing messages */
513   msz      = 0;              /* total mesg length (for all procs) */
514   w1[rank] = 0;              /* no mesg sent to itself */
515   w3[rank] = 0;
516   for (i=0; i<size; i++) {
517     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
518   }
519   /* pa - is list of processors to communicate with */
520   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
521   for (i=0,j=0; i<size; i++) {
522     if (w1[i]) {pa[j] = i; j++;}
523   }
524 
525   /* Each message would have a header = 1 + 2*(no of IS) + data */
526   for (i=0; i<nrqs; i++) {
527     j      = pa[i];
528     w1[j] += w2[j] + 2*w3[j];
529     msz   += w1[j];
530   }
531 
532   /* Determine the number of messages to expect, their lengths, and their from-ids */
533   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
534   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
535 
536   /* Now post the Irecvs corresponding to these messages */
537   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
538 
539   /* Allocate Memory for outgoing messages */
540   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
541   /* TODO: use PetscCalloc2() */
542   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
543   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
544 
545   {
546     PetscInt *iptr = tmp,ict  = 0;
547     for (i=0; i<nrqs; i++) {
548       j         = pa[i];
549       iptr     +=  ict;
550       outdat[j] = iptr;
551       ict       = w1[j];
552     }
553   }
554 
555   /* Form the outgoing messages */
556   /* plug in the headers */
557   for (i=0; i<nrqs; i++) {
558     j            = pa[i];
559     outdat[j][0] = 0;
560     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
561     ptr[j]       = outdat[j] + 2*w3[j] + 1;
562   }
563 
564   /* Memory for doing local proc's work */
565   {
566     PetscInt M_BPB_imax = 0;
567 #if defined(PETSC_USE_CTABLE)
568     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
569     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
570     for (i=0; i<imax; i++) {
571       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
572     }
573     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
574     for (i=0; i<imax; i++) {
575       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
576     }
577 #else
578     PetscInt Mimax = 0;
579     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
580     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
581     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
582     for (i=0; i<imax; i++) {
583       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
584       data[i]  = d_p + M*i;
585     }
586 #endif
587   }
588 
589   /* Parse the IS and update local tables and the outgoing buf with the data */
590   {
591     PetscInt n_i,isz_i,*outdat_j,ctr_j;
592     PetscBT  table_i;
593 
594     for (i=0; i<imax; i++) {
595       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
596       n_i     = n[i];
597       table_i = table[i];
598       idx_i   = idx[i];
599 #if defined(PETSC_USE_CTABLE)
600       table_data_i = table_data[i];
601 #else
602       data_i  = data[i];
603 #endif
604       isz_i   = isz[i];
605       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
606         row  = idx_i[j];
607         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
608         if (proc != rank) { /* copy to the outgoing buffer */
609           ctr[proc]++;
610           *ptr[proc] = row;
611           ptr[proc]++;
612         } else if (!PetscBTLookupSet(table_i,row)) {
613 #if defined(PETSC_USE_CTABLE)
614           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
615 #else
616           data_i[isz_i] = row; /* Update the local table */
617 #endif
618           isz_i++;
619         }
620       }
621       /* Update the headers for the current IS */
622       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
623         if ((ctr_j = ctr[j])) {
624           outdat_j        = outdat[j];
625           k               = ++outdat_j[0];
626           outdat_j[2*k]   = ctr_j;
627           outdat_j[2*k-1] = i;
628         }
629       }
630       isz[i] = isz_i;
631     }
632   }
633 
634   /*  Now  post the sends */
635   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
636   for (i=0; i<nrqs; ++i) {
637     j    = pa[i];
638     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
639   }
640 
641   /* No longer need the original indices */
642   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
643   for (i=0; i<imax; ++i) {
644     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
645     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
646   }
647   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
648 
649   for (i=0; i<imax; ++i) {
650     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
651   }
652 
653   /* Do Local work */
654 #if defined(PETSC_USE_CTABLE)
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
656 #else
657   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
658 #endif
659 
660   /* Receive messages */
661   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
662   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
663 
664   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
665   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
666 
667   /* Phase 1 sends are complete - deallocate buffers */
668   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
669   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
670 
671   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
672   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
673   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
674   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
675   ierr = PetscFree(rbuf);CHKERRQ(ierr);
676 
677 
678   /* Send the data back */
679   /* Do a global reduction to know the buffer space req for incoming messages */
680   {
681     PetscMPIInt *rw1;
682 
683     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
684 
685     for (i=0; i<nrqr; ++i) {
686       proc = recv_status[i].MPI_SOURCE;
687 
688       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
689       rw1[proc] = isz1[i];
690     }
691     ierr = PetscFree(onodes1);CHKERRQ(ierr);
692     ierr = PetscFree(olengths1);CHKERRQ(ierr);
693 
694     /* Determine the number of messages to expect, their lengths, and their from-ids */
695     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
696     ierr = PetscFree(rw1);CHKERRQ(ierr);
697   }
698   /* Now post the Irecvs corresponding to these messages */
699   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
700 
701   /* Now  post the sends */
702   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
703   for (i=0; i<nrqr; ++i) {
704     j    = recv_status[i].MPI_SOURCE;
705     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
706   }
707 
708   /* receive work done on other processors */
709   {
710     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
711     PetscMPIInt idex;
712     PetscBT     table_i;
713     MPI_Status  *status2;
714 
715     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
716     for (i=0; i<nrqs; ++i) {
717       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
718       /* Process the message */
719       rbuf2_i = rbuf2[idex];
720       ct1     = 2*rbuf2_i[0]+1;
721       jmax    = rbuf2[idex][0];
722       for (j=1; j<=jmax; j++) {
723         max     = rbuf2_i[2*j];
724         is_no   = rbuf2_i[2*j-1];
725         isz_i   = isz[is_no];
726         table_i = table[is_no];
727 #if defined(PETSC_USE_CTABLE)
728         table_data_i = table_data[is_no];
729 #else
730         data_i  = data[is_no];
731 #endif
732         for (k=0; k<max; k++,ct1++) {
733           row = rbuf2_i[ct1];
734           if (!PetscBTLookupSet(table_i,row)) {
735 #if defined(PETSC_USE_CTABLE)
736             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
737 #else
738             data_i[isz_i] = row;
739 #endif
740             isz_i++;
741           }
742         }
743         isz[is_no] = isz_i;
744       }
745     }
746 
747     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
748     ierr = PetscFree(status2);CHKERRQ(ierr);
749   }
750 
751 #if defined(PETSC_USE_CTABLE)
752   tcount_max = 0;
753   for (i=0; i<imax; ++i) {
754     table_data_i = table_data[i];
755     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
756     if (tcount_max < tcount) tcount_max = tcount;
757   }
758   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
759 #endif
760 
761   for (i=0; i<imax; ++i) {
762 #if defined(PETSC_USE_CTABLE)
763     PetscTablePosition tpos;
764     table_data_i = table_data[i];
765 
766     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
767     while (tpos) {
768       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
769       tdata[--j] = --k;
770     }
771     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
772 #else
773     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
774 #endif
775     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
776   }
777 
778   ierr = PetscFree(iscomms);CHKERRQ(ierr);
779   ierr = PetscFree(onodes2);CHKERRQ(ierr);
780   ierr = PetscFree(olengths2);CHKERRQ(ierr);
781 
782   ierr = PetscFree(pa);CHKERRQ(ierr);
783   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
784   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
785   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
786   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
787   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
788   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
789   ierr = PetscFree(s_status);CHKERRQ(ierr);
790   ierr = PetscFree(recv_status);CHKERRQ(ierr);
791   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
792   ierr = PetscFree(xdata);CHKERRQ(ierr);
793   ierr = PetscFree(isz1);CHKERRQ(ierr);
794 #if defined(PETSC_USE_CTABLE)
795   for (i=0; i<imax; i++) {
796     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
797   }
798   ierr = PetscFree(table_data);CHKERRQ(ierr);
799   ierr = PetscFree(tdata);CHKERRQ(ierr);
800   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
801 #else
802   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
803 #endif
804   PetscFunctionReturn(0);
805 }
806 
807 /*
808    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
809        the work on the local processor.
810 
811      Inputs:
812       C      - MAT_MPIAIJ;
813       imax - total no of index sets processed at a time;
814       table  - an array of bit tables (one per index set), each of size M bits.
815 
816      Output:
817       isz    - array containing the count of the solution elements corresponding
818                to each index set;
819       data or table_data  - pointer to the solutions
820 */
821 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
822 {
823   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
824   Mat        A  = c->A,B = c->B;
825   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
826   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
827   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
828   PetscBT    table_i;
829 #if defined(PETSC_USE_CTABLE)
830   PetscTable         table_data_i;
831   PetscErrorCode     ierr;
832   PetscTablePosition tpos;
833   PetscInt           tcount,*tdata;
834 #else
835   PetscInt           *data_i;
836 #endif
837 
838   PetscFunctionBegin;
839   rstart = C->rmap->rstart;
840   cstart = C->cmap->rstart;
841   ai     = a->i;
842   aj     = a->j;
843   bi     = b->i;
844   bj     = b->j;
845   garray = c->garray;
846 
847   for (i=0; i<imax; i++) {
848 #if defined(PETSC_USE_CTABLE)
849     /* copy existing entries of table_data_i into tdata[] */
850     table_data_i = table_data[i];
851     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
852     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
853 
854     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
855     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
856     while (tpos) {
857       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
858       tdata[--j] = --row;
859       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
860     }
861 #else
862     data_i  = data[i];
863 #endif
864     table_i = table[i];
865     isz_i   = isz[i];
866     max     = isz[i];
867 
868     for (j=0; j<max; j++) {
869 #if defined(PETSC_USE_CTABLE)
870       row   = tdata[j] - rstart;
871 #else
872       row   = data_i[j] - rstart;
873 #endif
874       start = ai[row];
875       end   = ai[row+1];
876       for (k=start; k<end; k++) { /* Amat */
877         val = aj[k] + cstart;
878         if (!PetscBTLookupSet(table_i,val)) {
879 #if defined(PETSC_USE_CTABLE)
880           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
881 #else
882           data_i[isz_i] = val;
883 #endif
884           isz_i++;
885         }
886       }
887       start = bi[row];
888       end   = bi[row+1];
889       for (k=start; k<end; k++) { /* Bmat */
890         val = garray[bj[k]];
891         if (!PetscBTLookupSet(table_i,val)) {
892 #if defined(PETSC_USE_CTABLE)
893           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
894 #else
895           data_i[isz_i] = val;
896 #endif
897           isz_i++;
898         }
899       }
900     }
901     isz[i] = isz_i;
902 
903 #if defined(PETSC_USE_CTABLE)
904     ierr = PetscFree(tdata);CHKERRQ(ierr);
905 #endif
906   }
907   PetscFunctionReturn(0);
908 }
909 
910 /*
911       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
912          and return the output
913 
914          Input:
915            C    - the matrix
916            nrqr - no of messages being processed.
917            rbuf - an array of pointers to the received requests
918 
919          Output:
920            xdata - array of messages to be sent back
921            isz1  - size of each message
922 
923   For better efficiency perhaps we should malloc separately each xdata[i],
924 then if a re-malloc is required we need only copy the data for that one row
925 rather than all previous rows, as happens now where a single large chunk of
926 memory is used.
927 
928 */
929 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
930 {
931   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
932   Mat            A  = c->A,B = c->B;
933   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
934   PetscErrorCode ierr;
935   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
936   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
937   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
938   PetscInt       *rbuf_i,kmax,rbuf_0;
939   PetscBT        xtable;
940 
941   PetscFunctionBegin;
942   m      = C->rmap->N;
943   rstart = C->rmap->rstart;
944   cstart = C->cmap->rstart;
945   ai     = a->i;
946   aj     = a->j;
947   bi     = b->i;
948   bj     = b->j;
949   garray = c->garray;
950 
951 
952   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
953     rbuf_i =  rbuf[i];
954     rbuf_0 =  rbuf_i[0];
955     ct    += rbuf_0;
956     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
957   }
958 
959   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
960   else max1 = 1;
961   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
962   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
963   ++no_malloc;
964   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
965   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
966 
967   ct3 = 0;
968   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
969     rbuf_i =  rbuf[i];
970     rbuf_0 =  rbuf_i[0];
971     ct1    =  2*rbuf_0+1;
972     ct2    =  ct1;
973     ct3   += ct1;
974     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
975       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
976       oct2 = ct2;
977       kmax = rbuf_i[2*j];
978       for (k=0; k<kmax; k++,ct1++) {
979         row = rbuf_i[ct1];
980         if (!PetscBTLookupSet(xtable,row)) {
981           if (!(ct3 < mem_estimate)) {
982             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
983             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
984             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
985             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
986             xdata[0]     = tmp;
987             mem_estimate = new_estimate; ++no_malloc;
988             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
989           }
990           xdata[i][ct2++] = row;
991           ct3++;
992         }
993       }
994       for (k=oct2,max2=ct2; k<max2; k++) {
995         row   = xdata[i][k] - rstart;
996         start = ai[row];
997         end   = ai[row+1];
998         for (l=start; l<end; l++) {
999           val = aj[l] + cstart;
1000           if (!PetscBTLookupSet(xtable,val)) {
1001             if (!(ct3 < mem_estimate)) {
1002               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1003               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1004               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1005               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1006               xdata[0]     = tmp;
1007               mem_estimate = new_estimate; ++no_malloc;
1008               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1009             }
1010             xdata[i][ct2++] = val;
1011             ct3++;
1012           }
1013         }
1014         start = bi[row];
1015         end   = bi[row+1];
1016         for (l=start; l<end; l++) {
1017           val = garray[bj[l]];
1018           if (!PetscBTLookupSet(xtable,val)) {
1019             if (!(ct3 < mem_estimate)) {
1020               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1021               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1022               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1023               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1024               xdata[0]     = tmp;
1025               mem_estimate = new_estimate; ++no_malloc;
1026               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1027             }
1028             xdata[i][ct2++] = val;
1029             ct3++;
1030           }
1031         }
1032       }
1033       /* Update the header*/
1034       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1035       xdata[i][2*j-1] = rbuf_i[2*j-1];
1036     }
1037     xdata[i][0] = rbuf_0;
1038     xdata[i+1]  = xdata[i] + ct2;
1039     isz1[i]     = ct2; /* size of each message */
1040   }
1041   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1042   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1043   PetscFunctionReturn(0);
1044 }
1045 /* -------------------------------------------------------------------------*/
1046 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1047 /*
1048     Every processor gets the entire matrix
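
    A user-level sketch of a similar result via the public API (illustrative only, not
    necessarily the path that reaches this routine; error handling and object creation
    are elided): every process can request the whole matrix as a sequential submatrix with

       PetscInt M,N;
       IS       isrow,iscol;
       Mat      *Aseq;

       ierr = MatGetSize(A,&M,&N);CHKERRQ(ierr);
       ierr = ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);CHKERRQ(ierr);
       ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);CHKERRQ(ierr);
       ierr = MatCreateSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&Aseq);CHKERRQ(ierr);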
1049 */
1050 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1051 {
1052   Mat            B;
1053   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1054   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1055   PetscErrorCode ierr;
1056   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1057   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1058   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1059   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1060 
1061   PetscFunctionBegin;
1062   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1063   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1064 
1065   if (scall == MAT_INITIAL_MATRIX) {
1066     /* ----------------------------------------------------------------
1067          Tell every processor the number of nonzeros per row
1068     */
1069     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1070     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1071       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1072     }
1073     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1074     for (i=0; i<size; i++) {
1075       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1076       displs[i]     = A->rmap->range[i];
1077     }
1078 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1079     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1080 #else
1081     sendcount = A->rmap->rend - A->rmap->rstart;
1082     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1083 #endif
1084     /* ---------------------------------------------------------------
1085          Create the sequential matrix of the same type as the local block diagonal
1086     */
1087     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1088     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1089     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1090     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1091     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1092     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1093     **Bin = B;
1094     b     = (Mat_SeqAIJ*)B->data;
1095 
1096     /*--------------------------------------------------------------------
1097        Copy my part of matrix column indices over
1098     */
1099     sendcount  = ad->nz + bd->nz;
1100     jsendbuf   = b->j + b->i[rstarts[rank]];
1101     a_jsendbuf = ad->j;
1102     b_jsendbuf = bd->j;
1103     n          = A->rmap->rend - A->rmap->rstart;
1104     cnt        = 0;
1105     for (i=0; i<n; i++) {
1106 
1107       /* put in lower diagonal portion */
1108       m = bd->i[i+1] - bd->i[i];
1109       while (m > 0) {
1110         /* is it above diagonal (in bd (compressed) numbering) */
1111         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1112         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1113         m--;
1114       }
1115 
1116       /* put in diagonal portion */
1117       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1118         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1119       }
1120 
1121       /* put in upper diagonal portion */
1122       while (m-- > 0) {
1123         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1124       }
1125     }
1126     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1127 
1128     /*--------------------------------------------------------------------
1129        Gather all column indices to all processors
1130     */
1131     for (i=0; i<size; i++) {
1132       recvcounts[i] = 0;
1133       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1134         recvcounts[i] += lens[j];
1135       }
1136     }
1137     displs[0] = 0;
1138     for (i=1; i<size; i++) {
1139       displs[i] = displs[i-1] + recvcounts[i-1];
1140     }
1141 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1142     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1143 #else
1144     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1145 #endif
1146     /*--------------------------------------------------------------------
1147         Assemble the matrix into usable form (note numerical values not yet set)
1148     */
1149     /* set the b->ilen (length of each row) values */
1150     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1151     /* set the b->i indices */
1152     b->i[0] = 0;
1153     for (i=1; i<=A->rmap->N; i++) {
1154       b->i[i] = b->i[i-1] + lens[i-1];
1155     }
1156     ierr = PetscFree(lens);CHKERRQ(ierr);
1157     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1158     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1159 
1160   } else {
1161     B = **Bin;
1162     b = (Mat_SeqAIJ*)B->data;
1163   }
1164 
1165   /*--------------------------------------------------------------------
1166        Copy my part of matrix numerical values into the values location
1167   */
1168   if (flag == MAT_GET_VALUES) {
1169     sendcount = ad->nz + bd->nz;
1170     sendbuf   = b->a + b->i[rstarts[rank]];
1171     a_sendbuf = ad->a;
1172     b_sendbuf = bd->a;
1173     b_sendj   = bd->j;
1174     n         = A->rmap->rend - A->rmap->rstart;
1175     cnt       = 0;
1176     for (i=0; i<n; i++) {
1177 
1178       /* put in lower diagonal portion */
1179       m = bd->i[i+1] - bd->i[i];
1180       while (m > 0) {
1181         /* is it above diagonal (in bd (compressed) numbering) */
1182         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1183         sendbuf[cnt++] = *b_sendbuf++;
1184         m--;
1185         b_sendj++;
1186       }
1187 
1188       /* put in diagonal portion */
1189       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1190         sendbuf[cnt++] = *a_sendbuf++;
1191       }
1192 
1193       /* put in upper diagonal portion */
1194       while (m-- > 0) {
1195         sendbuf[cnt++] = *b_sendbuf++;
1196         b_sendj++;
1197       }
1198     }
1199     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1200 
1201     /* -----------------------------------------------------------------
1202        Gather all numerical values to all processors
1203     */
1204     if (!recvcounts) {
1205       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1206     }
1207     for (i=0; i<size; i++) {
1208       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1209     }
1210     displs[0] = 0;
1211     for (i=1; i<size; i++) {
1212       displs[i] = displs[i-1] + recvcounts[i-1];
1213     }
1214     recvbuf = b->a;
1215 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1216     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1217 #else
1218     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1219 #endif
1220   }  /* endof (flag == MAT_GET_VALUES) */
1221   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1222 
1223   if (A->symmetric) {
1224     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1225   } else if (A->hermitian) {
1226     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1227   } else if (A->structurally_symmetric) {
1228     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1229   }
1230   PetscFunctionReturn(0);
1231 }
1232 
1233 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1234 {
1235   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1236   Mat            submat,A = c->A,B = c->B;
1237   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1238   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1239   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1240   const PetscInt *icol,*irow;
1241   PetscInt       nrow,ncol,start;
1242   PetscErrorCode ierr;
1243   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1244   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1245   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1246   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1247   PetscInt       *lens,rmax,ncols,*cols,Crow;
1248 #if defined(PETSC_USE_CTABLE)
1249   PetscTable     cmap,rmap;
1250   PetscInt       *cmap_loc,*rmap_loc;
1251 #else
1252   PetscInt       *cmap,*rmap;
1253 #endif
1254   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1255   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1256   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1257   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1258   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1259   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1260   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1261   MPI_Comm       comm;
1262   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1263   PetscMPIInt    *onodes1,*olengths1,idex,end;
1264   Mat_SubSppt    *smatis1;
1265   PetscBool      isrowsorted,iscolsorted;
1266 
1267   PetscFunctionBegin;
1268   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1269 
1270   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1271   size = c->size;
1272   rank = c->rank;
1273 
1274   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1275   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1276   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1277   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1278   if (allcolumns) {
1279     icol = NULL;
1280     ncol = C->cmap->N;
1281   } else {
1282     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1283     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1284   }
1285 
1286   if (scall == MAT_INITIAL_MATRIX) {
1287     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1288 
1289     /* Get some new tags to keep the communication clean */
1290     tag1 = ((PetscObject)C)->tag;
1291     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1292     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1293 
1294     /* evaluate communication - mesg to who, length of mesg, and buffer space
1295      required. Based on this, buffers are allocated, and data copied into them */
1296     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1297     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1298 
1299     /* w1[proc] = num of rows owned by proc -- to be requested */
1300     proc = 0;
1301     nrqs = 0; /* num of outgoing messages */
1302     for (j=0; j<nrow; j++) {
1303       row  = irow[j];
1304       if (!isrowsorted) proc = 0;
1305       while (row >= C->rmap->range[proc+1]) proc++;
1306       w1[proc]++;
1307       row2proc[j] = proc; /* map row index to proc */
1308 
1309       if (proc != rank && !w2[proc]) {
1310         w2[proc] = 1; nrqs++;
1311       }
1312     }
1313     w1[rank] = 0;  /* rows owned by self will not be requested */
1314 
1315     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1316     for (proc=0,j=0; proc<size; proc++) {
1317       if (w1[proc]) { pa[j++] = proc;}
1318     }
1319 
1320     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321     msz = 0;              /* total mesg length (for all procs) */
1322     for (i=0; i<nrqs; i++) {
1323       proc      = pa[i];
1324       w1[proc] += 3;
1325       msz      += w1[proc];
1326     }
1327     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1328 
1329     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1330     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1332 
1333     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1336 
1337     /* Now post the Irecvs corresponding to these messages */
1338     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1339 
1340     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1341     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1342 
1343     /* Allocate Memory for outgoing messages */
1344     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1345     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1346     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1347 
1348     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1349     iptr = tmp;
1350     for (i=0; i<nrqs; i++) {
1351       proc        = pa[i];
1352       sbuf1[proc] = iptr;
1353       iptr       += w1[proc];
1354     }
1355 
1356     /* Form the outgoing messages */
1357     /* Initialize the header space */
1358     for (i=0; i<nrqs; i++) {
1359       proc      = pa[i];
1360       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1361       ptr[proc] = sbuf1[proc] + 3;
1362     }
1363 
1364     /* Parse the isrow and copy data into outbuf */
1365     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1366     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1367       proc = row2proc[j];
1368       if (proc != rank) { /* copy to the outgoing buf*/
1369         *ptr[proc] = irow[j];
1370         ctr[proc]++; ptr[proc]++;
1371       }
1372     }
1373 
1374     /* Update the headers for the current IS */
1375     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376       if ((ctr_j = ctr[j])) {
1377         sbuf1_j        = sbuf1[j];
1378         k              = ++sbuf1_j[0];
1379         sbuf1_j[2*k]   = ctr_j;
1380         sbuf1_j[2*k-1] = 0;
1381       }
1382     }
1383 
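         /* Each outgoing request sbuf1[proc] now has the layout
            [ num of IS | IS id | num of rows | requested row indices ... ],
            with num of IS = 1 and IS id = 0 in this single-IS routine */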
1384     /* Now post the sends */
1385     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1386     for (i=0; i<nrqs; ++i) {
1387       proc = pa[i];
1388       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1389     }
1390 
1391     /* Post Receives to capture the buffer size */
1392     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1393     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1394 
1395     rbuf2[0] = tmp + msz;
1396     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1397 
1398     for (i=0; i<nrqs; ++i) {
1399       proc = pa[i];
1400       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1401     }
1402 
1403     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1404 
1405     /* Send to other procs the buf size they should allocate */
1406     /* Receive messages*/
1407     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1408     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1409 
1410     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1411     for (i=0; i<nrqr; ++i) {
1412       req_size[i] = 0;
1413       rbuf1_i        = rbuf1[i];
1414       start          = 2*rbuf1_i[0] + 1;
1415       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1416       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1417       sbuf2_i        = sbuf2[i];
1418       for (j=start; j<end; j++) {
1419         k            = rbuf1_i[j] - rstart;
1420         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421         sbuf2_i[j]   = ncols;
1422         req_size[i] += ncols;
1423       }
1424       req_source1[i] = r_status1[i].MPI_SOURCE;
1425 
1426       /* form the header */
1427       sbuf2_i[0] = req_size[i];
1428       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1429 
1430       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1431     }
1432 
1433     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1434     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1435 
1436     /* rbuf2 is received; post receives for the column indices a->j */
1437     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1438 
1439     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1440     for (i=0; i<nrqs; ++i) {
1441       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1442       req_source2[i] = r_status2[i].MPI_SOURCE;
1443       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1444     }
1445 
1446     /* Wait on sends1 and sends2 */
1447     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1448     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1449     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1450     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1451 
1452     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1453     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1454 
1455     /* Now allocate sending buffers for a->j, and send them off */
1456     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1457     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1459     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1460 
1461     for (i=0; i<nrqr; i++) { /* for each requested message */
1462       rbuf1_i   = rbuf1[i];
1463       sbuf_aj_i = sbuf_aj[i];
1464       ct1       = 2*rbuf1_i[0] + 1;
1465       ct2       = 0;
1466       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1467 
1468       kmax = rbuf1[i][2];
1469       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470         row    = rbuf1_i[ct1] - rstart;
1471         nzA    = ai[row+1] - ai[row];
1472         nzB    = bi[row+1] - bi[row];
1473         ncols  = nzA + nzB;
1474         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1475 
1476         /* load the column indices for this row into cols*/
1477         cols = sbuf_aj_i + ct2;
1478 
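             /* Emit global column indices for this row: B (off-diagonal) columns below cstart first,
                then A (diagonal) columns shifted by cstart, then B columns at or beyond cend */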
1479         lwrite = 0;
1480         for (l=0; l<nzB; l++) {
1481           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482         }
1483         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484         for (l=0; l<nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486         }
1487 
1488         ct2 += ncols;
1489       }
1490       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1491     }
1492 
1493     /* create column map (cmap): global col of C -> local col of submat */
1494 #if defined(PETSC_USE_CTABLE)
1495     if (!allcolumns) {
1496       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1497       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1498       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499         if (icol[j] >= cstart && icol[j] <cend) {
1500           cmap_loc[icol[j] - cstart] = j+1;
1501         } else { /* use PetscTable for non-local col indices */
1502           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1503         }
1504       }
1505     } else {
1506       cmap     = NULL;
1507       cmap_loc = NULL;
1508     }
1509     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1510 #else
1511     if (!allcolumns) {
1512       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1513       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514     } else {
1515       cmap = NULL;
1516     }
1517 #endif
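         /* In both variants cmap stores (local submat column)+1, so a lookup result of 0 means the
            global column of C is not in iscol; callers subtract 1 after a successful lookup */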
1518 
1519     /* Create lens for MatSeqAIJSetPreallocation() */
1520     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1521 
1522     /* Compute lens from local part of C */
1523     for (j=0; j<nrow; j++) {
1524       row  = irow[j];
1525       proc = row2proc[j];
1526       if (proc == rank) {
1527         /* diagonal part A = c->A */
1528         ncols = ai[row-rstart+1] - ai[row-rstart];
1529         cols  = aj + ai[row-rstart];
1530         if (!allcolumns) {
1531           for (k=0; k<ncols; k++) {
1532 #if defined(PETSC_USE_CTABLE)
1533             tcol = cmap_loc[cols[k]];
1534 #else
1535             tcol = cmap[cols[k]+cstart];
1536 #endif
1537             if (tcol) lens[j]++;
1538           }
1539         } else { /* allcolumns */
1540           lens[j] = ncols;
1541         }
1542 
1543         /* off-diagonal part B = c->B */
1544         ncols = bi[row-rstart+1] - bi[row-rstart];
1545         cols  = bj + bi[row-rstart];
1546         if (!allcolumns) {
1547           for (k=0; k<ncols; k++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1550 #else
1551             tcol = cmap[bmap[cols[k]]];
1552 #endif
1553             if (tcol) lens[j]++;
1554           }
1555         } else { /* allcolumns */
1556           lens[j] += ncols;
1557         }
1558       }
1559     }
1560 
1561     /* Create row map (rmap): global row of C -> local row of submat */
1562 #if defined(PETSC_USE_CTABLE)
1563     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1564     for (j=0; j<nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) { /* a local row */
1568         rmap_loc[row - rstart] = j;
1569       } else {
1570         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1571       }
1572     }
1573 #else
1574     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1575     for (j=0; j<nrow; j++) {
1576       rmap[irow[j]] = j;
1577     }
1578 #endif
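         /* Note the offset convention differs: the PetscTable variant stores (local row)+1 and each lookup
            is followed by row--, while the plain array variant stores the local row index directly */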
1579 
1580     /* Update lens from offproc data */
1581     /* recv a->j is done */
1582     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1583     for (i=0; i<nrqs; i++) {
1584       proc    = pa[i];
1585       sbuf1_i = sbuf1[proc];
1586       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587       ct1     = 2 + 1; /* skip the 3-integer header (num of IS, IS id, num of rows) */
1588       ct2     = 0;
1589       rbuf2_i = rbuf2[i]; /* received length of C->j */
1590       rbuf3_i = rbuf3[i]; /* received C->j */
1591 
1592       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593       max1   = sbuf1_i[2];
1594       for (k=0; k<max1; k++,ct1++) {
1595 #if defined(PETSC_USE_CTABLE)
1596         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1597         row--;
1598         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599 #else
1600         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601 #endif
1602         /* Now, store row index of submat in sbuf1_i[ct1] */
1603         sbuf1_i[ct1] = row;
1604 
1605         nnz = rbuf2_i[ct1];
1606         if (!allcolumns) {
1607           for (l=0; l<nnz; l++,ct2++) {
1608 #if defined(PETSC_USE_CTABLE)
1609             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611             } else {
1612               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1613             }
1614 #else
1615             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616 #endif
1617             if (tcol) lens[row]++;
1618           }
1619         } else { /* allcolumns */
1620           lens[row] += nnz;
1621         }
1622       }
1623     }
1624     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1625     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1626 
1627     /* Create the submatrices */
1628     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1629     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1630 
1631     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1632     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1633     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1634     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1635     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1636 
1637     /* create struct Mat_SubSppt and attach it to submat */
1638     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1639     subc = (Mat_SeqAIJ*)submat->data;
1640     subc->submatis1 = smatis1;
1641 
1642     smatis1->id          = 0;
1643     smatis1->nrqs        = nrqs;
1644     smatis1->nrqr        = nrqr;
1645     smatis1->rbuf1       = rbuf1;
1646     smatis1->rbuf2       = rbuf2;
1647     smatis1->rbuf3       = rbuf3;
1648     smatis1->sbuf2       = sbuf2;
1649     smatis1->req_source2 = req_source2;
1650 
1651     smatis1->sbuf1       = sbuf1;
1652     smatis1->ptr         = ptr;
1653     smatis1->tmp         = tmp;
1654     smatis1->ctr         = ctr;
1655 
1656     smatis1->pa           = pa;
1657     smatis1->req_size     = req_size;
1658     smatis1->req_source1  = req_source1;
1659 
1660     smatis1->allcolumns  = allcolumns;
1661     smatis1->singleis    = PETSC_TRUE;
1662     smatis1->row2proc    = row2proc;
1663     smatis1->rmap        = rmap;
1664     smatis1->cmap        = cmap;
1665 #if defined(PETSC_USE_CTABLE)
1666     smatis1->rmap_loc    = rmap_loc;
1667     smatis1->cmap_loc    = cmap_loc;
1668 #endif
1669 
1670     smatis1->destroy     = submat->ops->destroy;
1671     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672     submat->factortype   = C->factortype;
1673 
1674     /* compute rmax, the maximum number of nonzeros in any row of submat; it sizes the subcols/subvals work arrays below */
1675     rmax = 0;
1676     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1677 
1678   } else { /* scall == MAT_REUSE_MATRIX */
1679     submat = submats[0];
1680     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1681 
1682     subc    = (Mat_SeqAIJ*)submat->data;
1683     rmax    = subc->rmax;
1684     smatis1 = subc->submatis1;
1685     nrqs        = smatis1->nrqs;
1686     nrqr        = smatis1->nrqr;
1687     rbuf1       = smatis1->rbuf1;
1688     rbuf2       = smatis1->rbuf2;
1689     rbuf3       = smatis1->rbuf3;
1690     req_source2 = smatis1->req_source2;
1691 
1692     sbuf1     = smatis1->sbuf1;
1693     sbuf2     = smatis1->sbuf2;
1694     ptr       = smatis1->ptr;
1695     tmp       = smatis1->tmp;
1696     ctr       = smatis1->ctr;
1697 
1698     pa         = smatis1->pa;
1699     req_size   = smatis1->req_size;
1700     req_source1 = smatis1->req_source1;
1701 
1702     allcolumns = smatis1->allcolumns;
1703     row2proc   = smatis1->row2proc;
1704     rmap       = smatis1->rmap;
1705     cmap       = smatis1->cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     rmap_loc   = smatis1->rmap_loc;
1708     cmap_loc   = smatis1->cmap_loc;
1709 #endif
1710   }
1711 
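       /* From here on the MAT_INITIAL_MATRIX and MAT_REUSE_MATRIX paths coincide: both exchange the
        numerical values (tag4) and insert them into submat */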
1712   /* Post recv matrix values */
1713   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1714   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1715   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1716   for (i=0; i<nrqs; ++i) {
1717     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1718     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1719   }
1720 
1721   /* Allocate sending buffers for a->a, and send them off */
1722   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1723   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1725   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1726 
1727   for (i=0; i<nrqr; i++) {
1728     rbuf1_i   = rbuf1[i];
1729     sbuf_aa_i = sbuf_aa[i];
1730     ct1       = 2*rbuf1_i[0]+1;
1731     ct2       = 0;
1732     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1733 
1734     kmax = rbuf1_i[2];
1735     for (k=0; k<kmax; k++,ct1++) {
1736       row = rbuf1_i[ct1] - rstart;
1737       nzA = ai[row+1] - ai[row];
1738       nzB = bi[row+1] - bi[row];
1739       ncols  = nzA + nzB;
1740       cworkB = bj + bi[row];
1741       vworkA = a_a + ai[row];
1742       vworkB = b_a + bi[row];
1743 
1744       /* load the column values for this row into vals*/
1745       vals = sbuf_aa_i + ct2;
1746 
1747       lwrite = 0;
1748       for (l=0; l<nzB; l++) {
1749         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750       }
1751       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752       for (l=0; l<nzB; l++) {
1753         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754       }
1755 
1756       ct2 += ncols;
1757     }
1758     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1759   }
1760 
1761   /* Assemble submat */
1762   /* First assemble the local rows */
1763   for (j=0; j<nrow; j++) {
1764     row  = irow[j];
1765     proc = row2proc[j];
1766     if (proc == rank) {
1767       Crow = row - rstart;  /* local row index of C */
1768 #if defined(PETSC_USE_CTABLE)
1769       row = rmap_loc[Crow]; /* row index of submat */
1770 #else
1771       row = rmap[row];
1772 #endif
1773 
1774       if (allcolumns) {
1775         /* diagonal part A = c->A */
1776         ncols = ai[Crow+1] - ai[Crow];
1777         cols  = aj + ai[Crow];
1778         vals  = a->a + ai[Crow];
1779         i     = 0;
1780         for (k=0; k<ncols; k++) {
1781           subcols[i]   = cols[k] + cstart;
1782           subvals[i++] = vals[k];
1783         }
1784 
1785         /* off-diagonal part B = c->B */
1786         ncols = bi[Crow+1] - bi[Crow];
1787         cols  = bj + bi[Crow];
1788         vals  = b->a + bi[Crow];
1789         for (k=0; k<ncols; k++) {
1790           subcols[i]   = bmap[cols[k]];
1791           subvals[i++] = vals[k];
1792         }
1793 
1794         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1795 
1796       } else { /* !allcolumns */
1797 #if defined(PETSC_USE_CTABLE)
1798         /* diagonal part A = c->A */
1799         ncols = ai[Crow+1] - ai[Crow];
1800         cols  = aj + ai[Crow];
1801         vals  = a->a + ai[Crow];
1802         i     = 0;
1803         for (k=0; k<ncols; k++) {
1804           tcol = cmap_loc[cols[k]];
1805           if (tcol) {
1806             subcols[i]   = --tcol;
1807             subvals[i++] = vals[k];
1808           }
1809         }
1810 
1811         /* off-diagonal part B = c->B */
1812         ncols = bi[Crow+1] - bi[Crow];
1813         cols  = bj + bi[Crow];
1814         vals  = b->a + bi[Crow];
1815         for (k=0; k<ncols; k++) {
1816           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1817           if (tcol) {
1818             subcols[i]   = --tcol;
1819             subvals[i++] = vals[k];
1820           }
1821         }
1822 #else
1823         /* diagonal part A = c->A */
1824         ncols = ai[Crow+1] - ai[Crow];
1825         cols  = aj + ai[Crow];
1826         vals  = a->a + ai[Crow];
1827         i     = 0;
1828         for (k=0; k<ncols; k++) {
1829           tcol = cmap[cols[k]+cstart];
1830           if (tcol) {
1831             subcols[i]   = --tcol;
1832             subvals[i++] = vals[k];
1833           }
1834         }
1835 
1836         /* off-diagonal part B = c->B */
1837         ncols = bi[Crow+1] - bi[Crow];
1838         cols  = bj + bi[Crow];
1839         vals  = b->a + bi[Crow];
1840         for (k=0; k<ncols; k++) {
1841           tcol = cmap[bmap[cols[k]]];
1842           if (tcol) {
1843             subcols[i]   = --tcol;
1844             subvals[i++] = vals[k];
1845           }
1846         }
1847 #endif
1848         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1849       }
1850     }
1851   }
1852 
1853   /* Now assemble the off-proc rows */
1854   for (i=0; i<nrqs; i++) { /* for each requested message */
1855     /* recv values from other processes */
1856     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1857     proc    = pa[idex];
1858     sbuf1_i = sbuf1[proc];
1859     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1860     ct1     = 2 + 1; /* skip the 3-integer header (num of IS, IS id, num of rows) */
1861     ct2     = 0; /* count of received C->j */
1862     ct3     = 0; /* count of received C->j that will be inserted into submat */
1863     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1864     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1865     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1866 
1867     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868     max1 = sbuf1_i[2];             /* num of rows */
1869     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1870       row = sbuf1_i[ct1]; /* row index of submat */
1871       if (!allcolumns) {
1872         idex = 0;
1873         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1876 #if defined(PETSC_USE_CTABLE)
1877             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879             } else {
1880               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1881             }
1882 #else
1883             tcol = cmap[rbuf3_i[ct2]];
1884 #endif
1885             if (tcol) {
1886               subcols[idex]   = --tcol; /* may not be sorted */
1887               subvals[idex++] = rbuf4_i[ct2];
1888 
1889               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1890                For reuse, we replace the received C->j entries with the indices that should be inserted into submat */
1891               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892             }
1893           }
1894           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1895         } else { /* scall == MAT_REUSE_MATRIX */
1896           submat = submats[0];
1897           subc   = (Mat_SeqAIJ*)submat->data;
1898 
1899           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900           for (l=0; l<nnz; l++) {
1901             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1902             subvals[idex++] = rbuf4_i[ct2];
1903           }
1904 
1905           bj = subc->j + subc->i[row]; /* sorted column indices */
1906           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1907         }
1908       } else { /* allcolumns */
1909         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1910         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1911         ct2 += nnz;
1912       }
1913     }
1914   }
1915 
1916   /* sends of a->a are done */
1917   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1918   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1919 
1920   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   submats[0] = submat;
1923 
1924   /* Restore the indices */
1925   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1926   if (!allcolumns) {
1927     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1928   }
1929 
1930   /* Destroy allocated memory */
1931   for (i=0; i<nrqs; ++i) {
1932     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); /* each rbuf4[i] came from PetscMalloc1(); the PetscMalloc3() triple is freed below */
1933   }
1934   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1935   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1936   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1937 
1938   if (scall == MAT_INITIAL_MATRIX) {
1939     ierr = PetscFree(lens);CHKERRQ(ierr);
1940     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1941     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1942   }
1943   PetscFunctionReturn(0);
1944 }
1945 
1946 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1947 {
1948   PetscErrorCode ierr;
1949   PetscInt       ncol;
1950   PetscBool      colflag,allcolumns=PETSC_FALSE;
1951 
1952   PetscFunctionBegin;
1953   /* Allocate memory to hold all the submatrices */
1954   if (scall == MAT_INITIAL_MATRIX) {
1955     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1956   }
1957 
1958   /* Check for special case: each processor gets entire matrix columns */
1959   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1960   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1961   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1962 
1963   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1964   PetscFunctionReturn(0);
1965 }
1966 
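     /* Dispatcher: extracts ismax sequential submatrices of the parallel matrix C. The work proceeds in
        stages of at most nmax index sets, so the temporary arrays sized by the global column dimension
        stay bounded */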
1967 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1968 {
1969   PetscErrorCode ierr;
1970   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1971   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1972   Mat_SeqAIJ     *subc;
1973   Mat_SubSppt    *smat;
1974 
1975   PetscFunctionBegin;
1976   /* Check for special case: each processor has a single IS */
1977   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1978     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1979     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
1980     PetscFunctionReturn(0);
1981   }
1982 
1983   /* Collect global wantallmatrix and nstages */
1984   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1985   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1986   if (!nmax) nmax = 1;
1987 
1988   if (scall == MAT_INITIAL_MATRIX) {
1989     /* Collect global wantallmatrix and nstages */
1990     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1991       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1992       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1993       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1994       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1995       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1996         wantallmatrix = PETSC_TRUE;
1997 
1998         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1999       }
2000     }
2001 
2002     /* Determine the number of stages through which submatrices are done
2003        Each stage will extract nmax submatrices.
2004        nmax is determined by the matrix column dimension.
2005        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2006     */
2007     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
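         /* e.g. with ismax = 5 and nmax = 2 this gives nstages = 3, extracting submatrices {0,1}, {2,3}, {4} */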
2008 
2009     in[0] = -1*(PetscInt)wantallmatrix;
2010     in[1] = nstages;
2011     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2012     wantallmatrix = (PetscBool)(-out[0]);
2013     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
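         /* The negation turns the MPI_MAX into a logical AND for wantallmatrix (every process must request
            the entire matrix), while the same reduction takes the global maximum of the local nstages */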
2014 
2015   } else { /* MAT_REUSE_MATRIX */
2016     if (ismax) {
2017       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2018       smat = subc->submatis1;
2019     } else { /* (*submat)[0] is a dummy matrix */
2020       smat = (Mat_SubSppt*)(*submat)[0]->data;
2021     }
2022     if (!smat) {
2023       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2024       wantallmatrix = PETSC_TRUE;
2025     } else if (smat->singleis) {
2026       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2027       PetscFunctionReturn(0);
2028     } else {
2029       nstages = smat->nstages;
2030     }
2031   }
2032 
2033   if (wantallmatrix) {
2034     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2035     PetscFunctionReturn(0);
2036   }
2037 
2038   /* Allocate memory to hold all the submatrices and dummy submatrices */
2039   if (scall == MAT_INITIAL_MATRIX) {
2040     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2041   }
2042 
2043   for (i=0,pos=0; i<nstages; i++) {
2044     if (pos+nmax <= ismax) max_no = nmax;
2045     else if (pos >= ismax) max_no = 0;
2046     else                   max_no = ismax-pos;
2047 
2048     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2049     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2050       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2051       smat->nstages = nstages;
2052     }
2053     pos += max_no;
2054   }
2055 
2056   if (ismax && scall == MAT_INITIAL_MATRIX) {
2057     /* save nstages for reuse */
2058     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2059     smat = subc->submatis1;
2060     smat->nstages = nstages;
2061   }
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 /* -------------------------------------------------------------------------*/
2066 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2067 {
2068   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2069   Mat            A  = c->A;
2070   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2071   const PetscInt **icol,**irow;
2072   PetscInt       *nrow,*ncol,start;
2073   PetscErrorCode ierr;
2074   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2075   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2076   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2077   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2078   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2079 #if defined(PETSC_USE_CTABLE)
2080   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2081 #else
2082   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2083 #endif
2084   const PetscInt *irow_i;
2085   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2086   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2087   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2088   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2089   MPI_Status     *r_status3,*r_status4,*s_status4;
2090   MPI_Comm       comm;
2091   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2092   PetscMPIInt    *onodes1,*olengths1,end;
2093   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2094   Mat_SubSppt    *smat_i;
2095   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2096   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2097 
2098   PetscFunctionBegin;
2099   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2100   size = c->size;
2101   rank = c->rank;
2102 
2103   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2104   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2105 
2106   for (i=0; i<ismax; i++) {
2107     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2108     if (!issorted[i]) iscsorted = issorted[i];
2109 
2110     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2111 
2112     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2113     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2114 
2115     /* Check for special case: allcolumn */
2116     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2117     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2118     if (colflag && ncol[i] == C->cmap->N) {
2119       allcolumns[i] = PETSC_TRUE;
2120       icol[i] = NULL;
2121     } else {
2122       allcolumns[i] = PETSC_FALSE;
2123       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2124     }
2125   }
2126 
2127   if (scall == MAT_REUSE_MATRIX) {
2128     /* Assumes the new rows are the same length as the old rows */
2129     for (i=0; i<ismax; i++) {
2130       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2131       subc = (Mat_SeqAIJ*)submats[i]->data;
2132       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2133 
2134       /* Initialize the matrix as if empty */
2135       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2136 
2137       smat_i   = subc->submatis1;
2138 
2139       nrqs        = smat_i->nrqs;
2140       nrqr        = smat_i->nrqr;
2141       rbuf1       = smat_i->rbuf1;
2142       rbuf2       = smat_i->rbuf2;
2143       rbuf3       = smat_i->rbuf3;
2144       req_source2 = smat_i->req_source2;
2145 
2146       sbuf1     = smat_i->sbuf1;
2147       sbuf2     = smat_i->sbuf2;
2148       ptr       = smat_i->ptr;
2149       tmp       = smat_i->tmp;
2150       ctr       = smat_i->ctr;
2151 
2152       pa          = smat_i->pa;
2153       req_size    = smat_i->req_size;
2154       req_source1 = smat_i->req_source1;
2155 
2156       allcolumns[i] = smat_i->allcolumns;
2157       row2proc[i]   = smat_i->row2proc;
2158       rmap[i]       = smat_i->rmap;
2159       cmap[i]       = smat_i->cmap;
2160     }
2161 
2162     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2163       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2164       smat_i = (Mat_SubSppt*)submats[0]->data;
2165 
2166       nrqs        = smat_i->nrqs;
2167       nrqr        = smat_i->nrqr;
2168       rbuf1       = smat_i->rbuf1;
2169       rbuf2       = smat_i->rbuf2;
2170       rbuf3       = smat_i->rbuf3;
2171       req_source2 = smat_i->req_source2;
2172 
2173       sbuf1       = smat_i->sbuf1;
2174       sbuf2       = smat_i->sbuf2;
2175       ptr         = smat_i->ptr;
2176       tmp         = smat_i->tmp;
2177       ctr         = smat_i->ctr;
2178 
2179       pa          = smat_i->pa;
2180       req_size    = smat_i->req_size;
2181       req_source1 = smat_i->req_source1;
2182 
2183       allcolumns[0] = PETSC_FALSE;
2184     }
2185   } else { /* scall == MAT_INITIAL_MATRIX */
2186     /* Get some new tags to keep the communication clean */
2187     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2188     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2189 
2190     /* Evaluate the communication: which processes to message, the length of each message, and the
2191      buffer space required. Based on this, buffers are allocated and data is copied into them */
2192     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2193 
2194     for (i=0; i<ismax; i++) {
2195       jmax   = nrow[i];
2196       irow_i = irow[i];
2197 
2198       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2199       row2proc[i] = row2proc_i;
2200 
2201       if (issorted[i]) proc = 0;
2202       for (j=0; j<jmax; j++) {
2203         if (!issorted[i]) proc = 0;
2204         row = irow_i[j];
2205         while (row >= C->rmap->range[proc+1]) proc++;
2206         w4[proc]++;
2207         row2proc_i[j] = proc; /* map row index to proc */
2208       }
2209       for (j=0; j<size; j++) {
2210         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2211       }
2212     }
2213 
2214     nrqs     = 0;              /* no of outgoing messages */
2215     msz      = 0;              /* total mesg length (for all procs) */
2216     w1[rank] = 0;              /* no mesg sent to self */
2217     w3[rank] = 0;
2218     for (i=0; i<size; i++) {
2219       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2220     }
2221     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc array) */
2222     for (i=0,j=0; i<size; i++) {
2223       if (w1[i]) { pa[j] = i; j++; }
2224     }
2225 
2226     /* Each message would have a header = 1 + 2*(no of IS) + data */
2227     for (i=0; i<nrqs; i++) {
2228       j      = pa[i];
2229       w1[j] += w2[j] + 2* w3[j];
2230       msz   += w1[j];
2231     }
2232     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2233 
2234     /* Determine the number of messages to expect, their lengths, and their source ranks (from-ids) */
2235     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2236     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2237 
2238     /* Now post the Irecvs corresponding to these messages */
2239     tag0 = ((PetscObject)C)->tag;
2240     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2241 
2242     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2243     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2244 
2245     /* Allocate Memory for outgoing messages */
2246     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2247     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2248     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2249 
2250     {
2251       PetscInt *iptr = tmp;
2252       k    = 0;
2253       for (i=0; i<nrqs; i++) {
2254         j        = pa[i];
2255         iptr    += k;
2256         sbuf1[j] = iptr;
2257         k        = w1[j];
2258       }
2259     }
2260 
2261     /* Form the outgoing messages. Initialize the header space */
2262     for (i=0; i<nrqs; i++) {
2263       j           = pa[i];
2264       sbuf1[j][0] = 0;
2265       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2266       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2267     }
2268 
2269     /* Parse the isrow and copy data into outbuf */
2270     for (i=0; i<ismax; i++) {
2271       row2proc_i = row2proc[i];
2272       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2273       irow_i = irow[i];
2274       jmax   = nrow[i];
2275       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2276         proc = row2proc_i[j];
2277         if (proc != rank) { /* copy to the outgoing buf*/
2278           ctr[proc]++;
2279           *ptr[proc] = irow_i[j];
2280           ptr[proc]++;
2281         }
2282       }
2283       /* Update the headers for the current IS */
2284       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2285         if ((ctr_j = ctr[j])) {
2286           sbuf1_j        = sbuf1[j];
2287           k              = ++sbuf1_j[0];
2288           sbuf1_j[2*k]   = ctr_j;
2289           sbuf1_j[2*k-1] = i;
2290         }
2291       }
2292     }
2293 
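         /* Each outgoing request sbuf1[j] now has the layout
            [ num of IS | (IS id, num of rows) for each IS | row indices grouped by IS ] */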
2294     /* Now post the sends */
2295     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2296     for (i=0; i<nrqs; ++i) {
2297       j    = pa[i];
2298       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2299     }
2300 
2301     /* Post Receives to capture the buffer size */
2302     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2303     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2304     rbuf2[0] = tmp + msz;
2305     for (i=1; i<nrqs; ++i) {
2306       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2307     }
2308     for (i=0; i<nrqs; ++i) {
2309       j    = pa[i];
2310       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2311     }
2312 
2313     /* Send to other procs the buf size they should allocate */
2314     /* Receive messages*/
2315     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2316     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2317     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2318     {
2319       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2320       PetscInt   *sbuf2_i;
2321 
2322       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2323       for (i=0; i<nrqr; ++i) {
2324         req_size[i] = 0;
2325         rbuf1_i        = rbuf1[i];
2326         start          = 2*rbuf1_i[0] + 1;
2327         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2328         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2329         sbuf2_i        = sbuf2[i];
2330         for (j=start; j<end; j++) {
2331           id              = rbuf1_i[j] - rstart;
2332           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2333           sbuf2_i[j]      = ncols;
2334           req_size[i] += ncols;
2335         }
2336         req_source1[i] = r_status1[i].MPI_SOURCE;
2337         /* form the header */
2338         sbuf2_i[0] = req_size[i];
2339         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2340 
2341         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2342       }
2343     }
2344     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2345     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2346     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2347 
2348     /* Receive messages*/
2349     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2350     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2351 
2352     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2353     for (i=0; i<nrqs; ++i) {
2354       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2355       req_source2[i] = r_status2[i].MPI_SOURCE;
2356       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2357     }
2358     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2359     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2360 
2361     /* Wait on sends1 and sends2 */
2362     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2363     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2364 
2365     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2366     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2367     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2368     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2369     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2370     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2371 
2372     /* Now allocate sending buffers for a->j, and send them off */
2373     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2374     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2375     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2376     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2377 
2378     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2379     {
2380       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2381       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2382       PetscInt cend = C->cmap->rend;
2383       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2384 
2385       for (i=0; i<nrqr; i++) {
2386         rbuf1_i   = rbuf1[i];
2387         sbuf_aj_i = sbuf_aj[i];
2388         ct1       = 2*rbuf1_i[0] + 1;
2389         ct2       = 0;
2390         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2391           kmax = rbuf1[i][2*j];
2392           for (k=0; k<kmax; k++,ct1++) {
2393             row    = rbuf1_i[ct1] - rstart;
2394             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2395             ncols  = nzA + nzB;
2396             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2397 
2398             /* load the column indices for this row into cols */
2399             cols = sbuf_aj_i + ct2;
2400 
2401             lwrite = 0;
2402             for (l=0; l<nzB; l++) {
2403               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2404             }
2405             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2406             for (l=0; l<nzB; l++) {
2407               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2408             }
2409 
2410             ct2 += ncols;
2411           }
2412         }
2413         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2414       }
2415     }
2416     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2417 
2418     /* create col map: global col of C -> local col of submatrices */
2419     {
2420       const PetscInt *icol_i;
2421 #if defined(PETSC_USE_CTABLE)
2422       for (i=0; i<ismax; i++) {
2423         if (!allcolumns[i]) {
2424           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2425 
2426           jmax   = ncol[i];
2427           icol_i = icol[i];
2428           cmap_i = cmap[i];
2429           for (j=0; j<jmax; j++) {
2430             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2431           }
2432         } else cmap[i] = NULL;
2433       }
2434 #else
2435       for (i=0; i<ismax; i++) {
2436         if (!allcolumns[i]) {
2437           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2438           jmax   = ncol[i];
2439           icol_i = icol[i];
2440           cmap_i = cmap[i];
2441           for (j=0; j<jmax; j++) {
2442             cmap_i[icol_i[j]] = j+1;
2443           }
2444         } else cmap[i] = NULL;
2445       }
2446 #endif
2447     }
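         /* cmap[i] stores (local column of submats[i])+1, so a lookup result of 0 means the column is not in iscol[i] */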
2448 
2449     /* Create lens which is required for MatCreate... */
2450     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2451     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2452 
2453     if (ismax) {
2454       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2455     }
2456     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2457 
2458     /* Update lens from local data */
2459     for (i=0; i<ismax; i++) {
2460       row2proc_i = row2proc[i];
2461       jmax = nrow[i];
2462       if (!allcolumns[i]) cmap_i = cmap[i];
2463       irow_i = irow[i];
2464       lens_i = lens[i];
2465       for (j=0; j<jmax; j++) {
2466         row = irow_i[j];
2467         proc = row2proc_i[j];
2468         if (proc == rank) {
2469           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2470           if (!allcolumns[i]) {
2471             for (k=0; k<ncols; k++) {
2472 #if defined(PETSC_USE_CTABLE)
2473               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2474 #else
2475               tcol = cmap_i[cols[k]];
2476 #endif
2477               if (tcol) lens_i[j]++;
2478             }
2479           } else { /* allcolumns */
2480             lens_i[j] = ncols;
2481           }
2482           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2483         }
2484       }
2485     }
2486 
2487     /* Create row map: global row of C -> local row of submatrices */
2488 #if defined(PETSC_USE_CTABLE)
2489     for (i=0; i<ismax; i++) {
2490       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2491       irow_i = irow[i];
2492       jmax   = nrow[i];
2493       for (j=0; j<jmax; j++) {
2494         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2495       }
2496     }
2497 #else
2498     for (i=0; i<ismax; i++) {
2499       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2500       rmap_i = rmap[i];
2501       irow_i = irow[i];
2502       jmax   = nrow[i];
2503       for (j=0; j<jmax; j++) {
2504         rmap_i[irow_i[j]] = j;
2505       }
2506     }
2507 #endif
2508 
2509     /* Update lens from offproc data */
2510     {
2511       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2512 
2513       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2514       for (tmp2=0; tmp2<nrqs; tmp2++) {
2515         sbuf1_i = sbuf1[pa[tmp2]];
2516         jmax    = sbuf1_i[0];
2517         ct1     = 2*jmax+1;
2518         ct2     = 0;
2519         rbuf2_i = rbuf2[tmp2];
2520         rbuf3_i = rbuf3[tmp2];
2521         for (j=1; j<=jmax; j++) {
2522           is_no  = sbuf1_i[2*j-1];
2523           max1   = sbuf1_i[2*j];
2524           lens_i = lens[is_no];
2525           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2526           rmap_i = rmap[is_no];
2527           for (k=0; k<max1; k++,ct1++) {
2528 #if defined(PETSC_USE_CTABLE)
2529             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2530             row--;
2531             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2532 #else
2533             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2534 #endif
2535             max2 = rbuf2_i[ct1];
2536             for (l=0; l<max2; l++,ct2++) {
2537               if (!allcolumns[is_no]) {
2538 #if defined(PETSC_USE_CTABLE)
2539                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2540 #else
2541                 tcol = cmap_i[rbuf3_i[ct2]];
2542 #endif
2543                 if (tcol) lens_i[row]++;
2544               } else { /* allcolumns */
2545                 lens_i[row]++; /* lens_i[row] += max2 ? */
2546               }
2547             }
2548           }
2549         }
2550       }
2551     }
2552     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2553     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2554     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2555     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2556 
2557     /* Create the submatrices */
2558     for (i=0; i<ismax; i++) {
2559       PetscInt    rbs,cbs;
2560 
2561       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2562       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2563 
2564       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2565       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2566 
2567       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2568       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2569       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2570 
2571       /* create struct Mat_SubSppt and attach it to submat */
2572       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2573       subc = (Mat_SeqAIJ*)submats[i]->data;
2574       subc->submatis1 = smat_i;
2575 
2576       smat_i->destroy          = submats[i]->ops->destroy;
2577       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2578       submats[i]->factortype   = C->factortype;
2579 
2580       smat_i->id          = i;
2581       smat_i->nrqs        = nrqs;
2582       smat_i->nrqr        = nrqr;
2583       smat_i->rbuf1       = rbuf1;
2584       smat_i->rbuf2       = rbuf2;
2585       smat_i->rbuf3       = rbuf3;
2586       smat_i->sbuf2       = sbuf2;
2587       smat_i->req_source2 = req_source2;
2588 
2589       smat_i->sbuf1       = sbuf1;
2590       smat_i->ptr         = ptr;
2591       smat_i->tmp         = tmp;
2592       smat_i->ctr         = ctr;
2593 
2594       smat_i->pa           = pa;
2595       smat_i->req_size     = req_size;
2596       smat_i->req_source1  = req_source1;
2597 
2598       smat_i->allcolumns  = allcolumns[i];
2599       smat_i->singleis    = PETSC_FALSE;
2600       smat_i->row2proc    = row2proc[i];
2601       smat_i->rmap        = rmap[i];
2602       smat_i->cmap        = cmap[i];
2603     }
2604 
2605     if (!ismax) { /* Create a dummy submats[0] that carries struct Mat_SubSppt for reuse */
2606       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2607       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2608       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2609 
2610       /* create struct Mat_SubSppt and attach it to submat */
2611       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2612       submats[0]->data = (void*)smat_i;
2613 
2614       smat_i->destroy          = submats[0]->ops->destroy;
2615       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2616       submats[0]->factortype   = C->factortype;
2617 
2618       smat_i->id          = 0;
2619       smat_i->nrqs        = nrqs;
2620       smat_i->nrqr        = nrqr;
2621       smat_i->rbuf1       = rbuf1;
2622       smat_i->rbuf2       = rbuf2;
2623       smat_i->rbuf3       = rbuf3;
2624       smat_i->sbuf2       = sbuf2;
2625       smat_i->req_source2 = req_source2;
2626 
2627       smat_i->sbuf1       = sbuf1;
2628       smat_i->ptr         = ptr;
2629       smat_i->tmp         = tmp;
2630       smat_i->ctr         = ctr;
2631 
2632       smat_i->pa           = pa;
2633       smat_i->req_size     = req_size;
2634       smat_i->req_source1  = req_source1;
2635 
2636       smat_i->allcolumns  = PETSC_FALSE;
2637       smat_i->singleis    = PETSC_FALSE;
2638       smat_i->row2proc    = NULL;
2639       smat_i->rmap        = NULL;
2640       smat_i->cmap        = NULL;
2641     }
2642 
2643     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2644     ierr = PetscFree(lens);CHKERRQ(ierr);
2645     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2646     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2647 
2648   } /* end of scall == MAT_INITIAL_MATRIX */
2649 
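       /* Both the initial and reuse paths exchange the numerical values below (tag4) and write them
        directly into the Mat_SeqAIJ arrays (i, j, a, ilen) of each submatrix */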
2650   /* Post recv matrix values */
2651   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2652   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2653   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2654   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2655   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2656   for (i=0; i<nrqs; ++i) {
2657     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2658     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2659   }
2660 
2661   /* Allocate sending buffers for a->a, and send them off */
2662   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2663   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2664   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2665   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2666 
2667   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2668   {
2669     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2670     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2671     PetscInt    cend   = C->cmap->rend;
2672     PetscInt    *b_j   = b->j;
2673     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2674 
2675     for (i=0; i<nrqr; i++) {
2676       rbuf1_i   = rbuf1[i];
2677       sbuf_aa_i = sbuf_aa[i];
2678       ct1       = 2*rbuf1_i[0]+1;
2679       ct2       = 0;
2680       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2681         kmax = rbuf1_i[2*j];
2682         for (k=0; k<kmax; k++,ct1++) {
2683           row    = rbuf1_i[ct1] - rstart;
2684           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2685           ncols  = nzA + nzB;
2686           cworkB = b_j + b_i[row];
2687           vworkA = a_a + a_i[row];
2688           vworkB = b_a + b_i[row];
2689 
2690           /* load the column values for this row into vals*/
2691           vals = sbuf_aa_i+ct2;
2692 
2693           lwrite = 0;
2694           for (l=0; l<nzB; l++) {
2695             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2696           }
2697           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2698           for (l=0; l<nzB; l++) {
2699             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2700           }
2701 
2702           ct2 += ncols;
2703         }
2704       }
2705       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2706     }
2707   }
2708 
2709   /* Assemble the matrices */
2710   /* First assemble the local rows */
2711   for (i=0; i<ismax; i++) {
2712     row2proc_i = row2proc[i];
2713     subc      = (Mat_SeqAIJ*)submats[i]->data;
2714     imat_ilen = subc->ilen;
2715     imat_j    = subc->j;
2716     imat_i    = subc->i;
2717     imat_a    = subc->a;
2718 
2719     if (!allcolumns[i]) cmap_i = cmap[i];
2720     rmap_i = rmap[i];
2721     irow_i = irow[i];
2722     jmax   = nrow[i];
2723     for (j=0; j<jmax; j++) {
2724       row  = irow_i[j];
2725       proc = row2proc_i[j];
2726       if (proc == rank) {
2727         old_row = row;
2728 #if defined(PETSC_USE_CTABLE)
2729         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2730         row--;
2731 #else
2732         row = rmap_i[row];
2733 #endif
2734         ilen_row = imat_ilen[row];
2735         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2736         mat_i    = imat_i[row];
2737         mat_a    = imat_a + mat_i;
2738         mat_j    = imat_j + mat_i;
2739         if (!allcolumns[i]) {
2740           for (k=0; k<ncols; k++) {
2741 #if defined(PETSC_USE_CTABLE)
2742             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2743 #else
2744             tcol = cmap_i[cols[k]];
2745 #endif
2746             if (tcol) {
2747               *mat_j++ = tcol - 1;
2748               *mat_a++ = vals[k];
2749               ilen_row++;
2750             }
2751           }
2752         } else { /* allcolumns */
2753           for (k=0; k<ncols; k++) {
2754             *mat_j++ = cols[k];  /* global col index! */
2755             *mat_a++ = vals[k];
2756             ilen_row++;
2757           }
2758         }
2759         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2760 
2761         imat_ilen[row] = ilen_row;
2762       }
2763     }
2764   }
2765 
2766   /* Now assemble the off proc rows */
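  /* Wait for the numerical values (tag4) requested from other ranks, then walk the original
     requests recorded in sbuf1: rbuf2 holds the number of columns per requested row, rbuf3 the
     column indices and rbuf4 the values, which are inserted into the appropriate submatrix rows. */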
2767   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2768   for (tmp2=0; tmp2<nrqs; tmp2++) {
2769     sbuf1_i = sbuf1[pa[tmp2]];
2770     jmax    = sbuf1_i[0];
2771     ct1     = 2*jmax + 1;
2772     ct2     = 0;
2773     rbuf2_i = rbuf2[tmp2];
2774     rbuf3_i = rbuf3[tmp2];
2775     rbuf4_i = rbuf4[tmp2];
2776     for (j=1; j<=jmax; j++) {
2777       is_no     = sbuf1_i[2*j-1];
2778       rmap_i    = rmap[is_no];
2779       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2780       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2781       imat_ilen = subc->ilen;
2782       imat_j    = subc->j;
2783       imat_i    = subc->i;
2784       imat_a    = subc->a;
2785       max1      = sbuf1_i[2*j];
2786       for (k=0; k<max1; k++,ct1++) {
2787         row = sbuf1_i[ct1];
2788 #if defined(PETSC_USE_CTABLE)
2789         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2790         row--;
2791 #else
2792         row = rmap_i[row];
2793 #endif
2794         ilen  = imat_ilen[row];
2795         mat_i = imat_i[row];
2796         mat_a = imat_a + mat_i;
2797         mat_j = imat_j + mat_i;
2798         max2  = rbuf2_i[ct1];
2799         if (!allcolumns[is_no]) {
2800           for (l=0; l<max2; l++,ct2++) {
2801 #if defined(PETSC_USE_CTABLE)
2802             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2803 #else
2804             tcol = cmap_i[rbuf3_i[ct2]];
2805 #endif
2806             if (tcol) {
2807               *mat_j++ = tcol - 1;
2808               *mat_a++ = rbuf4_i[ct2];
2809               ilen++;
2810             }
2811           }
2812         } else { /* allcolumns */
2813           for (l=0; l<max2; l++,ct2++) {
2814             *mat_j++ = rbuf3_i[ct2]; /* keep C's global column index */
2815             *mat_a++ = rbuf4_i[ct2];
2816             ilen++;
2817           }
2818         }
2819         imat_ilen[row] = ilen;
2820       }
2821     }
2822   }
2823 
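  /* If any of the column ISs was unsorted, entries may have been inserted out of order above;
     sort each row's column indices together with the matching values (rows extracted with
     allcolumns[i] are skipped). */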
2824   if (!iscsorted) { /* sort column indices of the rows */
2825     for (i=0; i<ismax; i++) {
2826       subc      = (Mat_SeqAIJ*)submats[i]->data;
2827       imat_j    = subc->j;
2828       imat_i    = subc->i;
2829       imat_a    = subc->a;
2830       imat_ilen = subc->ilen;
2831 
2832       if (allcolumns[i]) continue;
2833       jmax = nrow[i];
2834       for (j=0; j<jmax; j++) {
2835         mat_i = imat_i[j];
2836         mat_a = imat_a + mat_i;
2837         mat_j = imat_j + mat_i;
2838         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2839       }
2840     }
2841   }
2842 
2843   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2844   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2845   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2846   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2847   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2848 
2849   /* Restore the indices */
2850   for (i=0; i<ismax; i++) {
2851     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2852     if (!allcolumns[i]) {
2853       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2854     }
2855   }
2856 
2857   for (i=0; i<ismax; i++) {
2858     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2859     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2860   }
2861 
2862   /* Destroy allocated memory */
2863   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2864   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2865   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2866 
2867   for (i=0; i<nrqs; ++i) {
2868     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2869   }
2870   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2871 
2872   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2873   PetscFunctionReturn(0);
2874 }
2875 
2876 /*
2877  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2878  Embeddings are expected to be injections, and the above implies that the range of rowemb is a subset
2879  of [0,m), that of dcolemb a subset of [0,n), and that of ocolemb a subset of [0,N-n).
2880  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2881  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2882  state, and needs to be "assembled" later by compressing B's column space.
2883 
2884  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2885  Following this call, C's diagonal and off-diagonal blocks (aij->A & aij->B) have been created, even if empty.
2886  */
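/*
   A minimal calling sketch (illustrative only): this mirrors how setseqmats() is invoked from
   MatCreateSubMatricesMPI_MPIXAIJ() below, with isrow_p, iscol_p and ciscol_p standing for the
   permutation ISs produced there by ISSortPermutation():

     ierr = MatSetSeqMats_MPIAIJ(C,isrow_p,iscol_p,ciscol_p,DIFFERENT_NONZERO_PATTERN,A,B);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/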
2887 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2888 {
2889   /* If making this function public, change the error returned in this function away from _PLIB. */
2890   PetscErrorCode ierr;
2891   Mat_MPIAIJ     *aij;
2892   Mat_SeqAIJ     *Baij;
2893   PetscBool      seqaij,Bdisassembled;
2894   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2895   PetscScalar    v;
2896   const PetscInt *rowindices,*colindices;
2897 
2898   PetscFunctionBegin;
2899   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2900   if (A) {
2901     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2902     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2903     if (rowemb) {
2904       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2905       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2906     } else {
2907       if (C->rmap->n != A->rmap->n) {
2908 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2909       }
2910     }
2911     if (dcolemb) {
2912       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2913       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2914     } else {
2915       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2916     }
2917   }
2918   if (B) {
2919     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2920     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2921     if (rowemb) {
2922       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2923       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2924     } else {
2925       if (C->rmap->n != B->rmap->n) {
2926 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2927       }
2928     }
2929     if (ocolemb) {
2930       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2931       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2932     } else {
2933       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2934     }
2935   }
2936 
2937   aij = (Mat_MPIAIJ*)C->data;
2938   if (!aij->A) {
2939     /* Mimic parts of MatMPIAIJSetPreallocation() */
2940     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2941     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2942     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2943     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2944     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2945   }
2946   if (A) {
2947     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2948   } else {
2949     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2950   }
2951   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2952     /*
2953       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2954       need to "disassemble" B -- convert it to using C's global indices.
2955       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2956 
2957       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2958       we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2959 
2960       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2961       At least avoid calling MatSetValues() and the implied searches?
2962     */
2963 
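    /* For DIFFERENT_NONZERO_PATTERN, throw away the column map and the off-diagonal communication
       data (colmap, garray, lvec, Mvctx); they are rebuilt when C is assembled again. */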
2964     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2965 #if defined(PETSC_USE_CTABLE)
2966       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2967 #else
2968       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2969       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2970       if (aij->B) {
2971         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2972       }
2973 #endif
2974       ngcol = 0;
2975       if (aij->lvec) {
2976 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2977       }
2978       if (aij->garray) {
2979 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2980 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2981       }
2982       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2983       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2984     }
2985     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2986       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2987     }
2988     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2989       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2990     }
2991   }
2992   Bdisassembled = PETSC_FALSE;
2993   if (!aij->B) {
2994     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2995     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2996     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2997     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2998     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2999     Bdisassembled = PETSC_TRUE;
3000   }
3001   if (B) {
3002     Baij = (Mat_SeqAIJ*)B->data;
3003     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3004       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3005       for (i=0; i<B->rmap->n; i++) {
3006 	nz[i] = Baij->i[i+1] - Baij->i[i];
3007       }
3008       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3009       ierr = PetscFree(nz);CHKERRQ(ierr);
3010     }
3011 
3012     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3013     shift = rend-rstart;
3014     count = 0;
3015     rowindices = NULL;
3016     colindices = NULL;
3017     if (rowemb) {
3018       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3019     }
3020     if (ocolemb) {
3021       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3022     }
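    /* Copy B's entries one at a time with MatSetValues(), mapping rows through rowemb and columns
       through ocolemb when those embeddings are given.  If aij->B was freshly created above
       (Bdisassembled), column indices at or above rstart are additionally shifted by rend-rstart
       so that the entries land in C's global column space. */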
3023     for (i=0; i<B->rmap->n; i++) {
3024       PetscInt row;
3025       row = i;
3026       if (rowindices) row = rowindices[i];
3027       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3028 	col  = Baij->j[count];
3029 	if (colindices) col = colindices[col];
3030 	if (Bdisassembled && col>=rstart) col += shift;
3031 	v    = Baij->a[count];
3032 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3033 	++count;
3034       }
3035     }
3036     /* No assembly for aij->B is necessary. */
3037     /* FIXME: set aij->B's nonzerostate correctly. */
3038   } else {
3039     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3040   }
3041   C->preallocated  = PETSC_TRUE;
3042   C->was_assembled = PETSC_FALSE;
3043   C->assembled     = PETSC_FALSE;
3044    /*
3045       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3046       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3047    */
3048   PetscFunctionReturn(0);
3049 }
3050 
3051 /*
3052   B uses local column indices in the range [0, N-n); they must be interpreted using garray.
3053  */
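/*
   For example (a sketch): after
     ierr = MatGetSeqMats_MPIAIJ(C,&A,&B);CHKERRQ(ierr);
   a local column index bj of B refers to the global column
     ((Mat_MPIAIJ*)C->data)->garray[bj]
   of C.
*/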
3054 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3055 {
3056   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3057 
3058   PetscFunctionBegin;
3059   PetscValidPointer(A,2);
3060   PetscValidPointer(B,3);
3061   /* FIXME: make sure C is assembled */
3062   *A = aij->A;
3063   *B = aij->B;
3064   /* Note that we don't incref *A and *B, so be careful! */
3065   PetscFunctionReturn(0);
3066 }
3067 
3068 /*
3069   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3070   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3071 */
3072 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3073                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3074 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3075 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3076 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3077 {
3078   PetscErrorCode ierr;
3079   PetscMPIInt    isize,flag;
3080   PetscInt       i,ii,cismax,ispar;
3081   Mat            *A,*B;
3082   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3083 
3084   PetscFunctionBegin;
3085   if (!ismax) PetscFunctionReturn(0);
3086 
3087   for (i = 0, cismax = 0; i < ismax; ++i) {
3089     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3090     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3091     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3092     if (isize > 1) ++cismax;
3093   }
3094 
3095   /*
3096      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3097      ispar is nonzero iff at least one rank has a parallel IS (it is the max of cismax over C's comm).
3098   */
3099   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3100   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3101     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3102     PetscFunctionReturn(0);
3103   }
3104 
3105   /* if (ispar) */
3106   /*
3107     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3108     These are used to extract the off-diag portion of the resulting parallel matrix.
3109     The row IS for the off-diag portion is the same as for the diag portion,
3110     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3111   */
3112   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3113   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3114   for (i = 0, ii = 0; i < ismax; ++i) {
3115     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3116     if (isize > 1) {
3117       /*
3118 	 TODO: This is the part that's ***NOT SCALABLE***.
3119 	 To fix this we need to extract just the indices of C's nonzero columns
3120 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3121 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3122 	 to be done without serializing on the IS list, so, most likely, it is best
3123 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3124       */
3125       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3126       /* Now we have to
3127 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3128 	     were sorted on each rank, concatenated they might no longer be sorted;
3129 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3130 	     indices in the nondecreasing order to the original index positions.
3131 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3132       */
3133       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3134       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3135       ++ii;
3136     }
3137   }
3138   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3139   for (i = 0, ii = 0; i < ismax; ++i) {
3140     PetscInt       j,issize;
3141     const PetscInt *indices;
3142 
3143     /*
3144        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3145      */
3146     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3147     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3148     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3149     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3150     for (j = 1; j < issize; ++j) {
3151       if (indices[j] == indices[j-1]) {
3152 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3153       }
3154     }
3155     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3156 
3157 
3158     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3159     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3160     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3161     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3162     for (j = 1; j < issize; ++j) {
3163       if (indices[j-1] == indices[j]) {
3164 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3165       }
3166     }
3167     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3168     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3169     if (isize > 1) {
3170       cisrow[ii] = isrow[i];
3171       ++ii;
3172     }
3173   }
3174   /*
3175     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3176     array of sequential matrices underlying the resulting parallel matrices.
3177     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3178     contain duplicates.
3179 
3180     There are as many diag matrices as there are original index sets. There are only as many parallel
3181     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3182 
3183     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3184     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3185       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3186       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3187       with A[i] and B[ii] extracted from the corresponding MPI submat.
3188     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3189       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3190       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3191       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3192       values into A[i] and B[ii] sitting inside the corresponding submat.
3193     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3194       A[i], B[ii], AA[i] or BB[ii] matrices.
3195   */
3196   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3197   if (scall == MAT_INITIAL_MATRIX) {
3198     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3199   }
3200 
3201   /* Now obtain the sequential A and B submatrices separately. */
3202   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3203   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3204   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3205 
3206   /*
3207     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3208     matrices A & B have been extracted directly into the parallel matrices containing them, or
3209     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3210     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3211     to have the same sparsity pattern.
3212     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3213     must be constructed for C. This is done by setseqmat(s).
3214   */
3215   for (i = 0, ii = 0; i < ismax; ++i) {
3216     /*
3217        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3218        That way we can avoid sorting and computing permutations when reusing.
3219        To this end:
3220         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3221 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3222 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3223     */
3224     MatStructure pattern;
3225     pattern = DIFFERENT_NONZERO_PATTERN;
3226 
3227     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3228     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3229     if (isize > 1) {
3230       if (scall == MAT_INITIAL_MATRIX) {
3231 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3232 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3233 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3234 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3235 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3236       }
3237       /*
3238 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3239       */
3240       {
3241 	Mat AA,BB;
3242         AA = A[i];
3243         BB = B[ii];
3244 	if (AA || BB) {
3245 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3246 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3247 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3248 	}
3249 
3250         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3251       }
3252       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3253       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3254       ++ii;
3255     } else { /* if (isize == 1) */
3256       if (scall == MAT_REUSE_MATRIX) {
3257         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3258       }
3259       if (isrow_p[i] || iscol_p[i]) {
3260         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3261         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3262 	/* Otherwise A is extracted straight into (*submat)[i]. */
3263 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3264 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3265       } else (*submat)[i] = A[i];
3266     }
3267     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3268     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3269   }
3270   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3271   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3272   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3273   ierr = PetscFree(A);CHKERRQ(ierr);
3274   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3275   PetscFunctionReturn(0);
3276 }
3277 
3278 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3279 {
3280   PetscErrorCode ierr;
3281 
3282   PetscFunctionBegin;
3283   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3284   PetscFunctionReturn(0);
3285 }
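
/*
   Illustrative only: the usual way to reach MatCreateSubMatricesMPI_MPIAIJ() is through the
   public interface, e.g. (assuming C is an assembled MATMPIAIJ matrix and isrow/iscol may live
   on sub-communicators of C's communicator):

     Mat *submat;
     ierr = MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ierr = MatDestroySubMatrices(1,&submat);CHKERRQ(ierr);
*/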
3286