xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 54113e6c510133c72022bf1fcc97d9553d75412e)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
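/*
  A minimal user-level usage sketch (illustrative only; the call site below is an
  assumption, not part of this file). The routine above is reached through the
  public MatIncreaseOverlap() interface, typically before extracting overlapping
  submatrices, e.g. for additive Schwarz type preconditioners:

     Mat C;                                  assembled MPIAIJ matrix
     IS  is[1];                              index set(s) to be enlarged
     Mat *submats;
     ierr = MatIncreaseOverlap(C,1,is,2);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,is,is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
*/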
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
56 {
57   PetscErrorCode   ierr;
58   MPI_Comm         comm;
59   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
60   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
61   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
62   const PetscInt  *indices_i,**indices;
63   PetscLayout      rmap;
64   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
65   PetscSF          sf;
66   PetscSFNode     *remote;
67 
68   PetscFunctionBegin;
69   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
70   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
71   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
72   /* get row map to determine where rows should be going */
73   ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
74   /* retrieve the IS data and put it all together so that we
75    * can optimize communication
76    */
77   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
78   for (i=0,tlength=0; i<nidx; i++){
79     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
80     tlength += length[i];
81     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
82   }
83   /* find these rows on remote processors */
84   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
85   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
86   nrrows = 0;
87   for (i=0; i<nidx; i++){
88     length_i     = length[i];
89     indices_i    = indices[i];
90     for (j=0; j<length_i; j++){
91       owner = -1;
92       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
93       /* remote processors */
94       if (owner != rank){
95         tosizes_temp[owner]++; /* number of rows to owner */
96         rrow_ranks[nrrows]  = owner; /* processor */
97         rrow_isids[nrrows]   = i; /* is id */
98         remoterows[nrrows++] = indices_i[j]; /* row */
99       }
100     }
101     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
102   }
103   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
104   /* test if we need to exchange messages;
105    * generally speaking, no exchange is needed when the index sets
106    * reference only locally owned rows (often the case for overlap 1)
107    */
108   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
109   /* no processor references remote rows, so there are no messages to exchange;
110    * this usually corresponds to overlap 1
111    */
112   if (!reducednrrows){
113     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
114     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
115     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118   nto = 0;
119   /* send sizes and ranks for building a two-sided communication */
120   for (i=0; i<size; i++){
121    if (tosizes_temp[i]){
122      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
123      tosizes_temp[i] = nto; /* a map from processor to index */
124      toranks[nto++]  = i; /* processor */
125    }
126   }
127   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
128   for (i=0; i<nto; i++){
129     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
130     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
131   }
132   /* send information to other processors */
133   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
134   nrecvrows = 0;
135   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
136   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
137   nrecvrows = 0;
138   for (i=0; i<nfrom; i++){
139     for (j=0; j<fromsizes[2*i]; j++){
140       remote[nrecvrows].rank    = fromranks[i];
141       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
142     }
143   }
144   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
145   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
146   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
147   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
148   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
149   /* message pair <no of is, row>  */
150   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
151   for (i=0; i<nrrows; i++){
152     owner = rrow_ranks[i]; /* processor */
153     j     = tosizes_temp[owner]; /* index */
154     todata[toffsets[j]++] = rrow_isids[i];
155     todata[toffsets[j]++] = remoterows[i];
156   }
157   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
158   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
159   ierr = PetscFree(toffsets);CHKERRQ(ierr);
160   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
162   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
163   /* send rows owned by remote processors so that we can get the overlapping data back */
164   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
165   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
166   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
167   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
168   ierr = PetscFree(fromranks);CHKERRQ(ierr);
169   nrecvrows = 0;
170   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
171   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
172   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
173   nrecvrows = 0;
174   for (i=0; i<nto; i++){
175     for (j=0; j<tosizes[2*i]; j++){
176       remote[nrecvrows].rank    = toranks[i];
177       remote[nrecvrows++].index = tosizes[2*i+1]+j;
178     }
179   }
180   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
181   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
182   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
183   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
184   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
185   /* overlap communication and computation */
186   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
187   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
188   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
189   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
190   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
191   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
192   ierr = PetscFree(toranks);CHKERRQ(ierr);
193   ierr = PetscFree(tosizes);CHKERRQ(ierr);
194   ierr = PetscFree(todata);CHKERRQ(ierr);
195   PetscFunctionReturn(0);
196 }
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
201 {
202   PetscInt         *isz,isz_i,i,j,is_id, data_size;
203   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
204   const PetscInt   *indices_i_temp;
205   PetscErrorCode    ierr;
206 
207   PetscFunctionBegin;
208   max_lsize = 0;
209   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
212     max_lsize = lsize>max_lsize ? lsize:max_lsize;
213     isz[i]    = lsize;
214   }
215   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
216   for (i=0; i<nidx; i++){
217     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
218     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
219     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
220     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
221   }
222   /* retrieve information to get row id and its overlap */
223   for (i=0; i<nrecvs; ){
224     is_id      = recvdata[i++];
225     data_size  = recvdata[i++];
226     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
227     isz_i      = isz[is_id];
228     for (j=0; j< data_size; j++){
229       col = recvdata[i++];
230       indices_i[isz_i++] = col;
231     }
232     isz[is_id] = isz_i;
233   }
234   /* remove duplicate entries */
235   for (i=0; i<nidx; i++){
236     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
237     isz_i      = isz[i];
238     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
239     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
240   }
241   ierr = PetscFree(isz);CHKERRQ(ierr);
242   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
249 {
250   PetscLayout       rmap,cmap;
251   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
252   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
253   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
254   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
255   Mat               amat,bmat;
256   PetscMPIInt       rank;
257   PetscBool         done;
258   MPI_Comm          comm;
259   PetscErrorCode    ierr;
260 
261   PetscFunctionBegin;
262   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
263   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
264   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
265   /* Even if the matrix is symmetric, we do not exploit the symmetry here */
266   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
267   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
268   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
269   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
270   /* total number of nonzero values is used to estimate the memory usage in the next step */
271   tnz  = ai[an]+bi[bn];
272   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
273   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
274   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
275   /* to find the longest message */
276   max_fszs = 0;
277   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
278   /* is there a better way to estimate the number of nonzeros in the matrix? */
279   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
280   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
281   rows_pos  = 0;
282   totalrows = 0;
283   for (i=0; i<nfrom; i++){
284     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
285     /* group data together */
286     for (j=0; j<fromsizes[2*i]; j+=2){
287       is_id                       = fromrows[rows_pos++];/* no of is */
288       rows_i                      = rows_data[is_id];
289       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
290     }
291     /* estimate the space needed to avoid multiple allocations */
292     for (j=0; j<nidx; j++){
293       indvc_ij = 0;
294       rows_i   = rows_data[j];
295       for (l=0; l<rows_pos_i[j]; l++){
296         row    = rows_i[l]-rstart;
297         start  = ai[row];
298         end    = ai[row+1];
299         for (k=start; k<end; k++){ /* Amat */
300           col = aj[k] + cstart;
301           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
302         }
303         start = bi[row];
304         end   = bi[row+1];
305         for (k=start; k<end; k++) { /* Bmat */
306           col = gcols[bj[k]];
307           indices_tmp[indvc_ij++] = col;
308         }
309       }
310       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
311       indv_counts[i*nidx+j] = indvc_ij;
312       totalrows            += indvc_ij;
313     }
314   }
315   /* message triple <no of is, number of rows, rows> */
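  /* Illustrative example (hypothetical data): a reply block for index set 0
     containing the 3 columns {4,8,9} is stored as  { 0, 3, 4, 8, 9 }  */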
316   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
317   totalrows = 0;
318   rows_pos  = 0;
319   /* regroup the incoming rows by index set, as above */
320   for (i=0;i<nfrom;i++){
321     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
322     for (j=0; j<fromsizes[2*i]; j+=2){
323       is_id                       = fromrows[rows_pos++];
324       rows_i                      = rows_data[is_id];
325       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
326     }
327     /* add data  */
328     for (j=0; j<nidx; j++){
329       if (!indv_counts[i*nidx+j]) continue;
330       indvc_ij = 0;
331       sbdata[totalrows++] = j;
332       sbdata[totalrows++] = indv_counts[i*nidx+j];
333       sbsizes[2*i]       += 2;
334       rows_i              = rows_data[j];
335       for (l=0; l<rows_pos_i[j]; l++){
336         row   = rows_i[l]-rstart;
337         start = ai[row];
338         end   = ai[row+1];
339         for (k=start; k<end; k++){ /* Amat */
340           col = aj[k] + cstart;
341           indices_tmp[indvc_ij++] = col;
342         }
343         start = bi[row];
344         end   = bi[row+1];
345         for (k=start; k<end; k++) { /* Bmat */
346           col = gcols[bj[k]];
347           indices_tmp[indvc_ij++] = col;
348         }
349       }
350       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
351       sbsizes[2*i]  += indvc_ij;
352       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
353       totalrows += indvc_ij;
354     }
355   }
356   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
357   for (i=0; i<nfrom; i++){
358     offsets[i+1]   = offsets[i] + sbsizes[2*i];
359     sbsizes[2*i+1] = offsets[i];
360   }
361   ierr = PetscFree(offsets);CHKERRQ(ierr);
362   if (sbrowsizes) *sbrowsizes = sbsizes;
363   if (sbrows) *sbrows = sbdata;
364   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
365   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
366   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
367   PetscFunctionReturn(0);
368 }
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
373 {
374   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
375   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
376   PetscInt          lsize,lsize_tmp,owner;
377   PetscMPIInt       rank;
378   Mat               amat,bmat;
379   PetscBool         done;
380   PetscLayout       cmap,rmap;
381   MPI_Comm          comm;
382   PetscErrorCode    ierr;
383 
384   PetscFunctionBegin;
385   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
386   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
387   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
388   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
389   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
390   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
391   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
392   /* is this a safe way to compute the number of nonzero values? */
393   tnz  = ai[an]+bi[bn];
394   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
395   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
397   /* this estimates memory better than the old implementation,
398    * which used the global size of the matrix
399    */
400   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
401   for (i=0; i<nidx; i++) {
402     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
403     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
404     lsize_tmp = 0;
405     for (j=0; j<lsize; j++) {
406       owner = -1;
407       row   = indices[j];
408       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
409       if (owner != rank) continue;
410       /* local number */
411       row  -= rstart;
412       start = ai[row];
413       end   = ai[row+1];
414       for (k=start; k<end; k++) { /* Amat */
415         col = aj[k] + cstart;
416         indices_temp[lsize_tmp++] = col;
417       }
418       start = bi[row];
419       end   = bi[row+1];
420       for (k=start; k<end; k++) { /* Bmat */
421         col = gcols[bj[k]];
422         indices_temp[lsize_tmp++] = col;
423       }
424     }
425    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
426    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
427    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
428    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
429   }
430   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
431   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 
437 /*
438   Sample message format:
439   If processor A wants processor B to process some rows corresponding
440   to index sets is[1] and is[5], the message is laid out as follows
441   mesg[0] = 2   (number of index sets in the message)
442   -----------
443   mesg[1] = 1   => is[1]
444   mesg[2] = number of rows of is[1] in this message
445   -----------
446   mesg[3] = 5   => is[5]
447   mesg[4] = number of rows of is[5] in this message
448   -----------
449   mesg[5] ...
450   mesg[n]       rows requested from is[1]
451   -----------
452   mesg[n+1] ...
453   mesg[m]       rows requested from is[5]
454   -----------
455 
456   Notes:
457   nrqs - number of requests sent (or to be sent out)
458   nrqr - number of requests received (which have to be, or have been, processed)
459 */
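/*
  Illustrative example (hypothetical data): if processor A asks processor B for
  rows {3,7} of is[1] and rows {10,11,12} of is[5], the outgoing buffer is

    mesg[] = { 2,   1, 2,   5, 3,   3, 7,   10, 11, 12 }

  i.e. 2 index sets; header pair <1,2> (is[1], 2 rows); header pair <5,3>
  (is[5], 3 rows); then the rows requested from is[1] followed by those from is[5].
*/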
460 #undef __FUNCT__
461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
463 {
464   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
465   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
466   const PetscInt **idx,*idx_i;
467   PetscInt       *n,**data,len;
468 #if defined(PETSC_USE_CTABLE)
469   PetscTable     *table_data,table_data_i;
470   PetscInt       *tdata,tcount,tcount_max;
471 #else
472   PetscInt       *data_i,*d_p;
473 #endif
474   PetscErrorCode ierr;
475   PetscMPIInt    size,rank,tag1,tag2;
476   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
477   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
478   PetscBT        *table;
479   MPI_Comm       comm;
480   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
481   MPI_Status     *s_status,*recv_status;
482   char           *t_p;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
486   size = c->size;
487   rank = c->rank;
488   M    = C->rmap->N;
489 
490   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
491   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
492 
493   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
494 
495   for (i=0; i<imax; i++) {
496     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
497     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
498   }
499 
500   /* evaluate communication - mesg to who, length of mesg, and buffer space
501      required. Based on this, buffers are allocated, and data copied into them  */
502   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
503   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
504   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
505   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
506   for (i=0; i<imax; i++) {
507     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
508     idx_i = idx[i];
509     len   = n[i];
510     for (j=0; j<len; j++) {
511       row = idx_i[j];
512       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
513       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
514       w4[proc]++;
515     }
516     for (j=0; j<size; j++) {
517       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
518     }
519   }
520 
521   nrqs     = 0;              /* no of outgoing messages */
522   msz      = 0;              /* total mesg length (for all procs) */
523   w1[rank] = 0;              /* no mesg sent to itself */
524   w3[rank] = 0;
525   for (i=0; i<size; i++) {
526     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
527   }
528   /* pa - is list of processors to communicate with */
529   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
530   for (i=0,j=0; i<size; i++) {
531     if (w1[i]) {pa[j] = i; j++;}
532   }
533 
534   /* Each message would have a header = 1 + 2*(no of IS) + data */
535   for (i=0; i<nrqs; i++) {
536     j      = pa[i];
537     w1[j] += w2[j] + 2*w3[j];
538     msz   += w1[j];
539   }
540 
541   /* Determine the number of messages to expect, their lengths, and the from-ids */
542   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
543   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
544 
545   /* Now post the Irecvs corresponding to these messages */
546   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
547 
548   /* Allocate Memory for outgoing messages */
549   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
550   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
551   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
552 
553   {
554     PetscInt *iptr = tmp,ict  = 0;
555     for (i=0; i<nrqs; i++) {
556       j         = pa[i];
557       iptr     +=  ict;
558       outdat[j] = iptr;
559       ict       = w1[j];
560     }
561   }
562 
563   /* Form the outgoing messages */
564   /* plug in the headers */
565   for (i=0; i<nrqs; i++) {
566     j            = pa[i];
567     outdat[j][0] = 0;
568     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
569     ptr[j]       = outdat[j] + 2*w3[j] + 1;
570   }
571 
572   /* Memory for doing local proc's work */
573   {
574     PetscInt M_BPB_imax = 0;
575 #if defined(PETSC_USE_CTABLE)
576     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
577     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
578     for (i=0; i<imax; i++) {
579       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
580     }
581     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
582     for (i=0; i<imax; i++) {
583       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
584     }
585 #else
586     PetscInt Mimax = 0;
587     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
588     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
589     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
590     for (i=0; i<imax; i++) {
591       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
592       data[i]  = d_p + M*i;
593     }
594 #endif
595   }
596 
597   /* Parse the IS and update local tables and the outgoing buf with the data */
598   {
599     PetscInt n_i,isz_i,*outdat_j,ctr_j;
600     PetscBT  table_i;
601 
602     for (i=0; i<imax; i++) {
603       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
604       n_i     = n[i];
605       table_i = table[i];
606       idx_i   = idx[i];
607 #if defined(PETSC_USE_CTABLE)
608       table_data_i = table_data[i];
609 #else
610       data_i  = data[i];
611 #endif
612       isz_i   = isz[i];
613       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
614         row  = idx_i[j];
615         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
616         if (proc != rank) { /* copy to the outgoing buffer */
617           ctr[proc]++;
618           *ptr[proc] = row;
619           ptr[proc]++;
620         } else if (!PetscBTLookupSet(table_i,row)) {
621 #if defined(PETSC_USE_CTABLE)
622           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
623 #else
624           data_i[isz_i] = row; /* Update the local table */
625 #endif
626           isz_i++;
627         }
628       }
629       /* Update the headers for the current IS */
630       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
631         if ((ctr_j = ctr[j])) {
632           outdat_j        = outdat[j];
633           k               = ++outdat_j[0];
634           outdat_j[2*k]   = ctr_j;
635           outdat_j[2*k-1] = i;
636         }
637       }
638       isz[i] = isz_i;
639     }
640   }
641 
642   /*  Now  post the sends */
643   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
644   for (i=0; i<nrqs; ++i) {
645     j    = pa[i];
646     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
647   }
648 
649   /* No longer need the original indices */
650   for (i=0; i<imax; ++i) {
651     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
652   }
653   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
654 
655   for (i=0; i<imax; ++i) {
656     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
657   }
658 
659   /* Do Local work */
660 #if defined(PETSC_USE_CTABLE)
661   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
662 #else
663   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
664 #endif
665 
666   /* Receive messages */
667   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
668   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
669 
670   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
671   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
672 
673   /* Phase 1 sends are complete - deallocate buffers */
674   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
675   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
676 
677   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
678   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
679   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
680   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
681   ierr = PetscFree(rbuf);CHKERRQ(ierr);
682 
683 
684   /* Send the data back */
685   /* Do a global reduction to know the buffer space required for incoming messages */
686   {
687     PetscMPIInt *rw1;
688 
689     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
690 
691     for (i=0; i<nrqr; ++i) {
692       proc = recv_status[i].MPI_SOURCE;
693 
694       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
695       rw1[proc] = isz1[i];
696     }
697     ierr = PetscFree(onodes1);CHKERRQ(ierr);
698     ierr = PetscFree(olengths1);CHKERRQ(ierr);
699 
700     /* Determine the number of messages to expect, their lengths, and the from-ids */
701     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
702     ierr = PetscFree(rw1);CHKERRQ(ierr);
703   }
704   /* Now post the Irecvs corresponding to these messages */
705   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
706 
707   /* Now  post the sends */
708   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
709   for (i=0; i<nrqr; ++i) {
710     j    = recv_status[i].MPI_SOURCE;
711     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
712   }
713 
714   /* receive work done on other processors */
715   {
716     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
717     PetscMPIInt idex;
718     PetscBT     table_i;
719     MPI_Status  *status2;
720 
721     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
722     for (i=0; i<nrqs; ++i) {
723       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
724       /* Process the message */
725       rbuf2_i = rbuf2[idex];
726       ct1     = 2*rbuf2_i[0]+1;
727       jmax    = rbuf2[idex][0];
728       for (j=1; j<=jmax; j++) {
729         max     = rbuf2_i[2*j];
730         is_no   = rbuf2_i[2*j-1];
731         isz_i   = isz[is_no];
732         table_i = table[is_no];
733 #if defined(PETSC_USE_CTABLE)
734         table_data_i = table_data[is_no];
735 #else
736         data_i  = data[is_no];
737 #endif
738         for (k=0; k<max; k++,ct1++) {
739           row = rbuf2_i[ct1];
740           if (!PetscBTLookupSet(table_i,row)) {
741 #if defined(PETSC_USE_CTABLE)
742             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
743 #else
744             data_i[isz_i] = row;
745 #endif
746             isz_i++;
747           }
748         }
749         isz[is_no] = isz_i;
750       }
751     }
752 
753     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
754     ierr = PetscFree(status2);CHKERRQ(ierr);
755   }
756 
757 #if defined(PETSC_USE_CTABLE)
758   tcount_max = 0;
759   for (i=0; i<imax; ++i) {
760     table_data_i = table_data[i];
761     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
762     if (tcount_max < tcount) tcount_max = tcount;
763   }
764   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
765 #endif
766 
767   for (i=0; i<imax; ++i) {
768 #if defined(PETSC_USE_CTABLE)
769     PetscTablePosition tpos;
770     table_data_i = table_data[i];
771 
772     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
773     while (tpos) {
774       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
775       tdata[--j] = --k;
776     }
777     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
778 #else
779     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
780 #endif
781   }
782 
783   ierr = PetscFree(onodes2);CHKERRQ(ierr);
784   ierr = PetscFree(olengths2);CHKERRQ(ierr);
785 
786   ierr = PetscFree(pa);CHKERRQ(ierr);
787   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
788   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
789   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
790   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
791   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
792   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
793   ierr = PetscFree(s_status);CHKERRQ(ierr);
794   ierr = PetscFree(recv_status);CHKERRQ(ierr);
795   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
796   ierr = PetscFree(xdata);CHKERRQ(ierr);
797   ierr = PetscFree(isz1);CHKERRQ(ierr);
798 #if defined(PETSC_USE_CTABLE)
799   for (i=0; i<imax; i++) {
800     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
801   }
802   ierr = PetscFree(table_data);CHKERRQ(ierr);
803   ierr = PetscFree(tdata);CHKERRQ(ierr);
804   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
805 #else
806   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
807 #endif
808   PetscFunctionReturn(0);
809 }
810 
811 #undef __FUNCT__
812 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
813 /*
814    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap to do
815        the work on the local processor.
816 
817      Inputs:
818       C      - MAT_MPIAIJ;
819       imax - total no of index sets processed at a time;
820       table  - an array of char - size = m bits.
821 
822      Output:
823       isz    - array containing the count of the solution elements corresponding
824                to each index set;
825       data or table_data  - pointer to the solutions
826 */
827 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
828 {
829   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
830   Mat        A  = c->A,B = c->B;
831   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
832   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
833   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
834   PetscBT    table_i;
835 #if defined(PETSC_USE_CTABLE)
836   PetscTable         table_data_i;
837   PetscErrorCode     ierr;
838   PetscTablePosition tpos;
839   PetscInt           tcount,*tdata;
840 #else
841   PetscInt           *data_i;
842 #endif
843 
844   PetscFunctionBegin;
845   rstart = C->rmap->rstart;
846   cstart = C->cmap->rstart;
847   ai     = a->i;
848   aj     = a->j;
849   bi     = b->i;
850   bj     = b->j;
851   garray = c->garray;
852 
853   for (i=0; i<imax; i++) {
854 #if defined(PETSC_USE_CTABLE)
855     /* copy existing entries of table_data_i into tdata[] */
856     table_data_i = table_data[i];
857     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
858     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
859 
860     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
861     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
862     while (tpos) {
863       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
864       tdata[--j] = --row;
865       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
866     }
867 #else
868     data_i  = data[i];
869 #endif
870     table_i = table[i];
871     isz_i   = isz[i];
872     max     = isz[i];
873 
874     for (j=0; j<max; j++) {
875 #if defined(PETSC_USE_CTABLE)
876       row   = tdata[j] - rstart;
877 #else
878       row   = data_i[j] - rstart;
879 #endif
880       start = ai[row];
881       end   = ai[row+1];
882       for (k=start; k<end; k++) { /* Amat */
883         val = aj[k] + cstart;
884         if (!PetscBTLookupSet(table_i,val)) {
885 #if defined(PETSC_USE_CTABLE)
886           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
887 #else
888           data_i[isz_i] = val;
889 #endif
890           isz_i++;
891         }
892       }
893       start = bi[row];
894       end   = bi[row+1];
895       for (k=start; k<end; k++) { /* Bmat */
896         val = garray[bj[k]];
897         if (!PetscBTLookupSet(table_i,val)) {
898 #if defined(PETSC_USE_CTABLE)
899           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
900 #else
901           data_i[isz_i] = val;
902 #endif
903           isz_i++;
904         }
905       }
906     }
907     isz[i] = isz_i;
908 
909 #if defined(PETSC_USE_CTABLE)
910     ierr = PetscFree(tdata);CHKERRQ(ierr);
911 #endif
912   }
913   PetscFunctionReturn(0);
914 }
915 
916 #undef __FUNCT__
917 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
918 /*
919       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
920          and return the output
921 
922          Input:
923            C    - the matrix
924            nrqr - no of messages being processed.
925            rbuf - an array of pointers to the received requests
926 
927          Output:
928            xdata - array of messages to be sent back
929            isz1  - size of each message
930 
931   For better efficiency perhaps we should malloc separately each xdata[i],
932 then if a remalloc is required we need only copy the data for that one row
933 rather than all previous rows, as is the case now where a single large chunk of
934 memory is used.
935 
936 */
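/*
  The reply buffer xdata[i] mirrors the request layout: xdata[i][0] is the number
  of index sets, followed by <is_no,count> header pairs, followed by one block per
  index set holding the (deduplicated) requested rows together with the columns
  found in those rows. Illustrative example (hypothetical data): a reply for a
  single index set is[1] whose requested rows {3,7} picked up columns {4,8} is

    xdata[i] = { 1,   1, 4,   3, 4, 7, 8 }
*/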
937 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
938 {
939   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
940   Mat            A  = c->A,B = c->B;
941   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
942   PetscErrorCode ierr;
943   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
944   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
945   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
946   PetscInt       *rbuf_i,kmax,rbuf_0;
947   PetscBT        xtable;
948 
949   PetscFunctionBegin;
950   m      = C->rmap->N;
951   rstart = C->rmap->rstart;
952   cstart = C->cmap->rstart;
953   ai     = a->i;
954   aj     = a->j;
955   bi     = b->i;
956   bj     = b->j;
957   garray = c->garray;
958 
959 
960   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
961     rbuf_i =  rbuf[i];
962     rbuf_0 =  rbuf_i[0];
963     ct    += rbuf_0;
964     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
965   }
966 
967   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
968   else max1 = 1;
969   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
970   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
971   ++no_malloc;
972   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
973   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
974 
975   ct3 = 0;
976   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
977     rbuf_i =  rbuf[i];
978     rbuf_0 =  rbuf_i[0];
979     ct1    =  2*rbuf_0+1;
980     ct2    =  ct1;
981     ct3   += ct1;
982     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
983       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
984       oct2 = ct2;
985       kmax = rbuf_i[2*j];
986       for (k=0; k<kmax; k++,ct1++) {
987         row = rbuf_i[ct1];
988         if (!PetscBTLookupSet(xtable,row)) {
989           if (!(ct3 < mem_estimate)) {
990             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
991             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
992             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
993             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
994             xdata[0]     = tmp;
995             mem_estimate = new_estimate; ++no_malloc;
996             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
997           }
998           xdata[i][ct2++] = row;
999           ct3++;
1000         }
1001       }
1002       for (k=oct2,max2=ct2; k<max2; k++) {
1003         row   = xdata[i][k] - rstart;
1004         start = ai[row];
1005         end   = ai[row+1];
1006         for (l=start; l<end; l++) {
1007           val = aj[l] + cstart;
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022         start = bi[row];
1023         end   = bi[row+1];
1024         for (l=start; l<end; l++) {
1025           val = garray[bj[l]];
1026           if (!PetscBTLookupSet(xtable,val)) {
1027             if (!(ct3 < mem_estimate)) {
1028               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1029               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1030               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1031               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1032               xdata[0]     = tmp;
1033               mem_estimate = new_estimate; ++no_malloc;
1034               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1035             }
1036             xdata[i][ct2++] = val;
1037             ct3++;
1038           }
1039         }
1040       }
1041       /* Update the header*/
1042       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1043       xdata[i][2*j-1] = rbuf_i[2*j-1];
1044     }
1045     xdata[i][0] = rbuf_0;
1046     xdata[i+1]  = xdata[i] + ct2;
1047     isz1[i]     = ct2; /* size of each message */
1048   }
1049   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1050   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1051   PetscFunctionReturn(0);
1052 }
1053 /* -------------------------------------------------------------------------*/
1054 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1055 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1056 /*
1057     Every processor gets the entire matrix
1058 */
1059 #undef __FUNCT__
1060 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
1061 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1062 {
1063   Mat            B;
1064   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1065   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1066   PetscErrorCode ierr;
1067   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1068   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1069   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1070   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1071 
1072   PetscFunctionBegin;
1073   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1074   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1075 
1076   if (scall == MAT_INITIAL_MATRIX) {
1077     /* ----------------------------------------------------------------
1078          Tell every processor the number of nonzeros per row
1079     */
1080     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1081     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1082       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1083     }
1084     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1085     for (i=0; i<size; i++) {
1086       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1087       displs[i]     = A->rmap->range[i];
1088     }
1089 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1090     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1091 #else
1092     sendcount = A->rmap->rend - A->rmap->rstart;
1093     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1094 #endif
1095     /* ---------------------------------------------------------------
1096          Create the sequential matrix of the same type as the local block diagonal
1097     */
1098     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1099     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1100     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1101     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1102     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1103     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1104     **Bin = B;
1105     b     = (Mat_SeqAIJ*)B->data;
1106 
1107     /*--------------------------------------------------------------------
1108        Copy my part of matrix column indices over
1109     */
1110     sendcount  = ad->nz + bd->nz;
1111     jsendbuf   = b->j + b->i[rstarts[rank]];
1112     a_jsendbuf = ad->j;
1113     b_jsendbuf = bd->j;
1114     n          = A->rmap->rend - A->rmap->rstart;
1115     cnt        = 0;
1116     for (i=0; i<n; i++) {
1117 
1118       /* put in lower diagonal portion */
1119       m = bd->i[i+1] - bd->i[i];
1120       while (m > 0) {
1121         /* is it above diagonal (in bd (compressed) numbering) */
1122         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1123         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1124         m--;
1125       }
1126 
1127       /* put in diagonal portion */
1128       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1129         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1130       }
1131 
1132       /* put in upper diagonal portion */
1133       while (m-- > 0) {
1134         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1135       }
1136     }
1137     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1138 
1139     /*--------------------------------------------------------------------
1140        Gather all column indices to all processors
1141     */
1142     for (i=0; i<size; i++) {
1143       recvcounts[i] = 0;
1144       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1145         recvcounts[i] += lens[j];
1146       }
1147     }
1148     displs[0] = 0;
1149     for (i=1; i<size; i++) {
1150       displs[i] = displs[i-1] + recvcounts[i-1];
1151     }
1152 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1153     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1154 #else
1155     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1156 #endif
1157     /*--------------------------------------------------------------------
1158         Assemble the matrix into usable form (note numerical values not yet set)
1159     */
1160     /* set the b->ilen (length of each row) values */
1161     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1162     /* set the b->i indices */
1163     b->i[0] = 0;
1164     for (i=1; i<=A->rmap->N; i++) {
1165       b->i[i] = b->i[i-1] + lens[i-1];
1166     }
1167     ierr = PetscFree(lens);CHKERRQ(ierr);
1168     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1169     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1170 
1171   } else {
1172     B = **Bin;
1173     b = (Mat_SeqAIJ*)B->data;
1174   }
1175 
1176   /*--------------------------------------------------------------------
1177        Copy my part of matrix numerical values into the values location
1178   */
1179   if (flag == MAT_GET_VALUES) {
1180     sendcount = ad->nz + bd->nz;
1181     sendbuf   = b->a + b->i[rstarts[rank]];
1182     a_sendbuf = ad->a;
1183     b_sendbuf = bd->a;
1184     b_sendj   = bd->j;
1185     n         = A->rmap->rend - A->rmap->rstart;
1186     cnt       = 0;
1187     for (i=0; i<n; i++) {
1188 
1189       /* put in lower diagonal portion */
1190       m = bd->i[i+1] - bd->i[i];
1191       while (m > 0) {
1192         /* is it above diagonal (in bd (compressed) numbering) */
1193         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1194         sendbuf[cnt++] = *b_sendbuf++;
1195         m--;
1196         b_sendj++;
1197       }
1198 
1199       /* put in diagonal portion */
1200       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1201         sendbuf[cnt++] = *a_sendbuf++;
1202       }
1203 
1204       /* put in upper diagonal portion */
1205       while (m-- > 0) {
1206         sendbuf[cnt++] = *b_sendbuf++;
1207         b_sendj++;
1208       }
1209     }
1210     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1211 
1212     /* -----------------------------------------------------------------
1213        Gather all numerical values to all processors
1214     */
1215     if (!recvcounts) {
1216       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1217     }
1218     for (i=0; i<size; i++) {
1219       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1220     }
1221     displs[0] = 0;
1222     for (i=1; i<size; i++) {
1223       displs[i] = displs[i-1] + recvcounts[i-1];
1224     }
1225     recvbuf = b->a;
1226 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1227     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1228 #else
1229     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1230 #endif
1231   }  /* end of (flag == MAT_GET_VALUES) */
1232   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1233 
1234   if (A->symmetric) {
1235     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1236   } else if (A->hermitian) {
1237     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1238   } else if (A->structurally_symmetric) {
1239     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1240   }
1241   PetscFunctionReturn(0);
1242 }
1243 #undef __FUNCT__
1244 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
1245 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1246 {
1247   PetscErrorCode ierr;
1248   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1249   Mat_SubMat     *submatj = c->submatis1;
1250   PetscInt       i;
1251 
1252   PetscFunctionBegin;
1253   ierr = submatj->destroy(C);CHKERRQ(ierr);
1254 
1255   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1256 
1257   for (i=0; i<submatj->nrqr; ++i) {
1258     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1259   }
1260   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1261 
1262   ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1263   ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1264 
1265   for (i=0; i<submatj->nrqs; ++i) {
1266     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1267   }
1268   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1269   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1270 
1271   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1272 
1273 #if defined(PETSC_USE_CTABLE)
1274   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1275   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1276   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1277 #else
1278   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1279 #endif
1280 
1281   if (!submatj->allcolumns) {
1282 #if defined(PETSC_USE_CTABLE)
1283     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1284 #else
1285     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1286 #endif
1287   }
1288 
1289   ierr = PetscFree(submatj);CHKERRQ(ierr);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 #undef __FUNCT__
1294 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
1295 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1296 {
1297   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1298   Mat            submat,A = c->A,B = c->B;
1299   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1300   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1301   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1302   const PetscInt *icol,*irow;
1303   PetscInt       nrow,ncol,start;
1304   PetscErrorCode ierr;
1305   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1306   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1307   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1308   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1309   PetscInt       *lens,rmax,ncols,*cols,Crow;
1310 #if defined(PETSC_USE_CTABLE)
1311   PetscTable     cmap,rmap;
1312   PetscInt       *cmap_loc,*rmap_loc;
1313 #else
1314   PetscInt       *cmap,*rmap;
1315 #endif
1316   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1317   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1318   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1319   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1320   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1321   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1322   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1323   MPI_Comm       comm;
1324   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1325   PetscMPIInt    *onodes1,*olengths1,idex,end;
1326   Mat_SubMat     *smatis1;
1327   PetscBool      isrowsorted;
1328 
1329   PetscFunctionBegin;
1330   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1331 
1332   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1333   size = c->size;
1334   rank = c->rank;
1335 
1336   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1337   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1338 
1339   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1340   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1341   if (allcolumns) {
1342     icol = NULL;
1343     ncol = C->cmap->N;
1344   } else {
1345     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1346     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1347   }
1348 
1349   if (scall == MAT_INITIAL_MATRIX) {
1350     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1351 
1352     /* Get some new tags to keep the communication clean */
1353     tag1 = ((PetscObject)C)->tag;
1354     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1355     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1356 
1357     /* evaluate communication - to whom messages go, their lengths, and the buffer space
1358      required. Based on this, buffers are allocated and data is copied into them */
1359     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1360     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1361 
1362     /* w1[proc] = num of requested rows owned by proc */
1363     proc = 0;
1364     for (j=0; j<nrow; j++) {
1365       row  = irow[j]; /* sorted! */
1366       while (row >= C->rmap->range[proc+1]) proc++;
1367       w1[proc]++;
1368       row2proc[j] = proc; /* map row index to proc */
1369     }
1370 
1371     nrqs     = 0;              /* num of outgoing messages */
1372     msz      = 0;              /* total mesg length (for all procs) */
1373     w1[rank] = 0;              /* num mesg sent to self */
1374     for (proc=0; proc<size; proc++) {
1375       if (w1[proc])  { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */
1376     }
1377 
1378     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1379     for (proc=0,j=0; proc<size; proc++) {
1380       if (w1[proc]) { pa[j++] = proc;}
1381     }
1382 
1383     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1384     for (i=0; i<nrqs; i++) {
1385       proc      = pa[i];
1386       w1[proc] += 3;
1387       msz      += w1[proc];
1388     }
1389     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1390 
1391     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1392     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1393     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1394 
1395     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1396        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1397     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1398 
1399     /* Now post the Irecvs corresponding to these messages */
1400     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1401 
1402     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1403     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1404 
1405     /* Allocate Memory for outgoing messages */
1406     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1407     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1408     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1409 
1410     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1411     iptr = tmp;
1412     for (i=0; i<nrqs; i++) {
1413       proc     = pa[i];
1414       sbuf1[proc] = iptr;
1415       iptr += w1[proc];
1416     }
1417 
1418     /* Form the outgoing messages */
1419     /* Initialize the header space */
1420     for (i=0; i<nrqs; i++) {
1421       proc           = pa[i];
1422       ierr           = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1423       ptr[proc]      = sbuf1[proc] + 3;
1424     }
1425 
1426     /* Parse the isrow and copy data into outbuf */
1427     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1428     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1429       proc = row2proc[j];
1430       if (proc != rank) { /* copy to the outgoing buf*/
1431         *ptr[proc] = irow[j];
1432         ctr[proc]++; ptr[proc]++;
1433       }
1434     }
1435 
1436     /* Update the headers for the current IS */
1437     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1438       if ((ctr_j = ctr[j])) {
1439         sbuf1_j        = sbuf1[j];
1440         k              = ++sbuf1_j[0];
1441         sbuf1_j[2*k]   = ctr_j;
1442         sbuf1_j[2*k-1] = 0;
1443       }
1444     }
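    /* At this point each outgoing request sbuf1[proc] has the layout (single IS, 3-entry header):
         sbuf1[proc][0] = 1            number of index sets in this message
         sbuf1[proc][1] = 0            index of the IS (always 0 here)
         sbuf1[proc][2] = ctr[proc]    number of requested rows
         sbuf1[proc][3..]              the requested global rows, in ascending order (irow is sorted) */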
1445 
1446     /* Now post the sends */
1447     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1448     for (i=0; i<nrqs; ++i) {
1449       proc = pa[i];
1450       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1451     }
1452 
1453     /* Post Receives to capture the buffer size */
1454     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1455     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1456 
1457     rbuf2[0] = tmp + msz;
1458     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1459 
1460     for (i=0; i<nrqs; ++i) {
1461       proc = pa[i];
1462       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1463     }
1464 
1465     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1466 
1467     /* Send to other procs the buf size they should allocate */
1468     /* Receive messages*/
1469     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1470     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1471 
1472     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1473     for (i=0; i<nrqr; ++i) {
1474       req_size[i] = 0;
1475       rbuf1_i        = rbuf1[i];
1476       start          = 2*rbuf1_i[0] + 1;
1477       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1478       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1479       sbuf2_i        = sbuf2[i];
1480       for (j=start; j<end; j++) {
1481         k            = rbuf1_i[j] - rstart;
1482         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1483         sbuf2_i[j]   = ncols;
1484         req_size[i] += ncols;
1485       }
1486       req_source1[i] = r_status1[i].MPI_SOURCE;
1487 
1488       /* form the header */
1489       sbuf2_i[0] = req_size[i];
1490       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1491 
1492       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1493     }
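    /* Each reply sbuf2[i] mirrors the request rbuf1[i]:
         sbuf2[i][0]    = req_size[i]   total number of nonzeros that will be sent back
         sbuf2[i][1..2] = copy of the request header (IS index, number of rows)
         sbuf2[i][3..]  = number of nonzeros of each requested row
       so the requester can allocate rbuf3/rbuf4 with exactly rbuf2[i][0] entries. */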
1494 
1495     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1496     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1497 
1498     /* rbuf2 is received; post receives for the column indices a->j */
1499     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1500 
1501     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1502     for (i=0; i<nrqs; ++i) {
1503       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1504       req_source2[i] = r_status2[i].MPI_SOURCE;
1505       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1506     }
1507 
1508     /* Wait on sends1 and sends2 */
1509     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1510     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1511     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1512     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1513 
1514     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1515     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1516 
1517     /* Now allocate sending buffers for a->j, and send them off */
1518     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1519     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1520     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1521     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1522 
1523     for (i=0; i<nrqr; i++) { /* for each requested message */
1524       rbuf1_i   = rbuf1[i];
1525       sbuf_aj_i = sbuf_aj[i];
1526       ct1       = 2*rbuf1_i[0] + 1;
1527       ct2       = 0;
1528       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1529 
1530       kmax = rbuf1[i][2];
1531       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1532         row    = rbuf1_i[ct1] - rstart;
1533         nzA    = ai[row+1] - ai[row];     nzB = bi[row+1] - bi[row];
1534         ncols  = nzA + nzB;
1535         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1536 
1537         /* load the column indices for this row into cols*/
1538         cols = sbuf_aj_i + ct2;
1539 
1540         lwrite = 0;
1541         for (l=0; l<nzB; l++) {
1542           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1543         }
1544         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1545         for (l=0; l<nzB; l++) {
1546           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1547         }
1548 
1549         ct2 += ncols;
1550       }
1551       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1552     }
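    /* Packing order used above for each row: off-diagonal columns below cstart, then the diagonal-block
       columns shifted to global numbering, then off-diagonal columns at or beyond cend.  Because aj and
       the garray-mapped bmap[] entries of a row are each sorted, the packed column indices of a row are
       in ascending global order. */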
1553 
1554     /* create column map (cmap): global col of C -> local col of submat */
1555 #if defined(PETSC_USE_CTABLE)
1556     if (!allcolumns) {
1557       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1558       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1559       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1560         if (icol[j] >= cstart && icol[j] <cend) {
1561           cmap_loc[icol[j] - cstart] = j+1;
1562         } else { /* use PetscTable for non-local col indices */
1563           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1564         }
1565       }
1566     } else {
1567       cmap     = NULL;
1568       cmap_loc = NULL;
1569     }
1570     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1571 #else
1572     if (!allcolumns) {
1573       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1574       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1575     } else {
1576       cmap = NULL;
1577     }
1578 #endif
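    /* cmap stores submatrix column indices shifted by one, so a lookup of 0 means the column is not in
       iscol[0].  With PETSC_USE_CTABLE, locally owned columns ([cstart,cend)) are kept in the dense array
       cmap_loc and only off-process columns go into the PetscTable, keeping the table small. */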
1579 
1580     /* Create lens for MatSeqAIJSetPreallocation() */
1581     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1582 
1583     /* Compute lens from local part of C */
1584     for (j=0; j<nrow; j++) {
1585       row  = irow[j];
1586       proc = row2proc[j];
1587       if (proc == rank) {
1588         /* diagonal part A = c->A */
1589         ncols = ai[row-rstart+1] - ai[row-rstart];
1590         cols  = aj + ai[row-rstart];
1591         if (!allcolumns) {
1592           for (k=0; k<ncols; k++) {
1593 #if defined(PETSC_USE_CTABLE)
1594             tcol = cmap_loc[cols[k]];
1595 #else
1596             tcol = cmap[cols[k]+cstart];
1597 #endif
1598             if (tcol) lens[j]++;
1599           }
1600         } else { /* allcolumns */
1601           lens[j] = ncols;
1602         }
1603 
1604         /* off-diagonal part B = c->B */
1605         ncols = bi[row-rstart+1] - bi[row-rstart];
1606         cols  = bj + bi[row-rstart];
1607         if (!allcolumns) {
1608           for (k=0; k<ncols; k++) {
1609 #if defined(PETSC_USE_CTABLE)
1610             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1611 #else
1612             tcol = cmap[bmap[cols[k]]];
1613 #endif
1614             if (tcol) lens[j]++;
1615           }
1616         } else { /* allcolumns */
1617           lens[j] += ncols;
1618         }
1619       }
1620     }
1621 
1622     /* Create row map (rmap): global row of C -> local row of submat */
1623 #if defined(PETSC_USE_CTABLE)
1624     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1625     for (j=0; j<nrow; j++) {
1626       row  = irow[j];
1627       proc = row2proc[j];
1628       if (proc == rank) { /* a local row */
1629         rmap_loc[row - rstart] = j;
1630       } else {
1631         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1632       }
1633     }
1634 #else
1635     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1636     for (j=0; j<nrow; j++) {
1637       rmap[irow[j]] = j;
1638     }
1639 #endif
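    /* rmap maps a global row of C to its local row in submat.  The PetscTable entries are stored shifted
       by one (hence the row-- after PetscTableFind below); rmap_loc and the dense rmap array store the
       0-based local row directly. */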
1640 
1641     /* Update lens from offproc data */
1642     /* recv a->j is done */
1643     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1644     for (i=0; i<nrqs; i++) {
1645       proc    = pa[i];
1646       sbuf1_i = sbuf1[proc];
1647       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1648       ct1     = 2 + 1;
1649       ct2     = 0;
1650       rbuf2_i = rbuf2[i]; /* received length of C->j */
1651       rbuf3_i = rbuf3[i]; /* received C->j */
1652 
1653       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1654       max1   = sbuf1_i[2];
1655       for (k=0; k<max1; k++,ct1++) {
1656 #if defined(PETSC_USE_CTABLE)
1657         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1658         row--;
1659         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1660 #else
1661         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1662 #endif
1663         /* Now, store row index of submat in sbuf1_i[ct1] */
1664         sbuf1_i[ct1] = row;
1665 
1666         nnz = rbuf2_i[ct1];
1667         if (!allcolumns) {
1668           for (l=0; l<nnz; l++,ct2++) {
1669 #if defined(PETSC_USE_CTABLE)
1670             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1671               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1672             } else {
1673               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1674             }
1675 #else
1676             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1677 #endif
1678             if (tcol) {
1679               lens[row]++;
1680 
1681               /* We receive an entire row of C (all of its column indices), but only a subset of it needs to be inserted into submat.
1682                For reuse, we replace the received C->j with the index that should be inserted into submat */
1683               /* rbuf3_i[ct3++] = ct2; ??? */
1684             }
1685           }
1686         } else { /* allcolumns */
1687           lens[row] += nnz;
1688         }
1689       }
1690     }
1691     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1692     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1693 
1694     /* Create the submatrices */
1695     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1696     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1697 
1698     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1699     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1700     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1701     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1702     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1703 
1704     /* create struct Mat_SubMat and attach it to submat */
1705     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1706     subc = (Mat_SeqAIJ*)submat->data;
1707     subc->submatis1 = smatis1;
1708 
1709     smatis1->nrqs        = nrqs;
1710     smatis1->nrqr        = nrqr;
1711     smatis1->rbuf1       = rbuf1;
1712     smatis1->rbuf2       = rbuf2;
1713     smatis1->rbuf3       = rbuf3;
1714     smatis1->sbuf2       = sbuf2;
1715     smatis1->req_source2 = req_source2;
1716 
1717     smatis1->sbuf1       = sbuf1;
1718     smatis1->ptr         = ptr;
1719     smatis1->tmp         = tmp;
1720     smatis1->ctr         = ctr;
1721 
1722     smatis1->pa           = pa;
1723     smatis1->req_size     = req_size;
1724     smatis1->req_source1  = req_source1;
1725 
1726     smatis1->allcolumns  = allcolumns;
1727     smatis1->row2proc    = row2proc;
1728     smatis1->rmap        = rmap;
1729     smatis1->cmap        = cmap;
1730 #if defined(PETSC_USE_CTABLE)
1731     smatis1->rmap_loc    = rmap_loc;
1732     smatis1->cmap_loc    = cmap_loc;
1733 #endif
1734 
1735     smatis1->destroy     = submat->ops->destroy;
1736     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1737     submat->factortype   = C->factortype;
1738 
1739     /* compute rmax */
1740     rmax = 0;
1741     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
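    /* rmax is the length of the longest submatrix row; it sizes the scratch arrays subcols/subvals that
       hold one assembled row at a time during MatSetValues_SeqAIJ() below. */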
1742 
1743   } else { /* scall == MAT_REUSE_MATRIX */
1744     submat = submats[0];
1745     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1746 
1747     subc    = (Mat_SeqAIJ*)submat->data;
1748     rmax    = subc->rmax;
1749     smatis1 = subc->submatis1;
1750     nrqs        = smatis1->nrqs;
1751     nrqr        = smatis1->nrqr;
1752     rbuf1       = smatis1->rbuf1;
1753     rbuf2       = smatis1->rbuf2;
1754     rbuf3       = smatis1->rbuf3;
1755     req_source2 = smatis1->req_source2;
1756 
1757     sbuf1     = smatis1->sbuf1;
1758     sbuf2     = smatis1->sbuf2;
1759     ptr       = smatis1->ptr;
1760     tmp       = smatis1->tmp;
1761     ctr       = smatis1->ctr;
1762 
1763     pa         = smatis1->pa;
1764     req_size   = smatis1->req_size;
1765     req_source1 = smatis1->req_source1;
1766 
1767     allcolumns = smatis1->allcolumns;
1768     row2proc   = smatis1->row2proc;
1769     rmap       = smatis1->rmap;
1770     cmap       = smatis1->cmap;
1771 #if defined(PETSC_USE_CTABLE)
1772     rmap_loc   = smatis1->rmap_loc;
1773     cmap_loc   = smatis1->cmap_loc;
1774 #endif
1775   }
1776 
1777   /* Post recv matrix values */
1778   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1779   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1780   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1781   for (i=0; i<nrqs; ++i) {
1782     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1783     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1784   }
1785 
1786   /* Allocate sending buffers for a->a, and send them off */
1787   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1788   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1789   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1790   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
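  /* The values are packed in exactly the same order as the column indices were packed into sbuf_aj
     (off-diagonal < cstart, diagonal block, off-diagonal >= cend), so rbuf3[i][k] and rbuf4[i][k] refer
     to the same matrix entry on the receiving side. */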
1791 
1792   for (i=0; i<nrqr; i++) {
1793     rbuf1_i   = rbuf1[i];
1794     sbuf_aa_i = sbuf_aa[i];
1795     ct1       = 2*rbuf1_i[0]+1;
1796     ct2       = 0;
1797     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1798 
1799     kmax = rbuf1_i[2];
1800     for (k=0; k<kmax; k++,ct1++) {
1801       row = rbuf1_i[ct1] - rstart;
1802       nzA = ai[row+1] - ai[row];
1803       nzB = bi[row+1] - bi[row];
1804       ncols  = nzA + nzB;
1805       cworkB = bj + bi[row];
1806       vworkA = a_a + ai[row];
1807       vworkB = b_a + bi[row];
1808 
1809       /* load the column values for this row into vals*/
1810       vals = sbuf_aa_i + ct2;
1811 
1812       lwrite = 0;
1813       for (l=0; l<nzB; l++) {
1814         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1815       }
1816       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1817       for (l=0; l<nzB; l++) {
1818         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1819       }
1820 
1821       ct2 += ncols;
1822     }
1823     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1824   }
1825 
1826   /* Assemble submat */
1827   ierr = PetscCalloc2(rmax,&subcols,rmax,&subvals);CHKERRQ(ierr);
1828 
1829   /* First assemble the local rows */
1830   for (j=0; j<nrow; j++) {
1831     row  = irow[j];
1832     proc = row2proc[j];
1833     if (proc == rank) {
1834       Crow = row - rstart;  /* local row index of C */
1835 #if defined(PETSC_USE_CTABLE)
1836       row = rmap_loc[Crow]; /* row index of submat */
1837 #else
1838       row = rmap[row];
1839 #endif
1840 
1841       if (allcolumns) {
1842         /* diagonal part A = c->A */
1843         ncols = ai[Crow+1] - ai[Crow];
1844         cols  = aj + ai[Crow];
1845         vals  = a->a + ai[Crow];
1846         i     = 0;
1847         for (k=0; k<ncols; k++) {
1848           subcols[i]   = cols[k] + cstart;
1849           subvals[i++] = vals[k];
1850         }
1851 
1852         /* off-diagonal part B = c->B */
1853         ncols = bi[Crow+1] - bi[Crow];
1854         cols  = bj + bi[Crow];
1855         vals  = b->a + bi[Crow];
1856         for (k=0; k<ncols; k++) {
1857           subcols[i]   = bmap[cols[k]];
1858           subvals[i++] = vals[k];
1859         }
1860 
1861         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1862 
1863       } else { /* !allcolumns */
1864 #if defined(PETSC_USE_CTABLE)
1865         /* diagonal part A = c->A */
1866         ncols = ai[Crow+1] - ai[Crow];
1867         cols  = aj + ai[Crow];
1868         vals  = a->a + ai[Crow];
1869         i     = 0;
1870         for (k=0; k<ncols; k++) {
1871           tcol = cmap_loc[cols[k]];
1872           if (tcol) {
1873             subcols[i]   = --tcol;
1874             subvals[i++] = vals[k];
1875           }
1876         }
1877 
1878         /* off-diagonal part B = c->B */
1879         ncols = bi[Crow+1] - bi[Crow];
1880         cols  = bj + bi[Crow];
1881         vals  = b->a + bi[Crow];
1882         for (k=0; k<ncols; k++) {
1883           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1884           if (tcol) {
1885             subcols[i]   = --tcol;
1886             subvals[i++] = vals[k];
1887           }
1888         }
1889 #else
1890         /* diagonal part A = c->A */
1891         ncols = ai[Crow+1] - ai[Crow];
1892         cols  = aj + ai[Crow];
1893         vals  = a->a + ai[Crow];
1894         i     = 0;
1895         for (k=0; k<ncols; k++) {
1896           tcol = cmap[cols[k]+cstart];
1897           if (tcol) {
1898             subcols[i]   = --tcol;
1899             subvals[i++] = vals[k];
1900           }
1901         }
1902 
1903         /* off-diagonal part B = c->B */
1904         ncols = bi[Crow+1] - bi[Crow];
1905         cols  = bj + bi[Crow];
1906         vals  = b->a + bi[Crow];
1907         for (k=0; k<ncols; k++) {
1908           tcol = cmap[bmap[cols[k]]];
1909           if (tcol) {
1910             subcols[i]   = --tcol;
1911             subvals[i++] = vals[k];
1912           }
1913         }
1914 #endif
1915         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1916       }
1917     }
1918   }
1919 
1920   /* Now assemble the off-proc rows */
1921   for (i=0; i<nrqs; i++) { /* for each requested message */
1922     /* recv values from other processes */
1923     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1924     proc    = pa[idex];
1925     sbuf1_i = sbuf1[proc];
1926     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1927     ct1     = 2 + 1;
1928     ct2     = 0; /* count of received C->j */
1929     ct3     = 0; /* count of received C->j that will be inserted into submat */
1930     rbuf2_i = rbuf2[idex]; /* received lengths of the requested rows of C->j */
1931     rbuf3_i = rbuf3[idex]; /* received C->j (column indices) */
1932     rbuf4_i = rbuf4[idex]; /* received C->a (values) */
1933 
1934     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1935     max1 = sbuf1_i[2];             /* num of rows */
1936     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1937       row = sbuf1_i[ct1]; /* row index of submat */
1938       if (!allcolumns) {
1939         idex = 0;
1940         if (scall == MAT_INITIAL_MATRIX) {
1941           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1942           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1943 #if defined(PETSC_USE_CTABLE)
1944             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1945               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1946             } else {
1947               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1948             }
1949 #else
1950             tcol = cmap[rbuf3_i[ct2]];
1951 #endif
1952             if (tcol) {
1953               subcols[idex]   = --tcol;
1954               subvals[idex++] = rbuf4_i[ct2];
1955 
1956               /* We receive an entire row of C (all of its column indices), but only a subset of it needs to be inserted into submat.
1957                For reuse, we replace the received C->j with the index that should be inserted into submat */
1958               rbuf3_i[ct3++] = ct2;
1959             }
1960           }
1961           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1962 
1963         } else { /* scall == MAT_REUSE_MATRIX */
1964           submat = submats[0];
1965           subc   = (Mat_SeqAIJ*)submat->data;
1966 
1967           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1968           for (l=0; l<nnz; l++) {
1969             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1970             subvals[idex++] = rbuf4_i[ct2];
1971           }
1972 
1973           bj = subc->j + subc->i[row];
1974           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1975         }
1976       } else { /* allcolumns */
1977         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1978         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1979         ct2 += nnz;
1980       }
1981     }
1982   }
1983 
1984   /* sends of a->a are done */
1985   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1986   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1987 
1988   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1989   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1990   submats[0] = submat;
1991 
1992   /* Restore the indices */
1993   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1994   if (!allcolumns) {
1995     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1996   }
1997 
1998   /* Destroy allocated memory */
1999   for (i=0; i<nrqs; ++i) {
2000     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2001   }
2002   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2003   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2004   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2005   ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr);
2006 
2007   if (scall == MAT_INITIAL_MATRIX) {
2008     ierr = PetscFree(lens);CHKERRQ(ierr);
2009     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2010     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2011   }
2012   PetscFunctionReturn(0);
2013 }
2014 
2015 #undef __FUNCT__
2016 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
2017 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2018 {
2019   PetscErrorCode ierr;
2020   PetscInt       ncol;
2021   PetscBool      colflag,allcolumns=PETSC_FALSE;
2022 
2023   PetscFunctionBegin;
2024   /* Allocate memory to hold all the submatrices */
2025   if (scall != MAT_REUSE_MATRIX) {
2026     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2027   }
2028 
2029   /* Check for special case: each processor gets entire matrix columns */
2030   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2031   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2032   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2033 
2034   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2035   PetscFunctionReturn(0);
2036 }
2037 
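/*
   MatGetSubMatrices_MPIAIJ() is the MPIAIJ implementation used by the public MatGetSubMatrices()
   interface.  An illustrative call sequence from user code (nlocal and first are placeholders for the
   caller's own row/column selection) looks roughly like

      IS  isrow,iscol;
      Mat *submat;
      ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,first,1,&isrow);CHKERRQ(ierr);
      ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,first,1,&iscol);CHKERRQ(ierr);
      ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
      ...  use the sequential matrix submat[0]  ...
      ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
      ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
      ierr = ISDestroy(&isrow);CHKERRQ(ierr);
      ierr = ISDestroy(&iscol);CHKERRQ(ierr);
*/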
2038 #undef __FUNCT__
2039 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
2040 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2041 {
2042   PetscErrorCode ierr;
2043   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2044   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2045 
2046   PetscFunctionBegin;
2047   /* Check for special case: each processor gets entire matrix */
2048   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2049     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2050     PetscFunctionReturn(0);
2051   }
2052 
2053   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2054     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2055     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2056     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2057     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2058     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2059       wantallmatrix = PETSC_TRUE;
2060 
2061       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2062     }
2063   }
2064   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2065   if (twantallmatrix) {
2066     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2067     PetscFunctionReturn(0);
2068   }
2069 
2070   /* Allocate memory to hold all the submatrices */
2071   if (scall != MAT_REUSE_MATRIX) {
2072     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2073   }
2074 
2075   /* Check for special case: each processor gets entire matrix columns */
2076   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2077   for (i=0; i<ismax; i++) {
2078     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2079     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2080     if (colflag && ncol == C->cmap->N) {
2081       allcolumns[i] = PETSC_TRUE;
2082     } else {
2083       allcolumns[i] = PETSC_FALSE;
2084     }
2085   }
2086 
2087   /* Determine the number of stages through which submatrices are done */
2088   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2089 
2090   /*
2091      Each stage will extract nmax submatrices.
2092      nmax is determined by the matrix column dimension.
2093      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2094   */
2095   if (!nmax) nmax = 1;
2096   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2097 
2098   /* Make sure every processor loops through the nstages */
2099   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2100 
2101   for (i=0,pos=0; i<nstages; i++) {
2102     if (pos+nmax <= ismax) max_no = nmax;
2103     else if (pos == ismax) max_no = 0;
2104     else                   max_no = ismax-pos;
2105     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2106     pos += max_no;
2107   }
2108 
2109   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112 
2113 /* -------------------------------------------------------------------------*/
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
2116 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2117 {
2118   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2119   Mat            A  = c->A;
2120   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2121   const PetscInt **icol,**irow;
2122   PetscInt       *nrow,*ncol,start;
2123   PetscErrorCode ierr;
2124   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2125   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2126   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2127   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2128   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2129 #if defined(PETSC_USE_CTABLE)
2130   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2131 #else
2132   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2133 #endif
2134   const PetscInt *irow_i;
2135   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2136   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2137   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2138   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2139   MPI_Status     *r_status3,*r_status4,*s_status4;
2140   MPI_Comm       comm;
2141   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2142   PetscMPIInt    *onodes1,*olengths1;
2143   PetscMPIInt    idex,idex2,end;
2144 
2145   PetscFunctionBegin;
2146   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2147   tag0 = ((PetscObject)C)->tag;
2148   size = c->size;
2149   rank = c->rank;
2150 
2151   /* Get some new tags to keep the communication clean */
2152   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2153   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2154   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2155 
2156   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2157 
2158   for (i=0; i<ismax; i++) {
2159     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2160     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2161     if (allcolumns[i]) {
2162       icol[i] = NULL;
2163       ncol[i] = C->cmap->N;
2164     } else {
2165       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2166       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2167     }
2168   }
2169 
2170   /* evaluate communication - to whom messages go, their lengths, and the buffer space
2171      required. Based on this, buffers are allocated and data is copied into them */
2172   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2173   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2174   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2175   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2176   for (i=0; i<ismax; i++) {
2177     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2178     jmax   = nrow[i];
2179     irow_i = irow[i];
2180     for (j=0; j<jmax; j++) {
2181       l   = 0;
2182       row = irow_i[j];
2183       while (row >= C->rmap->range[l+1]) l++;
2184       proc = l;
2185       w4[proc]++;
2186     }
2187     for (j=0; j<size; j++) {
2188       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2189     }
2190   }
2191 
2192   nrqs     = 0;              /* no of outgoing messages */
2193   msz      = 0;              /* total mesg length (for all procs) */
2194   w1[rank] = 0;              /* no mesg sent to self */
2195   w3[rank] = 0;
2196   for (i=0; i<size; i++) {
2197     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2198   }
2199   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2200   for (i=0,j=0; i<size; i++) {
2201     if (w1[i]) { pa[j] = i; j++; }
2202   }
2203 
2204   /* Each message would have a header = 1 + 2*(no of IS) + data */
2205   for (i=0; i<nrqs; i++) {
2206     j      = pa[i];
2207     w1[j] += w2[j] + 2* w3[j];
2208     msz   += w1[j];
2209   }
2210   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2211 
2212   /* Determine the number of messages to expect, their lengths, and their from-ids */
2213   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2214   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2215 
2216   /* Now post the Irecvs corresponding to these messages */
2217   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2218 
2219   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2220   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2221 
2222   /* Allocate Memory for outgoing messages */
2223   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2224   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2225   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2226 
2227   {
2228     PetscInt *iptr = tmp;
2229     k    = 0;
2230     for (i=0; i<nrqs; i++) {
2231       j        = pa[i];
2232       iptr    += k;
2233       sbuf1[j] = iptr;
2234       k        = w1[j];
2235     }
2236   }
2237 
2238   /* Form the outgoing messages */
2239   /* Initialize the header space */
2240   for (i=0; i<nrqs; i++) {
2241     j           = pa[i];
2242     sbuf1[j][0] = 0;
2243     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2244     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2245   }
2246 
2247   /* Parse the isrow and copy data into outbuf */
2248   for (i=0; i<ismax; i++) {
2249     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2250     irow_i = irow[i];
2251     jmax   = nrow[i];
2252     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2253       l   = 0;
2254       row = irow_i[j];
2255       while (row >= C->rmap->range[l+1]) l++;
2256       proc = l;
2257       if (proc != rank) { /* copy to the outgoing buf*/
2258         ctr[proc]++;
2259         *ptr[proc] = row;
2260         ptr[proc]++;
2261       }
2262     }
2263     /* Update the headers for the current IS */
2264     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2265       if ((ctr_j = ctr[j])) {
2266         sbuf1_j        = sbuf1[j];
2267         k              = ++sbuf1_j[0];
2268         sbuf1_j[2*k]   = ctr_j;
2269         sbuf1_j[2*k-1] = i;
2270       }
2271     }
2272   }
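  /* Each outgoing request sbuf1[j] now has the layout
       [ nis, is_1, cnt_1, ..., is_nis, cnt_nis, <cnt_1 rows of is_1>, <cnt_2 rows of is_2>, ... ]
     where nis = w3[j] is the number of index sets requesting rows from process j and the row indices of
     each index set follow contiguously after the header of length 1 + 2*w3[j]. */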
2273 
2274   /*  Now  post the sends */
2275   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2276   for (i=0; i<nrqs; ++i) {
2277     j    = pa[i];
2278     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2279   }
2280 
2281   /* Post Receives to capture the buffer size */
2282   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2283   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2284   rbuf2[0] = tmp + msz;
2285   for (i=1; i<nrqs; ++i) {
2286     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2287   }
2288   for (i=0; i<nrqs; ++i) {
2289     j    = pa[i];
2290     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2291   }
2292 
2293   /* Send to other procs the buf size they should allocate */
2294 
2295 
2296   /* Receive messages*/
2297   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2298   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2299   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2300   {
2301     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2302     PetscInt   *sbuf2_i;
2303 
2304     for (i=0; i<nrqr; ++i) {
2305       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2306 
2307       req_size[idex] = 0;
2308       rbuf1_i        = rbuf1[idex];
2309       start          = 2*rbuf1_i[0] + 1;
2310       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2311       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2312       sbuf2_i        = sbuf2[idex];
2313       for (j=start; j<end; j++) {
2314         id              = rbuf1_i[j] - rstart;
2315         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2316         sbuf2_i[j]      = ncols;
2317         req_size[idex] += ncols;
2318       }
2319       req_source[idex] = r_status1[i].MPI_SOURCE;
2320       /* form the header */
2321       sbuf2_i[0] = req_size[idex];
2322       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2323 
2324       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2325     }
2326   }
2327   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2328   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2329 
2330   /* Receive the buffer sizes, then post the receives */
2331   /* for the column indices (a->j) and the values (a->a) */
2332 
2333   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2334   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2335   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2336   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2337   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2338 
2339   for (i=0; i<nrqs; ++i) {
2340     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2341     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2342     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2343     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2344     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2345   }
2346   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2347   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2348 
2349   /* Wait on sends1 and sends2 */
2350   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2351   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2352 
2353   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2354   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2355   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2356   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2357   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2358   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2359 
2360   /* Now allocate buffers for a->j, and send them off */
2361   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2362   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2363   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2364   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2365 
2366   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2367   {
2368     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2369     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2370     PetscInt cend = C->cmap->rend;
2371     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2372 
2373     for (i=0; i<nrqr; i++) {
2374       rbuf1_i   = rbuf1[i];
2375       sbuf_aj_i = sbuf_aj[i];
2376       ct1       = 2*rbuf1_i[0] + 1;
2377       ct2       = 0;
2378       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2379         kmax = rbuf1[i][2*j];
2380         for (k=0; k<kmax; k++,ct1++) {
2381           row    = rbuf1_i[ct1] - rstart;
2382           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2383           ncols  = nzA + nzB;
2384           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2385 
2386           /* load the column indices for this row into cols*/
2387           cols = sbuf_aj_i + ct2;
2388 
2389           lwrite = 0;
2390           for (l=0; l<nzB; l++) {
2391             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2392           }
2393           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2394           for (l=0; l<nzB; l++) {
2395             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2396           }
2397 
2398           ct2 += ncols;
2399         }
2400       }
2401       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2402     }
2403   }
2404   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2405   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2406 
2407   /* Allocate buffers for a->a, and send them off */
2408   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2409   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2410   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2411   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2412 
2413   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2414   {
2415     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2416     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2417     PetscInt    cend   = C->cmap->rend;
2418     PetscInt    *b_j   = b->j;
2419     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2420 
2421     for (i=0; i<nrqr; i++) {
2422       rbuf1_i   = rbuf1[i];
2423       sbuf_aa_i = sbuf_aa[i];
2424       ct1       = 2*rbuf1_i[0]+1;
2425       ct2       = 0;
2426       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2427         kmax = rbuf1_i[2*j];
2428         for (k=0; k<kmax; k++,ct1++) {
2429           row    = rbuf1_i[ct1] - rstart;
2430           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2431           ncols  = nzA + nzB;
2432           cworkB = b_j + b_i[row];
2433           vworkA = a_a + a_i[row];
2434           vworkB = b_a + b_i[row];
2435 
2436           /* load the column values for this row into vals*/
2437           vals = sbuf_aa_i+ct2;
2438 
2439           lwrite = 0;
2440           for (l=0; l<nzB; l++) {
2441             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2442           }
2443           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2444           for (l=0; l<nzB; l++) {
2445             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2446           }
2447 
2448           ct2 += ncols;
2449         }
2450       }
2451       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2452     }
2453   }
2454   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2455   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2456   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2457   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2458 
2459   /* Form the matrix */
2460   /* create col map: global col of C -> local col of submatrices */
2461   {
2462     const PetscInt *icol_i;
2463 #if defined(PETSC_USE_CTABLE)
2464     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2465     for (i=0; i<ismax; i++) {
2466       if (!allcolumns[i]) {
2467         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2468 
2469         jmax   = ncol[i];
2470         icol_i = icol[i];
2471         cmap_i = cmap[i];
2472         for (j=0; j<jmax; j++) {
2473           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2474         }
2475       } else {
2476         cmap[i] = NULL;
2477       }
2478     }
2479 #else
2480     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2481     for (i=0; i<ismax; i++) {
2482       if (!allcolumns[i]) {
2483         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2484         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2485         jmax   = ncol[i];
2486         icol_i = icol[i];
2487         cmap_i = cmap[i];
2488         for (j=0; j<jmax; j++) {
2489           cmap_i[icol_i[j]] = j+1;
2490         }
2491       } else {
2492         cmap[i] = NULL;
2493       }
2494     }
2495 #endif
2496   }
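  /* As in the single-IS routine, cmap[i] stores submatrix column indices shifted by one so that a lookup
     of 0 means the column is not in iscol[i]; when allcolumns[i] no map is needed and cmap[i] is NULL. */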
2497 
2498   /* Create lens which is required for MatSeqAIJSetPreallocation() */
2499   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2500   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2501   if (ismax) {
2502     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2503     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2504   }
2505   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2506 
2507   /* Update lens from local data */
2508   for (i=0; i<ismax; i++) {
2509     jmax = nrow[i];
2510     if (!allcolumns[i]) cmap_i = cmap[i];
2511     irow_i = irow[i];
2512     lens_i = lens[i];
2513     for (j=0; j<jmax; j++) {
2514       l   = 0;
2515       row = irow_i[j];
2516       while (row >= C->rmap->range[l+1]) l++;
2517       proc = l;
2518       if (proc == rank) {
2519         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2520         if (!allcolumns[i]) {
2521           for (k=0; k<ncols; k++) {
2522 #if defined(PETSC_USE_CTABLE)
2523             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2524 #else
2525             tcol = cmap_i[cols[k]];
2526 #endif
2527             if (tcol) lens_i[j]++;
2528           }
2529         } else { /* allcolumns */
2530           lens_i[j] = ncols;
2531         }
2532         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2533       }
2534     }
2535   }
2536 
2537   /* Create row map: global row of C -> local row of submatrices */
2538 #if defined(PETSC_USE_CTABLE)
2539   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2540   for (i=0; i<ismax; i++) {
2541     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2542     irow_i = irow[i];
2543     jmax   = nrow[i];
2544     for (j=0; j<jmax; j++) {
2545       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2546     }
2547   }
2548 #else
2549   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2550   if (ismax) {
2551     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2552     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2553   }
2554   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2555   for (i=0; i<ismax; i++) {
2556     rmap_i = rmap[i];
2557     irow_i = irow[i];
2558     jmax   = nrow[i];
2559     for (j=0; j<jmax; j++) {
2560       rmap_i[irow_i[j]] = j;
2561     }
2562   }
2563 #endif
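  /* Without PETSC_USE_CTABLE the row map is a dense array of ismax*C->rmap->N integers, which can be
     costly in memory for many index sets on a large matrix; the PetscTable branch stores only the
     requested rows (shifted by one, hence the row-- after PetscTableFind below). */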
2564 
2565   /* Update lens from offproc data */
2566   {
2567     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2568 
2569     for (tmp2=0; tmp2<nrqs; tmp2++) {
2570       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2571       idex    = pa[idex2];
2572       sbuf1_i = sbuf1[idex];
2573       jmax    = sbuf1_i[0];
2574       ct1     = 2*jmax+1;
2575       ct2     = 0;
2576       rbuf2_i = rbuf2[idex2];
2577       rbuf3_i = rbuf3[idex2];
2578       for (j=1; j<=jmax; j++) {
2579         is_no  = sbuf1_i[2*j-1];
2580         max1   = sbuf1_i[2*j];
2581         lens_i = lens[is_no];
2582         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2583         rmap_i = rmap[is_no];
2584         for (k=0; k<max1; k++,ct1++) {
2585 #if defined(PETSC_USE_CTABLE)
2586           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2587           row--;
2588           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2589 #else
2590           row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new (sub)matrix */
2591 #endif
2592           max2 = rbuf2_i[ct1];
2593           for (l=0; l<max2; l++,ct2++) {
2594             if (!allcolumns[is_no]) {
2595 #if defined(PETSC_USE_CTABLE)
2596               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2597 #else
2598               tcol = cmap_i[rbuf3_i[ct2]];
2599 #endif
2600               if (tcol) lens_i[row]++;
2601             } else { /* allcolumns */
2602               lens_i[row]++; /* lens_i[row] += max2 ? */
2603             }
2604           }
2605         }
2606       }
2607     }
2608   }
2609   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2610   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2611   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2612   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2613   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2614 
2615   /* Create the submatrices */
2616   if (scall == MAT_REUSE_MATRIX) {
2617     PetscBool flag;
2618 
2619     /*
2620         Assumes the new rows are the same length as the old rows, hence bug!
2621     */
2622     for (i=0; i<ismax; i++) {
2623       mat = (Mat_SeqAIJ*)(submats[i]->data);
2624       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2625       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2626       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2627       /* Initialize matrix as if empty */
2628       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2629 
2630       submats[i]->factortype = C->factortype;
2631     }
2632   } else {
2633     for (i=0; i<ismax; i++) {
2634       PetscInt rbs,cbs;
2635       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2636       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2637 
2638       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2639       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2640 
2641       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2642       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2643       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2644     }
2645   }
2646 
2647   /* Assemble the matrices */
2648   /* First assemble the local rows */
2649   {
2650     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2651     PetscScalar *imat_a;
2652 
2653     for (i=0; i<ismax; i++) {
2654       mat       = (Mat_SeqAIJ*)submats[i]->data;
2655       imat_ilen = mat->ilen;
2656       imat_j    = mat->j;
2657       imat_i    = mat->i;
2658       imat_a    = mat->a;
2659 
2660       if (!allcolumns[i]) cmap_i = cmap[i];
2661       rmap_i = rmap[i];
2662       irow_i = irow[i];
2663       jmax   = nrow[i];
2664       for (j=0; j<jmax; j++) {
2665         l   = 0;
2666         row = irow_i[j];
2667         while (row >= C->rmap->range[l+1]) l++;
2668         proc = l;
2669         if (proc == rank) {
2670           old_row = row;
2671 #if defined(PETSC_USE_CTABLE)
2672           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2673           row--;
2674 #else
2675           row = rmap_i[row];
2676 #endif
2677           ilen_row = imat_ilen[row];
2678           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2679           mat_i    = imat_i[row];
2680           mat_a    = imat_a + mat_i;
2681           mat_j    = imat_j + mat_i;
2682           if (!allcolumns[i]) {
2683             for (k=0; k<ncols; k++) {
2684 #if defined(PETSC_USE_CTABLE)
2685               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2686 #else
2687               tcol = cmap_i[cols[k]];
2688 #endif
2689               if (tcol) {
2690                 *mat_j++ = tcol - 1;
2691                 *mat_a++ = vals[k];
2692                 ilen_row++;
2693               }
2694             }
2695           } else { /* allcolumns */
2696             for (k=0; k<ncols; k++) {
2697               *mat_j++ = cols[k];  /* global col index! */
2698               *mat_a++ = vals[k];
2699               ilen_row++;
2700             }
2701           }
2702           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2703 
2704           imat_ilen[row] = ilen_row;
2705         }
2706       }
2707     }
2708   }
2709 
2710   /* Now assemble the off-proc rows */
2711   {
2712     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2713     PetscInt    *imat_j,*imat_i;
2714     PetscScalar *imat_a,*rbuf4_i;
2715 
2716     for (tmp2=0; tmp2<nrqs; tmp2++) {
2717       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2718       idex    = pa[idex2];
2719       sbuf1_i = sbuf1[idex];
2720       jmax    = sbuf1_i[0];
2721       ct1     = 2*jmax + 1;
2722       ct2     = 0;
2723       rbuf2_i = rbuf2[idex2];
2724       rbuf3_i = rbuf3[idex2];
2725       rbuf4_i = rbuf4[idex2];
2726       for (j=1; j<=jmax; j++) {
2727         is_no     = sbuf1_i[2*j-1];
2728         rmap_i    = rmap[is_no];
2729         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2730         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2731         imat_ilen = mat->ilen;
2732         imat_j    = mat->j;
2733         imat_i    = mat->i;
2734         imat_a    = mat->a;
2735         max1      = sbuf1_i[2*j];
2736         for (k=0; k<max1; k++,ct1++) {
2737           row = sbuf1_i[ct1];
2738 #if defined(PETSC_USE_CTABLE)
2739           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2740           row--;
2741 #else
2742           row = rmap_i[row];
2743 #endif
2744           ilen  = imat_ilen[row];
2745           mat_i = imat_i[row];
2746           mat_a = imat_a + mat_i;
2747           mat_j = imat_j + mat_i;
2748           max2  = rbuf2_i[ct1];
2749           if (!allcolumns[is_no]) {
2750             for (l=0; l<max2; l++,ct2++) {
2751 
2752 #if defined(PETSC_USE_CTABLE)
2753               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2754 #else
2755               tcol = cmap_i[rbuf3_i[ct2]];
2756 #endif
2757               if (tcol) {
2758                 *mat_j++ = tcol - 1;
2759                 *mat_a++ = rbuf4_i[ct2];
2760                 ilen++;
2761               }
2762             }
2763           } else { /* allcolumns */
2764             for (l=0; l<max2; l++,ct2++) {
2765               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2766               *mat_a++ = rbuf4_i[ct2];
2767               ilen++;
2768             }
2769           }
2770           imat_ilen[row] = ilen;
2771         }
2772       }
2773     }
2774   }
2775 
2776   /* sort the rows */
2777   {
2778     PetscInt    *imat_ilen,*imat_j,*imat_i;
2779     PetscScalar *imat_a;
2780 
2781     for (i=0; i<ismax; i++) {
2782       mat       = (Mat_SeqAIJ*)submats[i]->data;
2783       imat_j    = mat->j;
2784       imat_i    = mat->i;
2785       imat_a    = mat->a;
2786       imat_ilen = mat->ilen;
2787 
2788       if (allcolumns[i]) continue;
2789       jmax = nrow[i];
2790       for (j=0; j<jmax; j++) {
2791         PetscInt ilen;
2792 
2793         mat_i = imat_i[j];
2794         mat_a = imat_a + mat_i;
2795         mat_j = imat_j + mat_i;
2796         ilen  = imat_ilen[j];
2797         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
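        /* For example, with ilen = 3, mat_j = {7,2,5} and mat_a = {a,b,c}, the call above leaves
           mat_j = {2,5,7} and applies the same permutation to the values, so mat_a becomes {b,c,a}. */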
2798       }
2799     }
2800   }
2801 
2802   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2803   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2804   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2805   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2806   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2807 
2808   /* Restore the indices */
2809   for (i=0; i<ismax; i++) {
2810     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2811     if (!allcolumns[i]) {
2812       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2813     }
2814   }
2815 
2816   /* Destroy allocated memory */
2817   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2818   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2819   ierr = PetscFree(pa);CHKERRQ(ierr);
2820 
2821   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2822   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2823   for (i=0; i<nrqr; ++i) {
2824     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2825   }
2826   for (i=0; i<nrqs; ++i) {
2827     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2828     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2829   }
2830 
2831   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2832   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2833   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2834   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2835   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2836   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2837   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2838 
2839 #if defined(PETSC_USE_CTABLE)
2840   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2841 #else
2842   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2843 #endif
2844   ierr = PetscFree(rmap);CHKERRQ(ierr);
2845 
2846   for (i=0; i<ismax; i++) {
2847     if (!allcolumns[i]) {
2848 #if defined(PETSC_USE_CTABLE)
2849       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2850 #else
2851       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2852 #endif
2853     }
2854   }
2855   ierr = PetscFree(cmap);CHKERRQ(ierr);
2856   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2857   ierr = PetscFree(lens);CHKERRQ(ierr);
2858 
2859   for (i=0; i<ismax; i++) {
2860     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2861     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2862   }
2863   PetscFunctionReturn(0);
2864 }
2865 
2866 /*
2867  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2868  The embeddings must be injections, and this implies that the range of rowemb is a subset
2869  of [0,m), that of dcolemb is a subset of [0,n), and that of ocolemb is a subset of [0,N-n).
2870  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2871  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2872  state, and needs to be "assembled" later by compressing B's column space.
2873 
2874  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2875  Following this call, C->A & C->B have been created, even if empty.
2876  */
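/*
 A small worked example of the column "disassembly" performed below (an illustrative sketch, not part
 of the implementation): suppose C has local sizes m = n = 4 on this rank, global column size N = 12,
 and row ownership range [rstart,rend) = [4,8), which for this square layout is also the diagonal
 column block.  B's column indices (after any ocolemb mapping) lie in [0, N-n) = [0,8).  With
 shift = rend - rstart = 4, any column col >= rstart is moved past the diagonal block:

    col = 2  ->  2           (stays to the left of the diagonal block)
    col = 5  ->  5 + 4 = 9   (lands to the right of the diagonal block)

 so the off-diagonal part of C ends up expressed in C's global column space.
*/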
2877 #undef __FUNCT__
2878 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
2879 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2880 {
2881   /* If making this function public, change the error returned in this function away from _PLIB. */
2882   PetscErrorCode ierr;
2883   Mat_MPIAIJ     *aij;
2884   Mat_SeqAIJ     *Baij;
2885   PetscBool      seqaij,Bdisassembled;
2886   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2887   PetscScalar    v;
2888   const PetscInt *rowindices,*colindices;
2889 
2890   PetscFunctionBegin;
2891   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2892   if (A) {
2893     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2894     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2895     if (rowemb) {
2896       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2897       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2898     } else {
2899       if (C->rmap->n != A->rmap->n) {
2900 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2901       }
2902     }
2903     if (dcolemb) {
2904       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2905       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2906     } else {
2907       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2908     }
2909   }
2910   if (B) {
2911     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2912     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2913     if (rowemb) {
2914       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2915       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2916     } else {
2917       if (C->rmap->n != B->rmap->n) {
2918 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2919       }
2920     }
2921     if (ocolemb) {
2922       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2923       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2924     } else {
2925       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2926     }
2927   }
2928 
2929   aij    = (Mat_MPIAIJ*)(C->data);
2930   if (!aij->A) {
2931     /* Mimic parts of MatMPIAIJSetPreallocation() */
2932     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2933     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2934     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2935     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2936     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2937   }
2938   if (A) {
2939     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2940   } else {
2941     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2942   }
2943   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2944     /*
2945       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2946       need to "disassemble" B -- convert it to use C's global column indices.
2947       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2948 
2949       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2950       we call MatZeroEntries(aij->B) first, so there may be stored zeros that, perhaps, could be compacted out.
2951 
2952       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2953       At least avoid calling MatSetValues() and the implied searches?
2954     */
2955 
2956     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2957 #if defined(PETSC_USE_CTABLE)
2958       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2959 #else
2960       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2961       /* A bit of a HACK: ideally we should deal with the case of aij->B all in one code block below. */
2962       if (aij->B) {
2963         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2964       }
2965 #endif
2966       ngcol = 0;
2967       if (aij->lvec) {
2968 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2969       }
2970       if (aij->garray) {
2971 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2972 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2973       }
2974       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2975       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2976     }
2977     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2978       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2979     }
2980     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2981       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2982     }
2983   }
2984   Bdisassembled = PETSC_FALSE;
2985   if (!aij->B) {
2986     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2987     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2988     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2989     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2990     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2991     Bdisassembled = PETSC_TRUE;
2992   }
2993   if (B) {
2994     Baij = (Mat_SeqAIJ*)(B->data);
2995     if (pattern == DIFFERENT_NONZERO_PATTERN) {
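      /* Preallocate aij->B row by row from B's CSR row pointers: row i of B has Baij->i[i+1] - Baij->i[i] stored entries. */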
2996       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2997       for (i=0; i<B->rmap->n; i++) {
2998 	nz[i] = Baij->i[i+1] - Baij->i[i];
2999       }
3000       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3001       ierr = PetscFree(nz);CHKERRQ(ierr);
3002     }
3003 
3004     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3005     shift = rend-rstart;
3006     count = 0;
3007     rowindices = NULL;
3008     colindices = NULL;
3009     if (rowemb) {
3010       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3011     }
3012     if (ocolemb) {
3013       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3014     }
3015     for (i=0; i<B->rmap->n; i++) {
3016       PetscInt row;
3017       row = i;
3018       if (rowindices) row = rowindices[i];
3019       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3020 	col  = Baij->j[count];
3021 	if (colindices) col = colindices[col];
3022 	if (Bdisassembled && col>=rstart) col += shift;
3023 	v    = Baij->a[count];
3024 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3025 	++count;
3026       }
3027     }
3028     /* No assembly for aij->B is necessary. */
3029     /* FIXME: set aij->B's nonzerostate correctly. */
3030   } else {
3031     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3032   }
3033   C->preallocated  = PETSC_TRUE;
3034   C->was_assembled = PETSC_FALSE;
3035   C->assembled     = PETSC_FALSE;
3036   /*
3037      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3038      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3039   */
3040   PetscFunctionReturn(0);
3041 }
3042 
3043 #undef __FUNCT__
3044 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
3045 /*
3046   B uses local column indices in the range [0, N-n); they must be interpreted using garray (see the sketch following this function).
3047  */
3048 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3049 {
3050   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3051 
3052   PetscFunctionBegin;
3053   PetscValidPointer(A,2);
3054   PetscValidPointer(B,3);
3055   /* FIXME: make sure C is assembled */
3056   *A = aij->A;
3057   *B = aij->B;
3058   /* Note that we don't incref *A and *B, so be careful! */
3059   PetscFunctionReturn(0);
3060 }
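/*
 A minimal usage sketch for the garray interpretation noted above (illustrative only; it assumes C is
 an assembled MATMPIAIJ matrix whose off-diagonal part has at least one stored entry, and the local
 variables are hypothetical):

   Mat         A,B;
   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
   PetscInt    lcol,gcol;

   ierr = MatGetSeqMats_MPIAIJ(C,&A,&B);CHKERRQ(ierr);
   lcol = ((Mat_SeqAIJ*)B->data)->j[0];    first stored entry's local column index in B
   gcol = aij->garray[lcol];               the corresponding global column index of C
*/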
3061 
3062 /*
3063   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3064   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3065 */
3066 #undef __FUNCT__
3067 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
3068 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3069                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3070 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3071 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3072 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3073 {
3074   PetscErrorCode ierr;
3075   PetscMPIInt    isize,flag;
3076   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3077   Mat            *A,*B;
3078   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3079 
3080   PetscFunctionBegin;
3081   if (!ismax) PetscFunctionReturn(0);
3082 
3083   for (i = 0, cismax = 0; i < ismax; ++i) {
3084     PetscMPIInt isize;
3085     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3086     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3087     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3088     if (isize > 1) ++cismax;
3089   }
3090   /*
3091      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3092      ispar is the maximum over C's ranks of the per-rank count of parallel ISs, so it is nonzero iff any rank holds a parallel IS.
3093   */
3094   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3095   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3096     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3097     PetscFunctionReturn(0);
3098   }
3099 
3100   /* From here on, handle the parallel case (ispar != 0). */
3101   /*
3102     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3103     These are used to extract the off-diag portion of the resulting parallel matrix.
3104     The row IS for the off-diag portion is the same as for the diag portion,
3105     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3106   */
3107   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3108   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3109   for (i = 0, ii = 0; i < ismax; ++i) {
3110     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3111     if (isize > 1) {
3112       /*
3113 	 TODO: This is the part that's ***NOT SCALABLE***.
3114 	 To fix this we need to extract just the indices of C's nonzero columns
3115 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3116 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3117 	 to be done without serializing on the IS list, so, most likely, it is best
3118 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3119       */
3120       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3121       /* Now we have to
3122 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3123 	     were sorted on each rank, concatenated they might no longer be sorted;
3124 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3125 	     indices in the nondecreasing order to the original index positions.
3126 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3127       */
3128       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3129       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3130       ++ii;
3131     }
3132   }
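  /*
     A sketch of the semantics described above: if ciscol[ii] initially holds the indices {8,3,5},
     then ISSortPermutation() produces ciscol_p[ii] = {1,2,0} (the original position of each entry of
     the sorted sequence) and ISSort() leaves ciscol[ii] = {3,5,8}.  Had the indices already been
     strictly increasing, ciscol_p[ii] would be NULL.
  */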
3133   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3134   for (i = 0, ii = 0; i < ismax; ++i) {
3135     PetscInt       j,issize;
3136     const PetscInt *indices;
3137 
3138     /*
3139        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3140      */
3141     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3142     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3143     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3144     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3145     for (j = 1; j < issize; ++j) {
3146       if (indices[j] == indices[j-1]) {
3147 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3148       }
3149     }
3150     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3151 
3152 
3153     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3154     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3155     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3156     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3157     for (j = 1; j < issize; ++j) {
3158       if (indices[j-1] == indices[j]) {
3159 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3160       }
3161     }
3162     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3163     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3164     if (isize > 1) {
3165       cisrow[ii] = isrow[i];
3166       ++ii;
3167     }
3168   }
3169   /*
3170     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3171     array of sequential matrices underlying the resulting parallel matrices.
3172     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3173     contain duplicates.
3174 
3175     There are as many diag matrices as there are original index sets. There are only as many parallel
3176     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3177 
3178     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3179     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3180       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3181       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3182       with A[i] and B[ii] extracted from the corresponding MPI submat.
3183     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3184       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3185       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3186       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
3187       their values into A[i] and B[ii] sitting inside the corresponding submat.
3188     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3189       A[i], B[ii], AA[i] or BB[ii] matrices.
3190   */
3191   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3192   if (scall == MAT_INITIAL_MATRIX) {
3193     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3194     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3195   } else {
3196     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3197     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3198     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the corresponding permutation ISs are non-NULL. */
3199     for (i = 0, ii = 0; i < ismax; ++i) {
3200       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3201       if (isize > 1) {
3202 	Mat AA,BB;
3203 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3204 	if (!isrow_p[i] && !iscol_p[i]) {
3205 	  A[i] = AA;
3206 	} else {
3207 	  /* TODO: extract A[i] composed on AA. */
3208 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3209 	}
3210 	if (!isrow_p[i] && !ciscol_p[ii]) {
3211 	  B[ii] = BB;
3212 	} else {
3213 	  /* TODO: extract B[ii] composed on BB. */
3214 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3215 	}
3216 	++ii;
3217       } else {
3218 	if (!isrow_p[i] && !iscol_p[i]) {
3219 	  A[i] = (*submat)[i];
3220 	} else {
3221 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3222 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3223 	}
3224       }
3225     }
3226   }
3227   /* Now obtain the sequential A and B submatrices separately. */
3228   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3229   /* TODO: no good fix for this yet.
3230    * The local column size of B[i] can differ from the size of ciscol[i]:
3231    * B[i]'s column size is ultimately determined during MatAssembly, while
3232    * ciscol[i] is computed as the complement of iscol[i].
3233    * It would be better to keep only the columns with nonzeros when computing
3234    * the complement ciscol[i].
3235    */
3236   if (scall == MAT_REUSE_MATRIX) {
3237     for (i=0; i<cismax; i++) {
3238       ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3239       B[i]->cmap->n = ciscol_localsize;
3240     }
3241   }
3242   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3243   /*
3244     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3245     matrices A & B have been extracted directly into the parallel matrices containing them, or
3246     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3247     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3248     to have the same sparsity pattern.
3249     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3250     must be constructed for C. This is done by setseqmat(s).
3251   */
3252   for (i = 0, ii = 0; i < ismax; ++i) {
3253     /*
3254        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3255        That way we can avoid sorting and computing permutations when reusing.
3256        To this end:
3257         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3258 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3259 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3260     */
3261     MatStructure pattern;
3262     if (scall == MAT_INITIAL_MATRIX) {
3263       pattern = DIFFERENT_NONZERO_PATTERN;
3264     } else {
3265       pattern = SAME_NONZERO_PATTERN;
3266     }
3267     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3268     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3269     if (isize > 1) {
3270       if (scall == MAT_INITIAL_MATRIX) {
3271 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3272 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3273 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3274 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3275 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3276       }
3277       /*
3278 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3279       */
3280       {
3281 	Mat AA,BB;
3282 	AA = NULL;
3283 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3284 	BB = NULL;
3285 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3286 	if (AA || BB) {
3287 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3288 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3289 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3290 	}
3291 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3292 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3293 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3294 	}
3295 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3296 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
3297 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3298 	}
3299       }
3300       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3301       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3302       ++ii;
3303     } else { /* if (isize == 1) */
3304       if (scall == MAT_INITIAL_MATRIX) {
3305 	if (isrow_p[i] || iscol_p[i]) {
3306 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3307 	} else (*submat)[i] = A[i];
3308       }
3309       if (isrow_p[i] || iscol_p[i]) {
3310 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3311 	/* Otherwise A is extracted straight into (*submat)[i]. */
3312 	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3313 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3314       }
3315     }
3316     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3317     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3318   }
3319   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3320   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3321   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3322   ierr = PetscFree(A);CHKERRQ(ierr);
3323   ierr = PetscFree(B);CHKERRQ(ierr);
3324   PetscFunctionReturn(0);
3325 }
3326 
3327 
3328 
3329 #undef __FUNCT__
3330 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
3331 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3332 {
3333   PetscErrorCode ierr;
3334 
3335   PetscFunctionBegin;
3336   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3337   PetscFunctionReturn(0);
3338 }
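
/*
 A minimal calling sketch (illustrative only): C is assumed to be an assembled MATMPIAIJ matrix, and
 isrow[0]/iscol[0] are index sets that may live on a subcommunicator of C's communicator.

   Mat *submat;
   IS  isrow[1],iscol[1];

   ... create isrow[0] and iscol[0] ...
   ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
   ... use submat[0], possibly modify C's values ...
   ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
*/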
3339