xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a51a273958bfb707f5023b6376b38d3c6478ebd9)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
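
/*
   Illustrative usage sketch (not part of this file): a caller typically builds
   sequential index sets of global row indices and then grows them by ov layers
   of overlap, which dispatches to MatIncreaseOverlap_MPIAIJ() above:

     IS is[1];
     ierr = ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,1);CHKERRQ(ierr);
     ierr = ISDestroy(&is[0]);CHKERRQ(ierr);

   Here nlocal and rows (the caller's row indices) are hypothetical, and C is an
   assembled MATMPIAIJ matrix.
*/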
33 
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48 {
49   PetscErrorCode ierr;
50   MPI_Comm       comm;
51   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54   const PetscInt *indices_i,**indices;
55   PetscLayout    rmap;
56   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
57   PetscSF        sf;
58   PetscSFNode    *remote;
59 
60   PetscFunctionBegin;
61   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
62   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
63   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
64   /* get row map to determine where rows should be going */
65   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
66   /* retrieve the IS data and put it all together so that we
67    * can optimize communication
68    */
69   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
70   for (i=0,tlength=0; i<nidx; i++) {
71     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
72     tlength += length[i];
73     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
74   }
75   /* find these rows on remote processors */
76   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
77   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
78   nrrows = 0;
79   for (i=0; i<nidx; i++) {
80     length_i  = length[i];
81     indices_i = indices[i];
82     for (j=0; j<length_i; j++) {
83       owner = -1;
84       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
85       /* remote processors */
86       if (owner != rank) {
87         tosizes_temp[owner]++; /* number of rows to owner */
88         rrow_ranks[nrrows]  = owner; /* processor */
89         rrow_isids[nrrows]   = i; /* is id */
90         remoterows[nrrows++] = indices_i[j]; /* row */
91       }
92     }
93     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
94   }
95   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
96   /* check whether we need to exchange messages;
97    * generally speaking, no data needs to be exchanged
98    * when the overlap is 1
99    */
100   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
101   /* there are no messages to exchange;
102    * this usually corresponds to overlap 1
103    */
104   if (!reducednrrows) {
105     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
106     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
107     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
108     PetscFunctionReturn(0);
109   }
110   nto = 0;
111   /* send sizes and ranks for building a two-sided communication */
112   for (i=0; i<size; i++) {
113     if (tosizes_temp[i]) {
114       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115       tosizes_temp[i] = nto; /* a map from processor to index */
116       toranks[nto++]  = i; /* processor */
117     }
118   }
119   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
120   toffsets[0] = 0;
121   for (i=0; i<nto; i++) {
122     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124   }
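  /* at this point tosizes[2*i] is the number of integers destined for toranks[i]
     (two per requested row) and tosizes[2*i+1] is the offset of that block in todata */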
125   /* send information to other processors */
126   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
127   nrecvrows = 0;
128   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
130   nrecvrows = 0;
131   for (i=0; i<nfrom; i++) {
132     for (j=0; j<fromsizes[2*i]; j++) {
133       remote[nrecvrows].rank    = fromranks[i];
134       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135     }
136   }
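  /* each SF leaf is one integer this rank will receive; its root is the entry of the
     sending rank's todata buffer at offset fromsizes[2*i+1]+j on rank fromranks[i] */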
137   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
138   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
139   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
140   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
141   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
142   /* each message entry is a pair <index set id, row> */
143   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
144   for (i=0; i<nrrows; i++) {
145     owner = rrow_ranks[i]; /* processor */
146     j     = tosizes_temp[owner]; /* index */
147     todata[toffsets[j]++] = rrow_isids[i];
148     todata[toffsets[j]++] = remoterows[i];
149   }
150   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
151   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
152   ierr = PetscFree(toffsets);CHKERRQ(ierr);
153   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
154   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
155   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
156   /* send the rows belonging to remote processors so that we can get the overlapping data back */
157   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
158   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
159   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
160   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
161   ierr = PetscFree(fromranks);CHKERRQ(ierr);
162   nrecvrows = 0;
163   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
165   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
166   nrecvrows = 0;
167   for (i=0; i<nto; i++) {
168     for (j=0; j<tosizes[2*i]; j++) {
169       remote[nrecvrows].rank    = toranks[i];
170       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171     }
172   }
173   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
174   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
175   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
177   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
178   /* overlap communication and computation */
179   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
180   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
181   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
183   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
184   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
185   ierr = PetscFree(toranks);CHKERRQ(ierr);
186   ierr = PetscFree(tosizes);CHKERRQ(ierr);
187   ierr = PetscFree(todata);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
190 
191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192 {
193   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195   const PetscInt   *indices_i_temp;
196   MPI_Comm         *iscomms;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++) {
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++) {
209     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
210     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
211     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
212     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
214   }
215   /* unpack the received data: for each index set id, append its overlap rows */
216   for (i=0; i<nrecvs;) {
217     is_id     = recvdata[i++];
218     data_size = recvdata[i++];
219     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220     isz_i     = isz[is_id];
221     for (j=0; j< data_size; j++) {
222       col = recvdata[i++];
223       indices_i[isz_i++] = col;
224     }
225     isz[is_id] = isz_i;
226   }
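  /* each index set's slot in indices_temp holds max_lsize+nrecvs entries, so the
     appends above cannot overflow even if all received rows target the same set */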
227   /* remove duplicate entries */
228   for (i=0; i<nidx; i++) {
229     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230     isz_i     = isz[i];
231     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
232     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
233     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
234   }
235   ierr = PetscFree(isz);CHKERRQ(ierr);
236   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
237   PetscFunctionReturn(0);
238 }
239 
240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241 {
242   PetscLayout       rmap,cmap;
243   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247   Mat               amat,bmat;
248   PetscMPIInt       rank;
249   PetscBool         done;
250   MPI_Comm          comm;
251   PetscErrorCode    ierr;
252 
253   PetscFunctionBegin;
254   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
255   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
256   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
257   /* even if the matrix is symmetric, we do not exploit the symmetry here */
258   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   /* total number of nonzero values is used to estimate the memory usage in the next step */
263   tnz  = ai[an]+bi[bn];
264   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
265   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
267   /* to find the longest message */
268   max_fszs = 0;
269   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270   /* is there a better way to estimate the number of nonzeros in the mat? */
271   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
272   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273   rows_pos  = 0;
274   totalrows = 0;
275   for (i=0; i<nfrom; i++) {
276     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
277     /* group data together */
278     for (j=0; j<fromsizes[2*i]; j+=2) {
279       is_id                       = fromrows[rows_pos++];/* no of is */
280       rows_i                      = rows_data[is_id];
281       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282     }
283     /* estimate a space to avoid multiple allocations  */
284     for (j=0; j<nidx; j++) {
285       indvc_ij = 0;
286       rows_i   = rows_data[j];
287       for (l=0; l<rows_pos_i[j]; l++) {
288         row    = rows_i[l]-rstart;
289         start  = ai[row];
290         end    = ai[row+1];
291         for (k=start; k<end; k++) { /* Amat */
292           col = aj[k] + cstart;
293           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294         }
295         start = bi[row];
296         end   = bi[row+1];
297         for (k=start; k<end; k++) { /* Bmat */
298           col = gcols[bj[k]];
299           indices_tmp[indvc_ij++] = col;
300         }
301       }
302       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
303       indv_counts[i*nidx+j] = indvc_ij;
304       totalrows            += indvc_ij;
305     }
306   }
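  /* the pass above only counts the unique columns per <message, index set> pair
     (indv_counts) so that sbdata can be sized exactly; the pass below repeats the
     traversal and actually fills the buffer */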
307   /* each message is a triple <index set id, number of rows, rows> */
308   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
309   totalrows = 0;
310   rows_pos  = 0;
311   /* regroup the requested rows by index set, exactly as in the counting pass above */
312   for (i=0;i<nfrom;i++) {
313     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
314     for (j=0; j<fromsizes[2*i]; j+=2) {
315       is_id                       = fromrows[rows_pos++];
316       rows_i                      = rows_data[is_id];
317       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318     }
319     /* add data  */
320     for (j=0; j<nidx; j++) {
321       if (!indv_counts[i*nidx+j]) continue;
322       indvc_ij = 0;
323       sbdata[totalrows++] = j;
324       sbdata[totalrows++] = indv_counts[i*nidx+j];
325       sbsizes[2*i]       += 2;
326       rows_i              = rows_data[j];
327       for (l=0; l<rows_pos_i[j]; l++) {
328         row   = rows_i[l]-rstart;
329         start = ai[row];
330         end   = ai[row+1];
331         for (k=start; k<end; k++) { /* Amat */
332           col = aj[k] + cstart;
333           indices_tmp[indvc_ij++] = col;
334         }
335         start = bi[row];
336         end   = bi[row+1];
337         for (k=start; k<end; k++) { /* Bmat */
338           col = gcols[bj[k]];
339           indices_tmp[indvc_ij++] = col;
340         }
341       }
342       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
343       sbsizes[2*i]  += indvc_ij;
344       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
345       totalrows += indvc_ij;
346     }
347   }
348   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
349   offsets[0] = 0;
350   for (i=0; i<nfrom; i++) {
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
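  /* sbsizes[2*i] is the length of the reply destined for fromranks[i] and sbsizes[2*i+1]
     its offset within sbdata; the caller feeds these pairs to PetscCommBuildTwoSided and
     builds an SF whose roots point at those offsets */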
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp;
368   PetscMPIInt       rank,owner;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   /* is this a safe way to compute the number of nonzero values? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* this estimates memory better than the old implementation,
389    * which used the global size of the matrix
390    */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 /*
432   Sample message format:
433   If a processor A wants processor B to process some elements corresponding
434   to index sets is[1],is[5]
435   mesg [0] = 2   (no of index sets in the mesg)
436   -----------
437   mesg [1] = 1 => is[1]
438   mesg [2] = sizeof(is[1]);
439   -----------
440   mesg [3] = 5  => is[5]
441   mesg [4] = sizeof(is[5]);
442   -----------
443   mesg [5] ...
444   mesg [n]   = data(is[1])
445   -----------
446   mesg [n+1] ...
447   mesg [m]   = data(is[5])
448   -----------
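
  For example, a message requesting rows {3,7} for is[1] and row {12} for is[5]
  would be packed as  2, 1, 2, 5, 1, 3, 7, 12  (an illustrative instance of the
  layout above, not taken from an actual run).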
449 
450   Notes:
451   nrqs - no of requests sent (or to be sent out)
452   nrqr - no of requests received (which have to be or which have been processed)
453 */
454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455 {
456   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458   const PetscInt **idx,*idx_i;
459   PetscInt       *n,**data,len;
460 #if defined(PETSC_USE_CTABLE)
461   PetscTable     *table_data,table_data_i;
462   PetscInt       *tdata,tcount,tcount_max;
463 #else
464   PetscInt       *data_i,*d_p;
465 #endif
466   PetscErrorCode ierr;
467   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470   PetscBT        *table;
471   MPI_Comm       comm;
472   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473   MPI_Status     *recv_status;
474   MPI_Comm       *iscomms;
475   char           *t_p;
476 
477   PetscFunctionBegin;
478   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
479   size = c->size;
480   rank = c->rank;
481   M    = C->rmap->N;
482 
483   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
485 
486   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
487 
488   for (i=0; i<imax; i++) {
489     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
490     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
491   }
492 
493   /* evaluate communication - mesg to whom, length of mesg, and buffer space
494      required. Based on this, buffers are allocated, and data is copied into them */
495   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
496   for (i=0; i<imax; i++) {
497     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
498     idx_i = idx[i];
499     len   = n[i];
500     for (j=0; j<len; j++) {
501       row = idx_i[j];
502       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
504       w4[proc]++;
505     }
506     for (j=0; j<size; j++) {
507       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508     }
509   }
510 
511   nrqs     = 0;              /* no of outgoing messages */
512   msz      = 0;              /* total mesg length (for all procs) */
513   w1[rank] = 0;              /* no mesg sent to itself */
514   w3[rank] = 0;
515   for (i=0; i<size; i++) {
516     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517   }
518   /* pa - list of processors to communicate with */
519   ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
520   for (i=0,j=0; i<size; i++) {
521     if (w1[i]) {pa[j] = i; j++;}
522   }
523 
524   /* Each message would have a header = 1 + 2*(no of IS) + data */
525   for (i=0; i<nrqs; i++) {
526     j      = pa[i];
527     w1[j] += w2[j] + 2*w3[j];
528     msz   += w1[j];
529   }
530 
531   /* Determine the number of messages to expect, their lengths, and from-ids */
532   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
533   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
534 
535   /* Now post the Irecvs corresponding to these messages */
536   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
537 
538   /* Allocate Memory for outgoing messages */
539   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
540   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
541   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
542 
543   {
544     PetscInt *iptr = tmp,ict  = 0;
545     for (i=0; i<nrqs; i++) {
546       j         = pa[i];
547       iptr     +=  ict;
548       outdat[j] = iptr;
549       ict       = w1[j];
550     }
551   }
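  /* outdat[pa[i]] now points to the start of the message destined for rank pa[i];
     the messages are packed back to back inside the contiguous tmp buffer */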
552 
553   /* Form the outgoing messages */
554   /* plug in the headers */
555   for (i=0; i<nrqs; i++) {
556     j            = pa[i];
557     outdat[j][0] = 0;
558     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
559     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560   }
561 
562   /* Memory for doing local proc's work */
563   {
564     PetscInt M_BPB_imax = 0;
565 #if defined(PETSC_USE_CTABLE)
566     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
567     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
570     }
571     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574     }
575 #else
576     PetscInt Mimax = 0;
577     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
578     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
579     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582       data[i]  = d_p + M*i;
583     }
584 #endif
585   }
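  /* table[i] is an M-bit bitmap marking which global rows are already present in is[i];
     data[i] (or table_data[i] when PETSC_USE_CTABLE is defined) holds the row indices themselves */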
586 
587   /* Parse the IS and update local tables and the outgoing buf with the data */
588   {
589     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590     PetscBT  table_i;
591 
592     for (i=0; i<imax; i++) {
593       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
594       n_i     = n[i];
595       table_i = table[i];
596       idx_i   = idx[i];
597 #if defined(PETSC_USE_CTABLE)
598       table_data_i = table_data[i];
599 #else
600       data_i  = data[i];
601 #endif
602       isz_i   = isz[i];
603       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604         row  = idx_i[j];
605         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
606         if (proc != rank) { /* copy to the outgoing buffer */
607           ctr[proc]++;
608           *ptr[proc] = row;
609           ptr[proc]++;
610         } else if (!PetscBTLookupSet(table_i,row)) {
611 #if defined(PETSC_USE_CTABLE)
612           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
613 #else
614           data_i[isz_i] = row; /* Update the local table */
615 #endif
616           isz_i++;
617         }
618       }
619       /* Update the headers for the current IS */
620       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621         if ((ctr_j = ctr[j])) {
622           outdat_j        = outdat[j];
623           k               = ++outdat_j[0];
624           outdat_j[2*k]   = ctr_j;
625           outdat_j[2*k-1] = i;
626         }
627       }
628       isz[i] = isz_i;
629     }
630   }
631 
632   /*  Now  post the sends */
633   ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
634   for (i=0; i<nrqs; ++i) {
635     j    = pa[i];
636     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
637   }
638 
639   /* No longer need the original indices */
640   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
641   for (i=0; i<imax; ++i) {
642     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
643     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
644   }
645   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
646 
647   for (i=0; i<imax; ++i) {
648     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
649   }
650 
651   /* Do Local work */
652 #if defined(PETSC_USE_CTABLE)
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
654 #else
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
656 #endif
657 
658   /* Receive messages */
659   ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
660   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
661   ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
662 
663   /* Phase 1 sends are complete - deallocate buffers */
664   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
665   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
666 
667   ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
668   ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
669   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
670   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
671   ierr = PetscFree(rbuf);CHKERRQ(ierr);
672 
673   /* Send the data back */
674   /* Do a global reduction to know the buffer space req for incoming messages */
675   {
676     PetscMPIInt *rw1;
677 
678     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
679 
680     for (i=0; i<nrqr; ++i) {
681       proc = recv_status[i].MPI_SOURCE;
682 
683       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
684       rw1[proc] = isz1[i];
685     }
686     ierr = PetscFree(onodes1);CHKERRQ(ierr);
687     ierr = PetscFree(olengths1);CHKERRQ(ierr);
688 
689     /* Determine the number of messages to expect, their lengths, and from-ids */
690     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
691     ierr = PetscFree(rw1);CHKERRQ(ierr);
692   }
693   /* Now post the Irecvs corresponding to these messages */
694   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
695 
696   /* Now  post the sends */
697   ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
698   for (i=0; i<nrqr; ++i) {
699     j    = recv_status[i].MPI_SOURCE;
700     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
701   }
702 
703   /* receive work done on other processors */
704   {
705     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
706     PetscMPIInt idex;
707     PetscBT     table_i;
708 
709     for (i=0; i<nrqs; ++i) {
710       ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
711       /* Process the message */
712       rbuf2_i = rbuf2[idex];
713       ct1     = 2*rbuf2_i[0]+1;
714       jmax    = rbuf2[idex][0];
715       for (j=1; j<=jmax; j++) {
716         max     = rbuf2_i[2*j];
717         is_no   = rbuf2_i[2*j-1];
718         isz_i   = isz[is_no];
719         table_i = table[is_no];
720 #if defined(PETSC_USE_CTABLE)
721         table_data_i = table_data[is_no];
722 #else
723         data_i  = data[is_no];
724 #endif
725         for (k=0; k<max; k++,ct1++) {
726           row = rbuf2_i[ct1];
727           if (!PetscBTLookupSet(table_i,row)) {
728 #if defined(PETSC_USE_CTABLE)
729             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
730 #else
731             data_i[isz_i] = row;
732 #endif
733             isz_i++;
734           }
735         }
736         isz[is_no] = isz_i;
737       }
738     }
739 
740     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
741   }
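  /* at this point isz[i] counts every entry of the enlarged is[i] - the local work plus
     the overlap returned by remote ranks; the indices themselves live in data[i] (or
     table_data[i] when PETSC_USE_CTABLE is defined) */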
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
768   }
769 
770   ierr = PetscFree(iscomms);CHKERRQ(ierr);
771   ierr = PetscFree(onodes2);CHKERRQ(ierr);
772   ierr = PetscFree(olengths2);CHKERRQ(ierr);
773 
774   ierr = PetscFree(pa);CHKERRQ(ierr);
775   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
776   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
779   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
780   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
781   ierr = PetscFree(recv_status);CHKERRQ(ierr);
782   if (xdata) {
783     ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
784     ierr = PetscFree(xdata);CHKERRQ(ierr);
785   }
786   ierr = PetscFree(isz1);CHKERRQ(ierr);
787 #if defined(PETSC_USE_CTABLE)
788   for (i=0; i<imax; i++) {
789     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
790   }
791   ierr = PetscFree(table_data);CHKERRQ(ierr);
792   ierr = PetscFree(tdata);CHKERRQ(ierr);
793   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
794 #else
795   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
796 #endif
797   PetscFunctionReturn(0);
798 }
799 
800 /*
801    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
802        the work on the local processor.
803 
804      Inputs:
805       C      - the matrix (of type MATMPIAIJ);
806       imax   - total no of index sets processed at a time;
807       table  - an array of char - size = m bits.
808 
809      Output:
810       isz    - array containing the count of the solution elements corresponding
811                to each index set;
812       data or table_data  - pointer to the solutions
813 */
814 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
815 {
816   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817   Mat        A  = c->A,B = c->B;
818   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
820   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
821   PetscBT    table_i;
822 #if defined(PETSC_USE_CTABLE)
823   PetscTable         table_data_i;
824   PetscErrorCode     ierr;
825   PetscTablePosition tpos;
826   PetscInt           tcount,*tdata;
827 #else
828   PetscInt           *data_i;
829 #endif
830 
831   PetscFunctionBegin;
832   rstart = C->rmap->rstart;
833   cstart = C->cmap->rstart;
834   ai     = a->i;
835   aj     = a->j;
836   bi     = b->i;
837   bj     = b->j;
838   garray = c->garray;
839 
840   for (i=0; i<imax; i++) {
841 #if defined(PETSC_USE_CTABLE)
842     /* copy existing entries of table_data_i into tdata[] */
843     table_data_i = table_data[i];
844     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
845     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);
846 
847     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
848     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
849     while (tpos) {
850       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
851       tdata[--j] = --row;
852       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
853     }
854 #else
855     data_i  = data[i];
856 #endif
857     table_i = table[i];
858     isz_i   = isz[i];
859     max     = isz[i];
860 
861     for (j=0; j<max; j++) {
862 #if defined(PETSC_USE_CTABLE)
863       row   = tdata[j] - rstart;
864 #else
865       row   = data_i[j] - rstart;
866 #endif
867       start = ai[row];
868       end   = ai[row+1];
869       for (k=start; k<end; k++) { /* Amat */
870         val = aj[k] + cstart;
871         if (!PetscBTLookupSet(table_i,val)) {
872 #if defined(PETSC_USE_CTABLE)
873           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
874 #else
875           data_i[isz_i] = val;
876 #endif
877           isz_i++;
878         }
879       }
880       start = bi[row];
881       end   = bi[row+1];
882       for (k=start; k<end; k++) { /* Bmat */
883         val = garray[bj[k]];
884         if (!PetscBTLookupSet(table_i,val)) {
885 #if defined(PETSC_USE_CTABLE)
886           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
887 #else
888           data_i[isz_i] = val;
889 #endif
890           isz_i++;
891         }
892       }
893     }
894     isz[i] = isz_i;
895 
896 #if defined(PETSC_USE_CTABLE)
897     ierr = PetscFree(tdata);CHKERRQ(ierr);
898 #endif
899   }
900   PetscFunctionReturn(0);
901 }
902 
903 /*
904       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905          and return the output
906 
907          Input:
908            C    - the matrix
909            nrqr - no of messages being processed.
910            rbuf - an array of pointers to the received requests
911 
912          Output:
913            xdata - array of messages to be sent back
914            isz1  - size of each message
915 
916   For better efficiency perhaps we should malloc each xdata[i] separately;
917 then if a reallocation is required we need only copy the data for that one row
918 rather than all previous rows, as happens now where a single large chunk of
919 memory is used.
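
  Each reply xdata[i] mirrors the request layout: xdata[i][0] is the number of index
  sets, followed by <is id, count> pairs, followed by the (unique) rows found for each.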
920 
921 */
922 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923 {
924   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
925   Mat            A  = c->A,B = c->B;
926   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
927   PetscErrorCode ierr;
928   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931   PetscInt       *rbuf_i,kmax,rbuf_0;
932   PetscBT        xtable;
933 
934   PetscFunctionBegin;
935   m      = C->rmap->N;
936   rstart = C->rmap->rstart;
937   cstart = C->cmap->rstart;
938   ai     = a->i;
939   aj     = a->j;
940   bi     = b->i;
941   bj     = b->j;
942   garray = c->garray;
943 
944   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945     rbuf_i =  rbuf[i];
946     rbuf_0 =  rbuf_i[0];
947     ct    += rbuf_0;
948     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949   }
950 
951   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952   else max1 = 1;
953   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954   if (nrqr) {
955     ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
956     ++no_malloc;
957   }
958   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
959   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
960 
961   ct3 = 0;
962   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
963     rbuf_i =  rbuf[i];
964     rbuf_0 =  rbuf_i[0];
965     ct1    =  2*rbuf_0+1;
966     ct2    =  ct1;
967     ct3   += ct1;
968     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
970       oct2 = ct2;
971       kmax = rbuf_i[2*j];
972       for (k=0; k<kmax; k++,ct1++) {
973         row = rbuf_i[ct1];
974         if (!PetscBTLookupSet(xtable,row)) {
975           if (!(ct3 < mem_estimate)) {
976             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
978             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
979             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
980             xdata[0]     = tmp;
981             mem_estimate = new_estimate; ++no_malloc;
982             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983           }
984           xdata[i][ct2++] = row;
985           ct3++;
986         }
987       }
988       for (k=oct2,max2=ct2; k<max2; k++) {
989         row   = xdata[i][k] - rstart;
990         start = ai[row];
991         end   = ai[row+1];
992         for (l=start; l<end; l++) {
993           val = aj[l] + cstart;
994           if (!PetscBTLookupSet(xtable,val)) {
995             if (!(ct3 < mem_estimate)) {
996               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
998               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
999               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1000               xdata[0]     = tmp;
1001               mem_estimate = new_estimate; ++no_malloc;
1002               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003             }
1004             xdata[i][ct2++] = val;
1005             ct3++;
1006           }
1007         }
1008         start = bi[row];
1009         end   = bi[row+1];
1010         for (l=start; l<end; l++) {
1011           val = garray[bj[l]];
1012           if (!PetscBTLookupSet(xtable,val)) {
1013             if (!(ct3 < mem_estimate)) {
1014               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1016               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1017               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1018               xdata[0]     = tmp;
1019               mem_estimate = new_estimate; ++no_malloc;
1020               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021             }
1022             xdata[i][ct2++] = val;
1023             ct3++;
1024           }
1025         }
1026       }
1027       /* Update the header*/
1028       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030     }
1031     xdata[i][0] = rbuf_0;
1032     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1033     isz1[i]     = ct2; /* size of each message */
1034   }
1035   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1036   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 /* -------------------------------------------------------------------------*/
1040 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041 /*
1042     Every processor gets the entire matrix
1043 */
1044 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1045 {
1046   Mat            B;
1047   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1048   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1049   PetscErrorCode ierr;
1050   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1051   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1052   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1053 
1054   PetscFunctionBegin;
1055   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1056   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1057   if (scall == MAT_INITIAL_MATRIX) {
1058     /* ----------------------------------------------------------------
1059          Tell every processor the number of nonzeros per row
1060     */
1061     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1062     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1063       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1064     }
1065     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1066     for (i=0; i<size; i++) {
1067       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1068       displs[i]     = A->rmap->range[i];
1069     }
1070 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1071     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1072 #else
1073     sendcount = A->rmap->rend - A->rmap->rstart;
1074     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1075 #endif
1076     /* ---------------------------------------------------------------
1077          Create the sequential matrix of the same type as the local block diagonal
1078     */
1079     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1080     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1081     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1082     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1083     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1084     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1085     **Bin = B;
1086     b     = (Mat_SeqAIJ*)B->data;
1087 
1088     /*--------------------------------------------------------------------
1089        Copy my part of matrix column indices over
1090     */
1091     sendcount  = ad->nz + bd->nz;
1092     jsendbuf   = b->j + b->i[rstarts[rank]];
1093     a_jsendbuf = ad->j;
1094     b_jsendbuf = bd->j;
1095     n          = A->rmap->rend - A->rmap->rstart;
1096     cnt        = 0;
1097     for (i=0; i<n; i++) {
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
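    /* jsendbuf now holds, for each locally owned row, the global column indices: the
       off-diagonal (B) entries that precede the diagonal block, then the diagonal (A)
       block, then the remaining off-diagonal entries */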
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into usable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1161     MatScalar         *sendbuf,*recvbuf;
1162 
1163     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1164     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1165     sendcount = ad->nz + bd->nz;
1166     sendbuf   = b->a + b->i[rstarts[rank]];
1167     a_sendbuf = ada;
1168     b_sendbuf = bda;
1169     b_sendj   = bd->j;
1170     n         = A->rmap->rend - A->rmap->rstart;
1171     cnt       = 0;
1172     for (i=0; i<n; i++) {
1173       /* put in lower diagonal portion */
1174       m = bd->i[i+1] - bd->i[i];
1175       while (m > 0) {
1176         /* is it above diagonal (in bd (compressed) numbering) */
1177         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1178         sendbuf[cnt++] = *b_sendbuf++;
1179         m--;
1180         b_sendj++;
1181       }
1182 
1183       /* put in diagonal portion */
1184       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1185         sendbuf[cnt++] = *a_sendbuf++;
1186       }
1187 
1188       /* put in upper diagonal portion */
1189       while (m-- > 0) {
1190         sendbuf[cnt++] = *b_sendbuf++;
1191         b_sendj++;
1192       }
1193     }
1194     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1195 
1196     /* -----------------------------------------------------------------
1197        Gather all numerical values to all processors
1198     */
1199     if (!recvcounts) {
1200       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1201     }
1202     for (i=0; i<size; i++) {
1203       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1204     }
1205     displs[0] = 0;
1206     for (i=1; i<size; i++) {
1207       displs[i] = displs[i-1] + recvcounts[i-1];
1208     }
1209     recvbuf = b->a;
1210 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1211     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1212 #else
1213     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1214 #endif
1215     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1216     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1217   }  /* end of (flag == MAT_GET_VALUES) */
1218   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1219 
1220   if (A->symmetric) {
1221     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1222   } else if (A->hermitian) {
1223     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1224   } else if (A->structurally_symmetric) {
1225     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1226   }
1227   PetscFunctionReturn(0);
1228 }
1229 
1230 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1231 {
1232   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1233   Mat            submat,A = c->A,B = c->B;
1234   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1235   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1236   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1237   const PetscInt *icol,*irow;
1238   PetscInt       nrow,ncol,start;
1239   PetscErrorCode ierr;
1240   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1241   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1242   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1243   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1244   PetscInt       *lens,rmax,ncols,*cols,Crow;
1245 #if defined(PETSC_USE_CTABLE)
1246   PetscTable     cmap,rmap;
1247   PetscInt       *cmap_loc,*rmap_loc;
1248 #else
1249   PetscInt       *cmap,*rmap;
1250 #endif
1251   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1252   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1253   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1254   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1255   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1256   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1257   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1258   MPI_Comm       comm;
1259   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1260   PetscMPIInt    *onodes1,*olengths1,idex,end;
1261   Mat_SubSppt    *smatis1;
1262   PetscBool      isrowsorted,iscolsorted;
1263 
1264   PetscFunctionBegin;
1265   PetscValidLogicalCollectiveInt(C,ismax,2);
1266   PetscValidLogicalCollectiveEnum(C,scall,5);
1267   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1269   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1270   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1271   size = c->size;
1272   rank = c->rank;
1273 
1274   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1275   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1276   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1277   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1278   if (allcolumns) {
1279     icol = NULL;
1280     ncol = C->cmap->N;
1281   } else {
1282     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1283     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1284   }
1285 
1286   if (scall == MAT_INITIAL_MATRIX) {
1287     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1288 
1289     /* Get some new tags to keep the communication clean */
1290     tag1 = ((PetscObject)C)->tag;
1291     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1292     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1293 
1294     /* evaluate communication - mesg to who, length of mesg, and buffer space
1295      required. Based on this, buffers are allocated, and data copied into them */
1296     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1297     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1298 
1299     /* w1[proc] = num of rows owned by proc -- to be requested */
1300     proc = 0;
1301     nrqs = 0; /* num of outgoing messages */
1302     for (j=0; j<nrow; j++) {
1303       row  = irow[j];
1304       if (!isrowsorted) proc = 0;
1305       while (row >= C->rmap->range[proc+1]) proc++;
1306       w1[proc]++;
1307       row2proc[j] = proc; /* map row index to proc */
1308 
1309       if (proc != rank && !w2[proc]) {
1310         w2[proc] = 1; nrqs++;
1311       }
1312     }
1313     w1[rank] = 0;  /* rows owned by self will not be requested */
1314 
1315     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
1316     for (proc=0,j=0; proc<size; proc++) {
1317       if (w1[proc]) { pa[j++] = proc;}
1318     }
1319 
1320     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321     msz = 0;              /* total mesg length (for all procs) */
1322     for (i=0; i<nrqs; i++) {
1323       proc      = pa[i];
1324       w1[proc] += 3;
1325       msz      += w1[proc];
1326     }
1327     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1328 
1329     /* Determine nrqr, the number of messages to expect, their lengths, and from-ids */
1330     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1332 
1333     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1336 
1337     /* Now post the Irecvs corresponding to these messages */
1338     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1339 
1340     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1341     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1342 
1343     /* Allocate Memory for outgoing messages */
1344     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1345     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1346     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1347 
1348     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1349     iptr = tmp;
1350     for (i=0; i<nrqs; i++) {
1351       proc        = pa[i];
1352       sbuf1[proc] = iptr;
1353       iptr       += w1[proc];
1354     }
1355 
1356     /* Form the outgoing messages */
1357     /* Initialize the header space */
1358     for (i=0; i<nrqs; i++) {
1359       proc      = pa[i];
1360       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1361       ptr[proc] = sbuf1[proc] + 3;
1362     }
1363 
1364     /* Parse the isrow and copy data into outbuf */
1365     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1366     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1367       proc = row2proc[j];
1368       if (proc != rank) { /* copy to the outgoing buf*/
1369         *ptr[proc] = irow[j];
1370         ctr[proc]++; ptr[proc]++;
1371       }
1372     }
1373 
1374     /* Update the headers for the current IS */
1375     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376       if ((ctr_j = ctr[j])) {
1377         sbuf1_j        = sbuf1[j];
1378         k              = ++sbuf1_j[0];
1379         sbuf1_j[2*k]   = ctr_j;
1380         sbuf1_j[2*k-1] = 0;
1381       }
1382     }
1383 
1384     /* Now post the sends */
1385     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
1386     for (i=0; i<nrqs; ++i) {
1387       proc = pa[i];
1388       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1389     }
1390 
1391     /* Post Receives to capture the buffer size */
1392     ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr);
1393     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
1394 
1395     if (nrqs) rbuf2[0] = tmp + msz;
1396     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
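     /* rbuf2[] reuses the upper half of the tmp workspace: tmp has 2*msz entries and the first msz
        back the sbuf1[] slices, so no separate allocation is needed for these receive buffers */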
1397 
1398     for (i=0; i<nrqs; ++i) {
1399       proc = pa[i];
1400       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1401     }
1402 
1403     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1404 
1405     /* Send to other procs the buf size they should allocate */
1406     /* Receive messages*/
1407     ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
1408     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1409 
1410     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1411     for (i=0; i<nrqr; ++i) {
1412       req_size[i] = 0;
1413       rbuf1_i        = rbuf1[i];
1414       start          = 2*rbuf1_i[0] + 1;
1415       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1416       ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
1417       sbuf2_i        = sbuf2[i];
1418       for (j=start; j<end; j++) {
1419         k            = rbuf1_i[j] - rstart;
1420         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421         sbuf2_i[j]   = ncols;
1422         req_size[i] += ncols;
1423       }
1424       req_source1[i] = r_status1[i].MPI_SOURCE;
1425 
1426       /* form the header */
1427       sbuf2_i[0] = req_size[i];
1428       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1429 
1430       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1431     }
1432 
1433     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1434     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1435 
1436     /* rbuf2 is received; post recvs for the column indices a->j */
1437     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1438 
1439     ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr);
1440     for (i=0; i<nrqs; ++i) {
1441       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
1442       req_source2[i] = r_status2[i].MPI_SOURCE;
1443       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1444     }
1445 
1446     /* Wait on sends1 and sends2 */
1447     ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr);
1448     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1449     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1450     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1451 
1452     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1453     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1454 
1455     /* Now allocate sending buffers for a->j, and send them off */
1456     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
1457     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
1459     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
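     /* sbuf_aj[0] is one contiguous buffer of length sum(req_size); sbuf_aj[i] are offsets into it,
        which is why only sbuf_aj[0] is freed at the end of this routine */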
1460 
1461     for (i=0; i<nrqr; i++) { /* for each requested message */
1462       rbuf1_i   = rbuf1[i];
1463       sbuf_aj_i = sbuf_aj[i];
1464       ct1       = 2*rbuf1_i[0] + 1;
1465       ct2       = 0;
1466       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1467 
1468       kmax = rbuf1[i][2];
1469       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470         row    = rbuf1_i[ct1] - rstart;
1471         nzA    = ai[row+1] - ai[row];
1472         nzB    = bi[row+1] - bi[row];
1473         ncols  = nzA + nzB;
1474         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1475 
1476         /* load the column indices for this row into cols*/
1477         cols = sbuf_aj_i + ct2;
1478 
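         /* Columns are written in three passes: off-diagonal entries with global column < cstart,
            then the diagonal block, then off-diagonal entries with global column >= cend, so the
            packed global column indices come out in ascending order (c->garray is sorted) */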
1479         lwrite = 0;
1480         for (l=0; l<nzB; l++) {
1481           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482         }
1483         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484         for (l=0; l<nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486         }
1487 
1488         ct2 += ncols;
1489       }
1490       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1491     }
1492 
1493     /* create column map (cmap): global col of C -> local col of submat */
1494 #if defined(PETSC_USE_CTABLE)
1495     if (!allcolumns) {
1496       ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr);
1497       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1498       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499         if (icol[j] >= cstart && icol[j] <cend) {
1500           cmap_loc[icol[j] - cstart] = j+1;
1501         } else { /* use PetscTable for non-local col indices */
1502           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1503         }
1504       }
1505     } else {
1506       cmap     = NULL;
1507       cmap_loc = NULL;
1508     }
1509     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1510 #else
1511     if (!allcolumns) {
1512       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1513       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514     } else {
1515       cmap = NULL;
1516     }
1517 #endif
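     /* cmap maps a global column index of C to (local submat column index)+1; 0 means the column is
        not selected. With PETSC_USE_CTABLE, columns owned locally (in [cstart,cend)) use the dense
        array cmap_loc while off-process columns go through the PetscTable cmap */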
1518 
1519     /* Create lens for MatSeqAIJSetPreallocation() */
1520     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1521 
1522     /* Compute lens from local part of C */
1523     for (j=0; j<nrow; j++) {
1524       row  = irow[j];
1525       proc = row2proc[j];
1526       if (proc == rank) {
1527         /* diagonal part A = c->A */
1528         ncols = ai[row-rstart+1] - ai[row-rstart];
1529         cols  = aj + ai[row-rstart];
1530         if (!allcolumns) {
1531           for (k=0; k<ncols; k++) {
1532 #if defined(PETSC_USE_CTABLE)
1533             tcol = cmap_loc[cols[k]];
1534 #else
1535             tcol = cmap[cols[k]+cstart];
1536 #endif
1537             if (tcol) lens[j]++;
1538           }
1539         } else { /* allcolumns */
1540           lens[j] = ncols;
1541         }
1542 
1543         /* off-diagonal part B = c->B */
1544         ncols = bi[row-rstart+1] - bi[row-rstart];
1545         cols  = bj + bi[row-rstart];
1546         if (!allcolumns) {
1547           for (k=0; k<ncols; k++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1550 #else
1551             tcol = cmap[bmap[cols[k]]];
1552 #endif
1553             if (tcol) lens[j]++;
1554           }
1555         } else { /* allcolumns */
1556           lens[j] += ncols;
1557         }
1558       }
1559     }
1560 
1561     /* Create row map (rmap): global row of C -> local row of submat */
1562 #if defined(PETSC_USE_CTABLE)
1563     ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr);
1564     for (j=0; j<nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) { /* a local row */
1568         rmap_loc[row - rstart] = j;
1569       } else {
1570         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1571       }
1572     }
1573 #else
1574     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1575     for (j=0; j<nrow; j++) {
1576       rmap[irow[j]] = j;
1577     }
1578 #endif
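     /* rmap maps a global row index of C to the local row of submat. With PETSC_USE_CTABLE, locally
        owned rows use the dense 0-based array rmap_loc while remote rows are stored in the PetscTable
        with value j+1, hence the row-- after PetscTableFind below */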
1579 
1580     /* Update lens from offproc data */
1581     /* recv a->j is done */
1582     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1583     for (i=0; i<nrqs; i++) {
1584       proc    = pa[i];
1585       sbuf1_i = sbuf1[proc];
1586       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587       ct1     = 2 + 1;
1588       ct2     = 0;
1589       rbuf2_i = rbuf2[i]; /* received length of C->j */
1590       rbuf3_i = rbuf3[i]; /* received C->j */
1591 
1592       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593       max1   = sbuf1_i[2];
1594       for (k=0; k<max1; k++,ct1++) {
1595 #if defined(PETSC_USE_CTABLE)
1596         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1597         row--;
1598         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599 #else
1600         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601 #endif
1602         /* Now, store row index of submat in sbuf1_i[ct1] */
1603         sbuf1_i[ct1] = row;
1604 
1605         nnz = rbuf2_i[ct1];
1606         if (!allcolumns) {
1607           for (l=0; l<nnz; l++,ct2++) {
1608 #if defined(PETSC_USE_CTABLE)
1609             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611             } else {
1612               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1613             }
1614 #else
1615             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616 #endif
1617             if (tcol) lens[row]++;
1618           }
1619         } else { /* allcolumns */
1620           lens[row] += nnz;
1621         }
1622       }
1623     }
1624     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1625     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1626 
1627     /* Create the submatrices */
1628     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1629     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1630 
1631     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1632     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1633     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1634     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1635     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1636 
1637     /* create struct Mat_SubSppt and attach it to submat */
1638     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1639     subc = (Mat_SeqAIJ*)submat->data;
1640     subc->submatis1 = smatis1;
1641 
1642     smatis1->id          = 0;
1643     smatis1->nrqs        = nrqs;
1644     smatis1->nrqr        = nrqr;
1645     smatis1->rbuf1       = rbuf1;
1646     smatis1->rbuf2       = rbuf2;
1647     smatis1->rbuf3       = rbuf3;
1648     smatis1->sbuf2       = sbuf2;
1649     smatis1->req_source2 = req_source2;
1650 
1651     smatis1->sbuf1       = sbuf1;
1652     smatis1->ptr         = ptr;
1653     smatis1->tmp         = tmp;
1654     smatis1->ctr         = ctr;
1655 
1656     smatis1->pa           = pa;
1657     smatis1->req_size     = req_size;
1658     smatis1->req_source1  = req_source1;
1659 
1660     smatis1->allcolumns  = allcolumns;
1661     smatis1->singleis    = PETSC_TRUE;
1662     smatis1->row2proc    = row2proc;
1663     smatis1->rmap        = rmap;
1664     smatis1->cmap        = cmap;
1665 #if defined(PETSC_USE_CTABLE)
1666     smatis1->rmap_loc    = rmap_loc;
1667     smatis1->cmap_loc    = cmap_loc;
1668 #endif
1669 
1670     smatis1->destroy     = submat->ops->destroy;
1671     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672     submat->factortype   = C->factortype;
1673 
1674     /* compute rmax */
1675     rmax = 0;
1676     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1677 
1678   } else { /* scall == MAT_REUSE_MATRIX */
1679     submat = submats[0];
1680     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1681 
1682     subc    = (Mat_SeqAIJ*)submat->data;
1683     rmax    = subc->rmax;
1684     smatis1 = subc->submatis1;
1685     nrqs        = smatis1->nrqs;
1686     nrqr        = smatis1->nrqr;
1687     rbuf1       = smatis1->rbuf1;
1688     rbuf2       = smatis1->rbuf2;
1689     rbuf3       = smatis1->rbuf3;
1690     req_source2 = smatis1->req_source2;
1691 
1692     sbuf1     = smatis1->sbuf1;
1693     sbuf2     = smatis1->sbuf2;
1694     ptr       = smatis1->ptr;
1695     tmp       = smatis1->tmp;
1696     ctr       = smatis1->ctr;
1697 
1698     pa         = smatis1->pa;
1699     req_size   = smatis1->req_size;
1700     req_source1 = smatis1->req_source1;
1701 
1702     allcolumns = smatis1->allcolumns;
1703     row2proc   = smatis1->row2proc;
1704     rmap       = smatis1->rmap;
1705     cmap       = smatis1->cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     rmap_loc   = smatis1->rmap_loc;
1708     cmap_loc   = smatis1->cmap_loc;
1709 #endif
1710   }
1711 
1712   /* Post recv matrix values */
1713   ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
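   /* rmax is the length of the longest submat row, so subcols/subvals can stage any single row
      before it is passed to MatSetValues_SeqAIJ() */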
1714   ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr);
1715   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1716   for (i=0; i<nrqs; ++i) {
1717     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
1718     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1719   }
1720 
1721   /* Allocate sending buffers for a->a, and send them off */
1722   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
1723   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
1725   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1726 
1727   for (i=0; i<nrqr; i++) {
1728     rbuf1_i   = rbuf1[i];
1729     sbuf_aa_i = sbuf_aa[i];
1730     ct1       = 2*rbuf1_i[0]+1;
1731     ct2       = 0;
1732     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1733 
1734     kmax = rbuf1_i[2];
1735     for (k=0; k<kmax; k++,ct1++) {
1736       row = rbuf1_i[ct1] - rstart;
1737       nzA = ai[row+1] - ai[row];
1738       nzB = bi[row+1] - bi[row];
1739       ncols  = nzA + nzB;
1740       cworkB = bj + bi[row];
1741       vworkA = a_a + ai[row];
1742       vworkB = b_a + bi[row];
1743 
1744       /* load the column values for this row into vals*/
1745       vals = sbuf_aa_i + ct2;
1746 
1747       lwrite = 0;
1748       for (l=0; l<nzB; l++) {
1749         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750       }
1751       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752       for (l=0; l<nzB; l++) {
1753         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754       }
1755 
1756       ct2 += ncols;
1757     }
1758     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1759   }
1760 
1761   /* Assemble submat */
1762   /* First assemble the local rows */
1763   for (j=0; j<nrow; j++) {
1764     row  = irow[j];
1765     proc = row2proc[j];
1766     if (proc == rank) {
1767       Crow = row - rstart;  /* local row index of C */
1768 #if defined(PETSC_USE_CTABLE)
1769       row = rmap_loc[Crow]; /* row index of submat */
1770 #else
1771       row = rmap[row];
1772 #endif
1773 
1774       if (allcolumns) {
1775         /* diagonal part A = c->A */
1776         ncols = ai[Crow+1] - ai[Crow];
1777         cols  = aj + ai[Crow];
1778         vals  = a_a + ai[Crow];
1779         i     = 0;
1780         for (k=0; k<ncols; k++) {
1781           subcols[i]   = cols[k] + cstart;
1782           subvals[i++] = vals[k];
1783         }
1784 
1785         /* off-diagonal part B = c->B */
1786         ncols = bi[Crow+1] - bi[Crow];
1787         cols  = bj + bi[Crow];
1788         vals  = b_a + bi[Crow];
1789         for (k=0; k<ncols; k++) {
1790           subcols[i]   = bmap[cols[k]];
1791           subvals[i++] = vals[k];
1792         }
1793 
1794         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1795 
1796       } else { /* !allcolumns */
1797 #if defined(PETSC_USE_CTABLE)
1798         /* diagonal part A = c->A */
1799         ncols = ai[Crow+1] - ai[Crow];
1800         cols  = aj + ai[Crow];
1801         vals  = a_a + ai[Crow];
1802         i     = 0;
1803         for (k=0; k<ncols; k++) {
1804           tcol = cmap_loc[cols[k]];
1805           if (tcol) {
1806             subcols[i]   = --tcol;
1807             subvals[i++] = vals[k];
1808           }
1809         }
1810 
1811         /* off-diagonal part B = c->B */
1812         ncols = bi[Crow+1] - bi[Crow];
1813         cols  = bj + bi[Crow];
1814         vals  = b_a + bi[Crow];
1815         for (k=0; k<ncols; k++) {
1816           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1817           if (tcol) {
1818             subcols[i]   = --tcol;
1819             subvals[i++] = vals[k];
1820           }
1821         }
1822 #else
1823         /* diagonal part A = c->A */
1824         ncols = ai[Crow+1] - ai[Crow];
1825         cols  = aj + ai[Crow];
1826         vals  = a_a + ai[Crow];
1827         i     = 0;
1828         for (k=0; k<ncols; k++) {
1829           tcol = cmap[cols[k]+cstart];
1830           if (tcol) {
1831             subcols[i]   = --tcol;
1832             subvals[i++] = vals[k];
1833           }
1834         }
1835 
1836         /* off-diagonal part B = c->B */
1837         ncols = bi[Crow+1] - bi[Crow];
1838         cols  = bj + bi[Crow];
1839         vals  = b_a + bi[Crow];
1840         for (k=0; k<ncols; k++) {
1841           tcol = cmap[bmap[cols[k]]];
1842           if (tcol) {
1843             subcols[i]   = --tcol;
1844             subvals[i++] = vals[k];
1845           }
1846         }
1847 #endif
1848         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1849       }
1850     }
1851   }
1852 
1853   /* Now assemble the off-proc rows */
1854   for (i=0; i<nrqs; i++) { /* for each requested message */
1855     /* recv values from other processes */
1856     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1857     proc    = pa[idex];
1858     sbuf1_i = sbuf1[proc];
1859     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1860     ct1     = 2 + 1;
1861     ct2     = 0; /* count of received C->j */
1862     ct3     = 0; /* count of received C->j that will be inserted into submat */
1863     rbuf2_i = rbuf2[idex]; /* received per-row lengths of C->j from other processes */
1864     rbuf3_i = rbuf3[idex]; /* received column indices C->j from other processes */
1865     rbuf4_i = rbuf4[idex]; /* received values C->a from other processes */
1866 
1867     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868     max1 = sbuf1_i[2];             /* num of rows */
1869     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1870       row = sbuf1_i[ct1]; /* row index of submat */
1871       if (!allcolumns) {
1872         idex = 0;
1873         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1876 #if defined(PETSC_USE_CTABLE)
1877             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879             } else {
1880               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1881             }
1882 #else
1883             tcol = cmap[rbuf3_i[ct2]];
1884 #endif
1885             if (tcol) {
1886               subcols[idex]   = --tcol; /* may not be sorted */
1887               subvals[idex++] = rbuf4_i[ct2];
1888 
1889               /* We receive an entire row of C, but only a subset of it is inserted into submat.
1890                For reuse, we overwrite the received C->j with the indices that are inserted into submat */
1891               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892             }
1893           }
1894           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1895         } else { /* scall == MAT_REUSE_MATRIX */
1896           submat = submats[0];
1897           subc   = (Mat_SeqAIJ*)submat->data;
1898 
1899           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900           for (l=0; l<nnz; l++) {
1901             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1902             subvals[idex++] = rbuf4_i[ct2];
1903           }
1904 
1905           bj = subc->j + subc->i[row]; /* sorted column indices */
1906           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1907         }
1908       } else { /* allcolumns */
1909         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1910         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1911         ct2 += nnz;
1912       }
1913     }
1914   }
1915 
1916   /* sends of a->a are done */
1917   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1918   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1919 
1920   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   submats[0] = submat;
1923 
1924   /* Restore the indices */
1925   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1926   if (!allcolumns) {
1927     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1928   }
1929 
1930   /* Destroy allocated memory */
1931   for (i=0; i<nrqs; ++i) {
1932     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1933   }
1934   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1935   if (sbuf_aa) {
1936     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1937     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1938   }
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     if (sbuf_aj) {
1943       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1944       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1945     }
1946   }
1947   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1948   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1949   PetscFunctionReturn(0);
1950 }
1951 
1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1953 {
1954   PetscErrorCode ierr;
1955   PetscInt       ncol;
1956   PetscBool      colflag,allcolumns=PETSC_FALSE;
1957 
1958   PetscFunctionBegin;
1959   /* Allocate memory to hold all the submatrices */
1960   if (scall == MAT_INITIAL_MATRIX) {
1961     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1962   }
1963 
1964   /* Check for special case: each processor gets entire matrix columns */
1965   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1966   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1967   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1968 
1969   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 }
1972 
1973 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1974 {
1975   PetscErrorCode ierr;
1976   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1977   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1978   Mat_SeqAIJ     *subc;
1979   Mat_SubSppt    *smat;
1980 
1981   PetscFunctionBegin;
1982   /* Check for special case: each processor has a single IS */
1983   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1984     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1985     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1986     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1987   }
1988 
1989   /* Determine nmax, the max number of submatrices that can be extracted per stage */
1990   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1991   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1992   if (!nmax) nmax = 1;
1993 
1994   if (scall == MAT_INITIAL_MATRIX) {
1995     /* Collect global wantallmatrix and nstages */
1996     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1997       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1998       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1999       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2000       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2001       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2002         wantallmatrix = PETSC_TRUE;
2003 
2004         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2005       }
2006     }
2007 
2008     /* Determine the number of stages through which the submatrices are extracted.
2009        Each stage extracts at most nmax submatrices;
2010        nmax is determined by the matrix column dimension.
2011        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2012     */
2013     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2014 
2015     in[0] = -1*(PetscInt)wantallmatrix;
2016     in[1] = nstages;
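     /* A single MPIU_Allreduce with MPI_MAX does double duty: max(-wantallmatrix) is -1 only when every
        rank wants the whole matrix (a logical AND via negation), and max(nstages) gives the global
        number of stages every rank must loop through */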
2017     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
2018     wantallmatrix = (PetscBool)(-out[0]);
2019     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2020 
2021   } else { /* MAT_REUSE_MATRIX */
2022     if (ismax) {
2023       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2024       smat = subc->submatis1;
2025     } else { /* (*submat)[0] is a dummy matrix */
2026       smat = (Mat_SubSppt*)(*submat)[0]->data;
2027     }
2028     if (!smat) {
2029       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2030       wantallmatrix = PETSC_TRUE;
2031     } else if (smat->singleis) {
2032       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2033       PetscFunctionReturn(0);
2034     } else {
2035       nstages = smat->nstages;
2036     }
2037   }
2038 
2039   if (wantallmatrix) {
2040     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2041     PetscFunctionReturn(0);
2042   }
2043 
2044   /* Allocate memory to hold all the submatrices and dummy submatrices */
2045   if (scall == MAT_INITIAL_MATRIX) {
2046     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2047   }
2048 
2049   for (i=0,pos=0; i<nstages; i++) {
2050     if (pos+nmax <= ismax) max_no = nmax;
2051     else if (pos >= ismax) max_no = 0;
2052     else                   max_no = ismax-pos;
2053 
2054     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2055     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2056       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2057       smat->nstages = nstages;
2058     }
2059     pos += max_no;
2060   }
2061 
2062   if (ismax && scall == MAT_INITIAL_MATRIX) {
2063     /* save nstages for reuse */
2064     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2065     smat = subc->submatis1;
2066     smat->nstages = nstages;
2067   }
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 /* -------------------------------------------------------------------------*/
2072 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2073 {
2074   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2075   Mat            A  = c->A;
2076   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2077   const PetscInt **icol,**irow;
2078   PetscInt       *nrow,*ncol,start;
2079   PetscErrorCode ierr;
2080   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2081   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2082   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2083   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2084   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2085 #if defined(PETSC_USE_CTABLE)
2086   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2087 #else
2088   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2089 #endif
2090   const PetscInt *irow_i;
2091   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2092   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2093   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2094   MPI_Comm       comm;
2095   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2096   PetscMPIInt    *onodes1,*olengths1,end;
2097   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2098   Mat_SubSppt    *smat_i;
2099   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2100   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2101 
2102   PetscFunctionBegin;
2103   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2104   size = c->size;
2105   rank = c->rank;
2106 
2107   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2108   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2109 
2110   for (i=0; i<ismax; i++) {
2111     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2112     if (!issorted[i]) iscsorted = issorted[i];
2113 
2114     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2115 
2116     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2117     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2118 
2119     /* Check for special case: allcolumn */
2120     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2121     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2122     if (colflag && ncol[i] == C->cmap->N) {
2123       allcolumns[i] = PETSC_TRUE;
2124       icol[i] = NULL;
2125     } else {
2126       allcolumns[i] = PETSC_FALSE;
2127       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2128     }
2129   }
2130 
2131   if (scall == MAT_REUSE_MATRIX) {
2132     /* Assumes new rows are same length as the old rows */
2133     for (i=0; i<ismax; i++) {
2134       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2135       subc = (Mat_SeqAIJ*)submats[i]->data;
2136       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2137 
2138       /* Initialize the matrix as if it were empty */
2139       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2140 
2141       smat_i   = subc->submatis1;
2142 
2143       nrqs        = smat_i->nrqs;
2144       nrqr        = smat_i->nrqr;
2145       rbuf1       = smat_i->rbuf1;
2146       rbuf2       = smat_i->rbuf2;
2147       rbuf3       = smat_i->rbuf3;
2148       req_source2 = smat_i->req_source2;
2149 
2150       sbuf1     = smat_i->sbuf1;
2151       sbuf2     = smat_i->sbuf2;
2152       ptr       = smat_i->ptr;
2153       tmp       = smat_i->tmp;
2154       ctr       = smat_i->ctr;
2155 
2156       pa          = smat_i->pa;
2157       req_size    = smat_i->req_size;
2158       req_source1 = smat_i->req_source1;
2159 
2160       allcolumns[i] = smat_i->allcolumns;
2161       row2proc[i]   = smat_i->row2proc;
2162       rmap[i]       = smat_i->rmap;
2163       cmap[i]       = smat_i->cmap;
2164     }
2165 
2166     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2167       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2168       smat_i = (Mat_SubSppt*)submats[0]->data;
2169 
2170       nrqs        = smat_i->nrqs;
2171       nrqr        = smat_i->nrqr;
2172       rbuf1       = smat_i->rbuf1;
2173       rbuf2       = smat_i->rbuf2;
2174       rbuf3       = smat_i->rbuf3;
2175       req_source2 = smat_i->req_source2;
2176 
2177       sbuf1       = smat_i->sbuf1;
2178       sbuf2       = smat_i->sbuf2;
2179       ptr         = smat_i->ptr;
2180       tmp         = smat_i->tmp;
2181       ctr         = smat_i->ctr;
2182 
2183       pa          = smat_i->pa;
2184       req_size    = smat_i->req_size;
2185       req_source1 = smat_i->req_source1;
2186 
2187       allcolumns[0] = PETSC_FALSE;
2188     }
2189   } else { /* scall == MAT_INITIAL_MATRIX */
2190     /* Get some new tags to keep the communication clean */
2191     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2192     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2193 
2194     /* evaluate communication - to whom messages are sent, the length of each message, and the buffer
2195      space required. Based on this, buffers are allocated and data is copied into them */
2196     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
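     /* w1[p] = number of rows to be requested from proc p (header space is added later), w2[p] = 1 if a
        message will be sent to p, w3[p] = number of index sets that request rows from p, and w4[p] is a
        per-IS scratch counter that is reset after each IS */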
2197 
2198     for (i=0; i<ismax; i++) {
2199       jmax   = nrow[i];
2200       irow_i = irow[i];
2201 
2202       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2203       row2proc[i] = row2proc_i;
2204 
2205       if (issorted[i]) proc = 0;
2206       for (j=0; j<jmax; j++) {
2207         if (!issorted[i]) proc = 0;
2208         row = irow_i[j];
2209         while (row >= C->rmap->range[proc+1]) proc++;
2210         w4[proc]++;
2211         row2proc_i[j] = proc; /* map row index to proc */
2212       }
2213       for (j=0; j<size; j++) {
2214         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2215       }
2216     }
2217 
2218     nrqs     = 0;              /* no of outgoing messages */
2219     msz      = 0;              /* total mesg length (for all procs) */
2220     w1[rank] = 0;              /* no mesg sent to self */
2221     w3[rank] = 0;
2222     for (i=0; i<size; i++) {
2223       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2224     }
2225     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
2226     for (i=0,j=0; i<size; i++) {
2227       if (w1[i]) { pa[j] = i; j++; }
2228     }
2229 
2230     /* Each message would have a header = 1 + 2*(no of IS) + data */
2231     for (i=0; i<nrqs; i++) {
2232       j      = pa[i];
2233       w1[j] += w2[j] + 2* w3[j];
2234       msz   += w1[j];
2235     }
2236     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2237 
2238     /* Determine the number of messages to expect, their lengths, and their source ranks (from-ids) */
2239     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2240     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2241 
2242     /* Now post the Irecvs corresponding to these messages */
2243     ierr = PetscObjectGetNewTag((PetscObject)C,&tag0);CHKERRQ(ierr);
2244     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2245 
2246     /* Allocate Memory for outgoing messages */
2247     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2248     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2249     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2250 
2251     {
2252       PetscInt *iptr = tmp;
2253       k    = 0;
2254       for (i=0; i<nrqs; i++) {
2255         j        = pa[i];
2256         iptr    += k;
2257         sbuf1[j] = iptr;
2258         k        = w1[j];
2259       }
2260     }
2261 
2262     /* Form the outgoing messages. Initialize the header space */
2263     for (i=0; i<nrqs; i++) {
2264       j           = pa[i];
2265       sbuf1[j][0] = 0;
2266       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2267       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2268     }
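     /* Layout of sbuf1[j]: sbuf1[j][0] = number of IS blocks destined for proc j; for block k,
        sbuf1[j][2k-1] = IS index and sbuf1[j][2k] = number of rows requested; the requested global row
        indices start at offset 2*w3[j]+1 and ptr[j] tracks the next free slot */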
2269 
2270     /* Parse the isrow and copy data into outbuf */
2271     for (i=0; i<ismax; i++) {
2272       row2proc_i = row2proc[i];
2273       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2274       irow_i = irow[i];
2275       jmax   = nrow[i];
2276       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2277         proc = row2proc_i[j];
2278         if (proc != rank) { /* copy to the outgoing buf*/
2279           ctr[proc]++;
2280           *ptr[proc] = irow_i[j];
2281           ptr[proc]++;
2282         }
2283       }
2284       /* Update the headers for the current IS */
2285       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2286         if ((ctr_j = ctr[j])) {
2287           sbuf1_j        = sbuf1[j];
2288           k              = ++sbuf1_j[0];
2289           sbuf1_j[2*k]   = ctr_j;
2290           sbuf1_j[2*k-1] = i;
2291         }
2292       }
2293     }
2294 
2295     /* Now post the sends */
2296     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
2297     for (i=0; i<nrqs; ++i) {
2298       j    = pa[i];
2299       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2300     }
2301 
2302     /* Post Receives to capture the buffer size */
2303     ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr);
2304     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
2305     if (nrqs) rbuf2[0] = tmp + msz;
2306     for (i=1; i<nrqs; ++i) {
2307       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2308     }
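     /* As in the single-IS routine, rbuf2[] reuses the upper half of the tmp workspace (tmp has 2*msz
        entries, the first msz of which back the sbuf1[] slices) */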
2309     for (i=0; i<nrqs; ++i) {
2310       j    = pa[i];
2311       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2312     }
2313 
2314     /* Send to other procs the buf size they should allocate */
2315     /* Receive messages*/
2316     ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
2317     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2318     {
2319       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2320       PetscInt   *sbuf2_i;
2321 
2322       ierr = MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2323       for (i=0; i<nrqr; ++i) {
2324         req_size[i] = 0;
2325         rbuf1_i        = rbuf1[i];
2326         start          = 2*rbuf1_i[0] + 1;
2327         end            = olengths1[i];
2328         ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
2329         sbuf2_i        = sbuf2[i];
2330         for (j=start; j<end; j++) {
2331           id              = rbuf1_i[j] - rstart;
2332           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2333           sbuf2_i[j]      = ncols;
2334           req_size[i] += ncols;
2335         }
2336         req_source1[i] = onodes1[i];
2337         /* form the header */
2338         sbuf2_i[0] = req_size[i];
2339         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2340 
2341         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2342       }
2343     }
2344 
2345     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2346     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2347     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2348     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2349 
2350     /* Receive messages*/
2351     ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr);
2352     ierr = MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2353     for (i=0; i<nrqs; ++i) {
2354       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
2355       req_source2[i] = pa[i];
2356       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2357     }
2358     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2359 
2360     /* Wait on sends1 and sends2 */
2361     ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2362     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2363     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2364     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2365 
2366     /* Now allocate sending buffers for a->j, and send them off */
2367     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
2368     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2369     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
2370     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2371 
2372     ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr);
2373     {
2374       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2375       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2376       PetscInt cend = C->cmap->rend;
2377       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2378 
2379       for (i=0; i<nrqr; i++) {
2380         rbuf1_i   = rbuf1[i];
2381         sbuf_aj_i = sbuf_aj[i];
2382         ct1       = 2*rbuf1_i[0] + 1;
2383         ct2       = 0;
2384         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2385           kmax = rbuf1[i][2*j];
2386           for (k=0; k<kmax; k++,ct1++) {
2387             row    = rbuf1_i[ct1] - rstart;
2388             nzA    = a_i[row+1] - a_i[row];
2389             nzB    = b_i[row+1] - b_i[row];
2390             ncols  = nzA + nzB;
2391             cworkA = a_j + a_i[row];
2392             cworkB = b_j + b_i[row];
2393 
2394             /* load the column indices for this row into cols */
2395             cols = sbuf_aj_i + ct2;
2396 
2397             lwrite = 0;
2398             for (l=0; l<nzB; l++) {
2399               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2400             }
2401             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2402             for (l=0; l<nzB; l++) {
2403               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2404             }
2405 
2406             ct2 += ncols;
2407           }
2408         }
2409         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2410       }
2411     }
2412 
2413     /* create col map: global col of C -> local col of submatrices */
2414     {
2415       const PetscInt *icol_i;
2416 #if defined(PETSC_USE_CTABLE)
2417       for (i=0; i<ismax; i++) {
2418         if (!allcolumns[i]) {
2419           ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2420 
2421           jmax   = ncol[i];
2422           icol_i = icol[i];
2423           cmap_i = cmap[i];
2424           for (j=0; j<jmax; j++) {
2425             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2426           }
2427         } else cmap[i] = NULL;
2428       }
2429 #else
2430       for (i=0; i<ismax; i++) {
2431         if (!allcolumns[i]) {
2432           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2433           jmax   = ncol[i];
2434           icol_i = icol[i];
2435           cmap_i = cmap[i];
2436           for (j=0; j<jmax; j++) {
2437             cmap_i[icol_i[j]] = j+1;
2438           }
2439         } else cmap[i] = NULL;
2440       }
2441 #endif
2442     }
2443 
2444     /* Create lens which is required for MatCreate... */
2445     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2446     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2447 
2448     if (ismax) {
2449       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2450     }
2451     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
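     /* lens[0] is one contiguous zeroed block of length sum(nrow[i]); lens[i] are offsets into it, so
        only lens[0] (plus the pointer array lens) is freed at the end of the MAT_INITIAL_MATRIX branch */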
2452 
2453     /* Update lens from local data */
2454     for (i=0; i<ismax; i++) {
2455       row2proc_i = row2proc[i];
2456       jmax = nrow[i];
2457       if (!allcolumns[i]) cmap_i = cmap[i];
2458       irow_i = irow[i];
2459       lens_i = lens[i];
2460       for (j=0; j<jmax; j++) {
2461         row = irow_i[j];
2462         proc = row2proc_i[j];
2463         if (proc == rank) {
2464           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2465           if (!allcolumns[i]) {
2466             for (k=0; k<ncols; k++) {
2467 #if defined(PETSC_USE_CTABLE)
2468               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2469 #else
2470               tcol = cmap_i[cols[k]];
2471 #endif
2472               if (tcol) lens_i[j]++;
2473             }
2474           } else { /* allcolumns */
2475             lens_i[j] = ncols;
2476           }
2477           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2478         }
2479       }
2480     }
2481 
2482     /* Create row map: global row of C -> local row of submatrices */
2483 #if defined(PETSC_USE_CTABLE)
2484     for (i=0; i<ismax; i++) {
2485       ierr   = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2486       irow_i = irow[i];
2487       jmax   = nrow[i];
2488       for (j=0; j<jmax; j++) {
2489         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2490       }
2491     }
2492 #else
2493     for (i=0; i<ismax; i++) {
2494       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2495       rmap_i = rmap[i];
2496       irow_i = irow[i];
2497       jmax   = nrow[i];
2498       for (j=0; j<jmax; j++) {
2499         rmap_i[irow_i[j]] = j;
2500       }
2501     }
2502 #endif
2503 
2504     /* Update lens from offproc data */
2505     {
2506       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2507 
2508       ierr    = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2509       for (tmp2=0; tmp2<nrqs; tmp2++) {
2510         sbuf1_i = sbuf1[pa[tmp2]];
2511         jmax    = sbuf1_i[0];
2512         ct1     = 2*jmax+1;
2513         ct2     = 0;
2514         rbuf2_i = rbuf2[tmp2];
2515         rbuf3_i = rbuf3[tmp2];
2516         for (j=1; j<=jmax; j++) {
2517           is_no  = sbuf1_i[2*j-1];
2518           max1   = sbuf1_i[2*j];
2519           lens_i = lens[is_no];
2520           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2521           rmap_i = rmap[is_no];
2522           for (k=0; k<max1; k++,ct1++) {
2523 #if defined(PETSC_USE_CTABLE)
2524             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2525             row--;
2526             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2527 #else
2528             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2529 #endif
2530             max2 = rbuf2_i[ct1];
2531             for (l=0; l<max2; l++,ct2++) {
2532               if (!allcolumns[is_no]) {
2533 #if defined(PETSC_USE_CTABLE)
2534                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2535 #else
2536                 tcol = cmap_i[rbuf3_i[ct2]];
2537 #endif
2538                 if (tcol) lens_i[row]++;
2539               } else { /* allcolumns */
2540                 lens_i[row]++; /* lens_i[row] += max2 ? */
2541               }
2542             }
2543           }
2544         }
2545       }
2546     }
2547     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2548     ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2549     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2550 
2551     /* Create the submatrices */
2552     for (i=0; i<ismax; i++) {
2553       PetscInt    rbs,cbs;
2554 
2555       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2556       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2557 
2558       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2559       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2560 
2561       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2562       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2563       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2564 
2565       /* create struct Mat_SubSppt and attach it to submat */
2566       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2567       subc = (Mat_SeqAIJ*)submats[i]->data;
2568       subc->submatis1 = smat_i;
2569 
2570       smat_i->destroy          = submats[i]->ops->destroy;
2571       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2572       submats[i]->factortype   = C->factortype;
2573 
2574       smat_i->id          = i;
2575       smat_i->nrqs        = nrqs;
2576       smat_i->nrqr        = nrqr;
2577       smat_i->rbuf1       = rbuf1;
2578       smat_i->rbuf2       = rbuf2;
2579       smat_i->rbuf3       = rbuf3;
2580       smat_i->sbuf2       = sbuf2;
2581       smat_i->req_source2 = req_source2;
2582 
2583       smat_i->sbuf1       = sbuf1;
2584       smat_i->ptr         = ptr;
2585       smat_i->tmp         = tmp;
2586       smat_i->ctr         = ctr;
2587 
2588       smat_i->pa           = pa;
2589       smat_i->req_size     = req_size;
2590       smat_i->req_source1  = req_source1;
2591 
2592       smat_i->allcolumns  = allcolumns[i];
2593       smat_i->singleis    = PETSC_FALSE;
2594       smat_i->row2proc    = row2proc[i];
2595       smat_i->rmap        = rmap[i];
2596       smat_i->cmap        = cmap[i];
2597     }
2598 
2599     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2600       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2601       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2602       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2603 
2604       /* create struct Mat_SubSppt and attach it to submat */
2605       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2606       submats[0]->data = (void*)smat_i;
2607 
2608       smat_i->destroy          = submats[0]->ops->destroy;
2609       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2610       submats[0]->factortype   = C->factortype;
2611 
2612       smat_i->id          = 0;
2613       smat_i->nrqs        = nrqs;
2614       smat_i->nrqr        = nrqr;
2615       smat_i->rbuf1       = rbuf1;
2616       smat_i->rbuf2       = rbuf2;
2617       smat_i->rbuf3       = rbuf3;
2618       smat_i->sbuf2       = sbuf2;
2619       smat_i->req_source2 = req_source2;
2620 
2621       smat_i->sbuf1       = sbuf1;
2622       smat_i->ptr         = ptr;
2623       smat_i->tmp         = tmp;
2624       smat_i->ctr         = ctr;
2625 
2626       smat_i->pa           = pa;
2627       smat_i->req_size     = req_size;
2628       smat_i->req_source1  = req_source1;
2629 
2630       smat_i->allcolumns  = PETSC_FALSE;
2631       smat_i->singleis    = PETSC_FALSE;
2632       smat_i->row2proc    = NULL;
2633       smat_i->rmap        = NULL;
2634       smat_i->cmap        = NULL;
2635     }
2636 
2637     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2638     ierr = PetscFree(lens);CHKERRQ(ierr);
2639     if (sbuf_aj) {
2640       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2641       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2642     }
2643 
2644   } /* end of scall == MAT_INITIAL_MATRIX */
2645 
2646   /* Post recv matrix values */
2647   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2648   ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr);
2649   ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr);
2650   for (i=0; i<nrqs; ++i) {
2651     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
2652     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2653   }
2654 
2655   /* Allocate sending buffers for a->a, and send them off */
2656   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
2657   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2658   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
2659   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2660 
2661   ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr);
2662   {
2663     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2664     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2665     PetscInt    cend   = C->cmap->rend;
2666     PetscInt    *b_j   = b->j;
2667     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2668 
2669     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2670     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2671     for (i=0; i<nrqr; i++) {
2672       rbuf1_i   = rbuf1[i];
2673       sbuf_aa_i = sbuf_aa[i];
2674       ct1       = 2*rbuf1_i[0]+1;
2675       ct2       = 0;
2676       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2677         kmax = rbuf1_i[2*j];
2678         for (k=0; k<kmax; k++,ct1++) {
2679           row    = rbuf1_i[ct1] - rstart;
2680           nzA    = a_i[row+1] - a_i[row];
2681           nzB    = b_i[row+1] - b_i[row];
2682           ncols  = nzA + nzB;
2683           cworkB = b_j + b_i[row];
2684           vworkA = a_a + a_i[row];
2685           vworkB = b_a + b_i[row];
2686 
2687           /* load the column values for this row into vals*/
2688           vals = sbuf_aa_i+ct2;
2689 
2690           lwrite = 0;
2691           for (l=0; l<nzB; l++) {
2692             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2693           }
2694           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2695           for (l=0; l<nzB; l++) {
2696             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2697           }
2698 
2699           ct2 += ncols;
2700         }
2701       }
2702       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2703     }
2704     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2705     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2706   }
2707 
2708   /* Assemble the matrices */
2709   /* First assemble the local rows */
2710   for (i=0; i<ismax; i++) {
2711     row2proc_i = row2proc[i];
2712     subc      = (Mat_SeqAIJ*)submats[i]->data;
2713     imat_ilen = subc->ilen;
2714     imat_j    = subc->j;
2715     imat_i    = subc->i;
2716     imat_a    = subc->a;
2717 
2718     if (!allcolumns[i]) cmap_i = cmap[i];
2719     rmap_i = rmap[i];
2720     irow_i = irow[i];
2721     jmax   = nrow[i];
2722     for (j=0; j<jmax; j++) {
2723       row  = irow_i[j];
2724       proc = row2proc_i[j];
2725       if (proc == rank) {
2726         old_row = row;
2727 #if defined(PETSC_USE_CTABLE)
2728         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2729         row--;
2730 #else
2731         row = rmap_i[row];
2732 #endif
2733         ilen_row = imat_ilen[row];
2734         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2735         mat_i    = imat_i[row];
2736         mat_a    = imat_a + mat_i;
2737         mat_j    = imat_j + mat_i;
2738         if (!allcolumns[i]) {
2739           for (k=0; k<ncols; k++) {
2740 #if defined(PETSC_USE_CTABLE)
2741             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2742 #else
2743             tcol = cmap_i[cols[k]];
2744 #endif
2745             if (tcol) {
2746               *mat_j++ = tcol - 1;
2747               *mat_a++ = vals[k];
2748               ilen_row++;
2749             }
2750           }
2751         } else { /* allcolumns */
2752           for (k=0; k<ncols; k++) {
2753             *mat_j++ = cols[k];  /* global col index! */
2754             *mat_a++ = vals[k];
2755             ilen_row++;
2756           }
2757         }
2758         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2759 
2760         imat_ilen[row] = ilen_row;
2761       }
2762     }
2763   }
2764 
2765   /* Now assemble the off proc rows */
2766   ierr    = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2767   for (tmp2=0; tmp2<nrqs; tmp2++) {
2768     sbuf1_i = sbuf1[pa[tmp2]];
2769     jmax    = sbuf1_i[0];
2770     ct1     = 2*jmax + 1;
2771     ct2     = 0;
2772     rbuf2_i = rbuf2[tmp2];
2773     rbuf3_i = rbuf3[tmp2];
2774     rbuf4_i = rbuf4[tmp2];
2775     for (j=1; j<=jmax; j++) {
2776       is_no     = sbuf1_i[2*j-1];
2777       rmap_i    = rmap[is_no];
2778       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2779       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2780       imat_ilen = subc->ilen;
2781       imat_j    = subc->j;
2782       imat_i    = subc->i;
2783       imat_a    = subc->a;
2784       max1      = sbuf1_i[2*j];
2785       for (k=0; k<max1; k++,ct1++) {
2786         row = sbuf1_i[ct1];
2787 #if defined(PETSC_USE_CTABLE)
2788         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2789         row--;
2790 #else
2791         row = rmap_i[row];
2792 #endif
2793         ilen  = imat_ilen[row];
2794         mat_i = imat_i[row];
2795         mat_a = imat_a + mat_i;
2796         mat_j = imat_j + mat_i;
2797         max2  = rbuf2_i[ct1];
2798         if (!allcolumns[is_no]) {
2799           for (l=0; l<max2; l++,ct2++) {
2800 #if defined(PETSC_USE_CTABLE)
2801             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2802 #else
2803             tcol = cmap_i[rbuf3_i[ct2]];
2804 #endif
2805             if (tcol) {
2806               *mat_j++ = tcol - 1;
2807               *mat_a++ = rbuf4_i[ct2];
2808               ilen++;
2809             }
2810           }
2811         } else { /* allcolumns */
2812           for (l=0; l<max2; l++,ct2++) {
2813             *mat_j++ = rbuf3_i[ct2]; /* same global column index as in C */
2814             *mat_a++ = rbuf4_i[ct2];
2815             ilen++;
2816           }
2817         }
2818         imat_ilen[row] = ilen;
2819       }
2820     }
2821   }
2822 
2823   if (!iscsorted) { /* sort column indices of the rows */
2824     for (i=0; i<ismax; i++) {
2825       subc      = (Mat_SeqAIJ*)submats[i]->data;
2826       imat_j    = subc->j;
2827       imat_i    = subc->i;
2828       imat_a    = subc->a;
2829       imat_ilen = subc->ilen;
2830 
2831       if (allcolumns[i]) continue;
2832       jmax = nrow[i];
2833       for (j=0; j<jmax; j++) {
2834         mat_i = imat_i[j];
2835         mat_a = imat_a + mat_i;
2836         mat_j = imat_j + mat_i;
2837         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2838       }
2839     }
2840   }
2841 
2842   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2843   ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2844   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2845 
2846   /* Restore the indices */
2847   for (i=0; i<ismax; i++) {
2848     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2849     if (!allcolumns[i]) {
2850       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2851     }
2852   }
2853 
2854   for (i=0; i<ismax; i++) {
2855     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2856     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2857   }
2858 
2859   /* Destroy allocated memory */
2860   if (sbuf_aa) {
2861     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2862     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2863   }
2864   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2865 
2866   for (i=0; i<nrqs; ++i) {
2867     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2868   }
2869   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2870 
2871   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2872   PetscFunctionReturn(0);
2873 }
2874 
2875 /*
2876  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2877  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2878  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2879  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2880  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2881  state, and needs to be "assembled" later by compressing B's column space.
2882 
2883  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2884  Following this call, C->A & C->B have been created, even if empty.
2885  */
2886 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2887 {
2888   /* If making this function public, change the error returned in this function away from _PLIB. */
2889   PetscErrorCode ierr;
2890   Mat_MPIAIJ     *aij;
2891   Mat_SeqAIJ     *Baij;
2892   PetscBool      seqaij,Bdisassembled;
2893   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2894   PetscScalar    v;
2895   const PetscInt *rowindices,*colindices;
2896 
2897   PetscFunctionBegin;
2898   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2899   if (A) {
2900     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2901     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2902     if (rowemb) {
2903       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2904       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2905     } else {
2906       if (C->rmap->n != A->rmap->n) {
2907         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2908       }
2909     }
2910     if (dcolemb) {
2911       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2912       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2913     } else {
2914       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2915     }
2916   }
2917   if (B) {
2918     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2919     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2920     if (rowemb) {
2921       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2922       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2923     } else {
2924       if (C->rmap->n != B->rmap->n) {
2925         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2926       }
2927     }
2928     if (ocolemb) {
2929       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2930       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2931     } else {
2932       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2933     }
2934   }
2935 
2936   aij = (Mat_MPIAIJ*)C->data;
2937   if (!aij->A) {
2938     /* Mimic parts of MatMPIAIJSetPreallocation() */
2939     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2940     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2941     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2942     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2943     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2944   }
2945   if (A) {
2946     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2947   } else {
2948     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2949   }
2950   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2951     /*
2952       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate aij->B and
2953       need to "disassemble" it -- convert it to using C's global column indices.
2954       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2955 
2956       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" aij->B and do not reallocate;
2957       we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2958 
2959       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2960       At least avoid calling MatSetValues() and the implied searches?
2961     */
2962 
2963     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2964 #if defined(PETSC_USE_CTABLE)
2965       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2966 #else
2967       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2968       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2969       if (aij->B) {
2970         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2971       }
2972 #endif
2973       ngcol = 0;
2974       if (aij->lvec) {
2975         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2976       }
2977       if (aij->garray) {
2978         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2979         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2980       }
2981       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2982       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2983     }
2984     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2985       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2986     }
2987     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2988       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2989     }
2990   }
2991   Bdisassembled = PETSC_FALSE;
2992   if (!aij->B) {
2993     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2994     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2995     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2996     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2997     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2998     Bdisassembled = PETSC_TRUE;
2999   }
3000   if (B) {
3001     Baij = (Mat_SeqAIJ*)B->data;
3002     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3003       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3004       for (i=0; i<B->rmap->n; i++) {
3005         nz[i] = Baij->i[i+1] - Baij->i[i];
3006       }
3007       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3008       ierr = PetscFree(nz);CHKERRQ(ierr);
3009     }
3010 
3011     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3012     shift = rend-rstart;
3013     count = 0;
3014     rowindices = NULL;
3015     colindices = NULL;
3016     if (rowemb) {
3017       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3018     }
3019     if (ocolemb) {
3020       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3021     }
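    /* Copy B's entries into aij->B one value at a time.  If aij->B was just (re)created it is in the
       "disassembled" state, so each packed off-diagonal column index at or beyond rstart is shifted up
       by shift = rend-rstart to convert B's packed numbering into C's global column numbering
       (see the header comment of this function). */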
3022     for (i=0; i<B->rmap->n; i++) {
3023       PetscInt row;
3024       row = i;
3025       if (rowindices) row = rowindices[i];
3026       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3027         col  = Baij->j[count];
3028         if (colindices) col = colindices[col];
3029         if (Bdisassembled && col>=rstart) col += shift;
3030         v    = Baij->a[count];
3031         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3032         ++count;
3033       }
3034     }
3035     /* No assembly for aij->B is necessary. */
3036     /* FIXME: set aij->B's nonzerostate correctly. */
3037   } else {
3038     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3039   }
3040   C->preallocated  = PETSC_TRUE;
3041   C->was_assembled = PETSC_FALSE;
3042   C->assembled     = PETSC_FALSE;
3043   /*
3044      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3045      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3046   */
3047   PetscFunctionReturn(0);
3048 }
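
/*
  A minimal usage sketch of the routine above (hypothetical, for illustration only; it is not part of
  this file's compiled code).  It assumes Ad and Ao are SEQAIJ matrices holding a rank's diagonal and
  off-diagonal pieces, with Ao's columns numbered 0..N-n-1 as described in the header comment, and that
  C's local sizes m,n match them; NULL embeddings select the identity maps.  The final assembly is what
  compresses aij->B's column space back into local form.

      Mat C;
      ierr = MatCreate(comm,&C);CHKERRQ(ierr);
      ierr = MatSetSizes(C,m,n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(C,MATMPIAIJ);CHKERRQ(ierr);
      ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
      ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
      ierr = MatSetSeqMats_MPIAIJ(C,NULL,NULL,NULL,DIFFERENT_NONZERO_PATTERN,Ad,Ao);CHKERRQ(ierr);
      ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/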
3049 
3050 /*
3051   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3052  */
3053 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3054 {
3055   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3056 
3057   PetscFunctionBegin;
3058   PetscValidPointer(A,2);
3059   PetscValidPointer(B,3);
3060   /* FIXME: make sure C is assembled */
3061   *A = aij->A;
3062   *B = aij->B;
3063   /* Note that we don't incref *A and *B, so be careful! */
3064   PetscFunctionReturn(0);
3065 }
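
/*
  Sketch (hypothetical) of how the two blocks are typically interpreted by a caller.  The public
  counterpart MatMPIAIJGetSeqAIJ() returns the same blocks together with garray, so local column j
  of the off-diagonal block Ao corresponds to global column garray[j] of C:

      Mat            Ad,Ao;
      const PetscInt *garray;
      PetscInt       j,gcol;
      ierr = MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);CHKERRQ(ierr);
      for (j=0; j<Ao->cmap->n; j++) gcol = garray[j];

  As noted above, the returned blocks are not referenced, so the caller must not destroy them.
*/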
3066 
3067 /*
3068   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3069   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3070 */
3071 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3072                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3073                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3074                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3075                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3076 {
3077   PetscErrorCode ierr;
3078   PetscMPIInt    size,flag;
3079   PetscInt       i,ii,cismax,ispar;
3080   Mat            *A,*B;
3081   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3082 
3083   PetscFunctionBegin;
3084   if (!ismax) PetscFunctionReturn(0);
3085 
3086   for (i = 0, cismax = 0; i < ismax; ++i) {
3087     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3088     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3089     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3090     if (size > 1) ++cismax;
3091   }
3092 
3093   /*
3094      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3095      ispar is the maximum of cismax over C's comm: it is nonzero iff at least one rank has a parallel IS.
3096   */
3097   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
3098   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3099     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3100     PetscFunctionReturn(0);
3101   }
3102 
3103   /* if (ispar) */
3104   /*
3105     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3106     These are used to extract the off-diag portion of the resulting parallel matrix.
3107     The row IS for the off-diag portion is the same as for the diag portion,
3108     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3109   */
3110   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3111   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3112   for (i = 0, ii = 0; i < ismax; ++i) {
3113     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3114     if (size > 1) {
3115       /*
3116          TODO: This is the part that's ***NOT SCALABLE***.
3117          To fix this we need to extract just the indices of C's nonzero columns
3118          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3119          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3120          to be done without serializing on the IS list, so, most likely, it is best
3121          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3122       */
3123       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3124       /* Now we have to
3125          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3126              were sorted on each rank, concatenated they might no longer be sorted;
3127          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3128              indices in the nondecreasing order to the original index positions.
3129          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3130       */
3131       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3132       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3133       ++ii;
3134     }
3135   }
3136   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3137   for (i = 0, ii = 0; i < ismax; ++i) {
3138     PetscInt       j,issize;
3139     const PetscInt *indices;
3140 
3141     /*
3142        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3143      */
3144     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3145     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3146     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3147     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3148     for (j = 1; j < issize; ++j) {
3149       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3150     }
3151     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3152     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3153     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3154     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3155     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3156     for (j = 1; j < issize; ++j) {
3157       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3158     }
3159     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3160     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3161     if (size > 1) {
3162       cisrow[ii] = isrow[i];
3163       ++ii;
3164     }
3165   }
3166   /*
3167     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3168     array of sequential matrices underlying the resulting parallel matrices.
3169     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3170     contain duplicates.
3171 
3172     There are as many diag matrices as there are original index sets. There are only as many parallel
3173     and off-diag matrices as there are parallel (comm size > 1) index sets.
3174 
3175     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3176     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3177       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3178       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3179       with A[i] and B[ii] extracted from the corresponding MPI submat.
3180     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3181       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3182       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3183       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3184       values into A[i] and B[ii] sitting inside the corresponding submat.
3185     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3186       A[i], B[ii], AA[i] or BB[ii] matrices.
3187   */
3188   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3189   if (scall == MAT_INITIAL_MATRIX) {
3190     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3191   }
3192 
3193   /* Now obtain the sequential A and B submatrices separately. */
3194   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3195   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3196   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3197 
3198   /*
3199     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3200     matrices A & B have been extracted directly into the parallel matrices containing them, or
3201     simply into the sequential matrix identical with the corresponding A (if size == 1).
3202     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3203     to have the same sparsity pattern.
3204     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3205     must be constructed for C. This is done by setseqmat(s).
3206   */
3207   for (i = 0, ii = 0; i < ismax; ++i) {
3208     /*
3209        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3210        That way we can avoid sorting and computing permutations when reusing.
3211        To this end:
3212         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3213         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3214           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3215     */
3216     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3217 
3218     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3219     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3220     if (size > 1) {
3221       if (scall == MAT_INITIAL_MATRIX) {
3222         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3223         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3224         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3225         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3226         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3227       }
3228       /*
3229         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3230       */
3231       {
3232         Mat AA = A[i],BB = B[ii];
3233 
3234         if (AA || BB) {
3235           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3236           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3237           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3238         }
3239         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3240       }
3241       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3242       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3243       ++ii;
3244     } else { /* if (size == 1) */
3245       if (scall == MAT_REUSE_MATRIX) {
3246         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3247       }
3248       if (isrow_p[i] || iscol_p[i]) {
3249         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3250         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3251         /* Otherwise A is extracted straight into (*submat)[i]. */
3252         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && scall == MAT_INITIAL_MATRIX). */
3253         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3254       } else (*submat)[i] = A[i];
3255     }
3256     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3257     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3258   }
3259   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3260   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3261   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3262   ierr = PetscFree(A);CHKERRQ(ierr);
3263   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3264   PetscFunctionReturn(0);
3265 }
3266 
3267 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3268 {
3269   PetscErrorCode ierr;
3270 
3271   PetscFunctionBegin;
3272   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3273   PetscFunctionReturn(0);
3274 }
3275
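/*
  Usage sketch (hypothetical; not compiled here): the machinery above is normally reached through the
  public interface MatCreateSubMatricesMPI(), which for MPIAIJ matrices dispatches to
  MatCreateSubMatricesMPI_MPIAIJ().  The row/column index sets may live on subcommunicators of C's
  communicator, and each extracted submatrix lives on its index sets' communicator:

      IS  rowis,colis;
      Mat *submats;
      ierr = MatCreateSubMatricesMPI(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
      ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
*/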