xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision 7d0a6c19129e7069c8a40e210b34ed62989173db)

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "../src/mat/impls/baij/mpi/mpibaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
  IS             *is_new;

  PetscFunctionBegin;
  ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
  /* Convert the indices into block format */
  ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
  if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
  ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
  ierr = PetscFree(is_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
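
/*
   Illustrative caller-side sketch (not part of this file; the row indices and
   sizes are hypothetical): growing a single index set by one level of overlap.

     IS       is[1];
     PetscInt rows[] = {0,1,2};

     ierr = ISCreateGeneral(PETSC_COMM_SELF,3,rows,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,1);CHKERRQ(ierr);   is[0] now also holds the neighbouring rows
     ierr = ISDestroy(is[0]);CHKERRQ(ierr);
*/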

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
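
/*
  Worked example of the format above (hypothetical sizes): processor A asks B
  to process 3 rows of is[1] and 2 rows of is[5].  The header occupies
  1 + 2*(no of IS) = 5 entries, so the row data starts at mesg[5]:

    mesg[0] = 2;             no of index sets in the mesg
    mesg[1] = 1;  mesg[2] = 3;         is[1], 3 rows
    mesg[3] = 5;  mesg[4] = 2;         is[5], 2 rows
    mesg[5..7] = data(is[1])
    mesg[8..9] = data(is[5])
*/
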
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*w3,*w4,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,*w2,*w1,nrqr;
  PetscInt       Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscMPIInt    *onodes1,*olengths1,*onodes2,*olengths2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  Mbs    = c->Mbs;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax+1,const PetscInt*,&idx,imax,PetscInt,&n,Mbs,PetscInt,&rtable);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create a lookup table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rangebs[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }
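
  /* Example (hypothetical layout): with size = 2 and c->rangebs = {0,4,8},
     rtable = {0,0,0,0,1,1,1,1}, so rtable[row] yields the owner of block
     row 'row' in O(1). */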

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
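
  /* Worked example: if two index sets contribute 3 and 4 rows destined for
     proc j, then w3[j] = 2 and w1[j] = 7 before this loop; afterwards
     w1[j] = 7 + 1 + 2*2 = 12, i.e. the data plus the 1 + 2*(no of IS) header. */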

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should change to use PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt  *)(data  + imax);
    d_p  = (PetscInt  *)(isz   + imax);
    t_p  = (char *)(d_p   + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }
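
  /* The single allocation above is carved up as follows (sketch):
       table : imax PetscBT pointers, one bitmap per IS
       data  : imax PetscInt* pointers, one row list per IS
       isz   : imax counters (current length of each data[i])
       d_p   : imax contiguous blocks of Mbs PetscInts (backing store for data[])
       t_p   : imax contiguous bitmaps of Mbs bits     (backing store for table[]) */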

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do the local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscMPIInt idex;
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIBAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of PetscBT bitmaps, each of size Mbs bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
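
/*
   The dedup device used throughout this routine, shown in isolation
   (illustrative sketch; the sizes and values are hypothetical):
   PetscBTLookupSet() returns the previous state of the bit and sets it,
   so each value enters the list at most once.

     PetscErrorCode ierr;
     PetscBT        seen;
     PetscInt       i,n = 0,list[16],vals[] = {3,7,3,1};

     ierr = PetscBTCreate(16,seen);CHKERRQ(ierr);
     for (i=0; i<4; i++) {
       if (!PetscBTLookupSet(seen,vals[i])) list[n++] = vals[i];
     }
     ierr = PetscBTDestroy(seen);CHKERRQ(ierr);   now list = {3,7,1}, n = 3
*/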
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  PetscInt    start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt    *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT     table_i;

  PetscFunctionBegin;
  rstart = c->rstartbs;
  cstart = c->cstartbs;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
/*
      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a re-allocation is required we need only copy the data for that one
row rather than all previous rows, as it is now where a single large chunk of
memory is used.

*/
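
/*
   The resizing scheme used below, shown in isolation: when the running count
   ct3 exhausts the estimate, the single chunk grows by a factor of 1.5 and
   the per-message pointers xdata[1..i] are rebased into the new chunk.

     if (!(ct3 < mem_estimate)) {
       new_estimate = (PetscInt)(1.5*mem_estimate)+1;
       ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
       ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
       ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
       xdata[0]     = tmp;
       mem_estimate = new_estimate; ++no_malloc;
       for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
     }
*/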
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  Mbs    = c->Mbs;
  rstart = c->rstartbs;
  cstart = c->cstartbs;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++)  {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  IS             *isrow_new,*iscol_new;
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided. It doesn't point
     out errors and might change the indices, hence buggy */

  ierr = PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);CHKERRQ(ierr);
  ierr = ISCompressIndicesGeneral(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
  ierr = ISCompressIndicesGeneral(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(isrow_new,iscol_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
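
/*
   Illustrative caller-side sketch (the index sets are hypothetical): extracting
   one sequential submatrix on every process.

     IS  isrow,iscol;
     Mat *submat;

     ierr = ISCreateStride(PETSC_COMM_SELF,5,0,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,5,0,1,&iscol);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ... work with submat[0] ...
     ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
     ierr = ISDestroy(isrow);CHKERRQ(ierr);
     ierr = ISDestroy(iscol);CHKERRQ(ierr);
*/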

#if defined (PETSC_USE_CTABLE)
#undef __FUNCT__
#define __FUNCT__ "PetscGetProc"
PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
{
  PetscInt    nGlobalNd = proc_gnode[size];
  PetscMPIInt fproc = PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));

  PetscFunctionBegin;
  if (fproc > size) fproc = size;
  while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
    if (row < proc_gnode[fproc]) fproc--;
    else                         fproc++;
  }
  *rank = fproc;
  PetscFunctionReturn(0);
}
#endif
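
/*
   Worked example (hypothetical distribution): with size = 4 and
   proc_gnode = {0,10,20,30,40}, row = 27 gives the initial guess
   fproc = (int)(27*4/40.0 + 0.5) = 3; the while loop then walks down to 2,
   since proc_gnode[2] <= 27 < proc_gnode[3].
*/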

/* -------------------------------------------------------------------------*/
/* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
  const PetscInt **irow,**icol,*irow_i;
  PetscInt       *nrow,*ncol,*w3,*w4,start;
  PetscErrorCode ierr;
  PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
  PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  PetscInt       bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
  PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
  PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
  MatScalar      *a_a=a->a,*b_a=b->a;
  PetscBool      flag;
  PetscMPIInt    *onodes1,*olengths1;

#if defined (PETSC_USE_CTABLE)
  PetscInt       tt;
  PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
#else
  PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
#endif

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
#else
  ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
  /* Create a lookup table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = c->rowners[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }
#endif

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length for all procs */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr           = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i+ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          for (l=imark; l<nzB; l++) {
            ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    /* Create col map */
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*c->Nbs*sizeof(PetscInt));CHKERRQ(ierr); /* 0 means "column not in iscol[i]" */
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
#endif
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
#else
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
#endif
    }
  }
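
  /* Example of the 1-based sentinel convention used by these maps (hypothetical
     columns): for iscol[i] = {4,9}, cmap_i[4] = 1 and cmap_i[9] = 2 while every
     other entry is 0, so "if (cmap_i[col])" tests membership and cmap_i[col]-1
     recovers the local column.  The PETSC_USE_CTABLE branch stores the same
     +1-shifted values in a PetscTable instead of a dense array. */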

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
  lens[0] = (PetscInt*)(lens + ismax);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lcol1_gcol1 = colmaps[i];
#else
    cmap_i = cmap[i];
#endif
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row    = row - rstart;
        nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
#if defined (PETSC_USE_CTABLE)
        for (k=0; k<nzA; k++) {
          ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
#else
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
#endif
      }
    }
  }
#if defined (PETSC_USE_CTABLE)
  /* Create row map */
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
  }
#else
  /* Create row map */
  ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
  rmap[0] = (PetscInt*)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
#endif
  for (i=0; i<ismax; i++) {
    irow_i = irow[i];
    jmax   = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lrow1_grow1 = rowmaps[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
    }
#else
    rmap_i = rmap[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
#endif
  }

  /* Update lens from offproc data */
  {
    PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
    PetscMPIInt ii;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[ii];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[ii];
      rbuf3_i = rbuf3[ii];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
#if defined (PETSC_USE_CTABLE)
        lcol1_gcol1 = colmaps[is_no];
        lrow1_grow1 = rowmaps[is_no];
#else
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
#endif
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
            if (tt) {
              lens_i[row]++;
            }
#else
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
#endif
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are the same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i]) || (C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initialize the matrix as if empty */
      ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
      ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt  ilen_row,*imat_ilen,*imat_j,*imat_i;
    MatScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      lrow1_grow1 = rowmaps[i];
#else
      cmap_i = cmap[i];
      rmap_i = rmap[i];
#endif
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
        ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
        proc = rtable[row];
#endif
        if (proc == rank) {
          row    = row - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          cworkA = a_j + a_i[row];
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2;
          vworkB = b_a + b_i[row]*bs2;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row = rmap_i[row + rstart];
#endif
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
              if (tcol) {
#else
              if ((tcol = cmap_i[ctmp])) {
#endif
                *mat_j++ = tcol - 1;
                ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
                mat_a   += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off-proc rows */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    MatScalar   *imat_a,*rbuf4_i;
    PetscMPIInt ii;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[ii];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[ii];
      rbuf3_i = rbuf3[ii];
      rbuf4_i = rbuf4[ii];
      for (j=1; j<=jmax; j++) {
        is_no = sbuf1_i[2*j-1];
#if defined (PETSC_USE_CTABLE)
        lrow1_grow1 = rowmaps[is_no];
        lcol1_gcol1 = colmaps[is_no];
#else
        rmap_i = rmap[is_no];
        cmap_i = cmap[is_no];
#endif
        mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
#endif
              *mat_j++ = tcol - 1;
              /* *mat_a++ = rbuf4_i[ct2]; */
              ierr     = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
#else
  ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
#endif
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(colmaps);CHKERRQ(ierr);
  ierr = PetscFree(rowmaps);CHKERRQ(ierr);
#else
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}