/* xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision 01a79839fc82a7dabb7a87cd2a8bb532c6bfa88d) */
#define PETSCMAT_DLL


/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "../src/mat/impls/baij/mpi/mpibaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
  IS             *is_new;

  PetscFunctionBegin;
  ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
  /* Convert the indices into block format */
  ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
  if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
  ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
  ierr = PetscFree(is_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
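/*
   A minimal caller-side sketch (not part of this file) of how the routine
   above is reached through the public API.  The sizes and indices are made
   up for illustration; MatIncreaseOverlap() dispatches here for MPIBAIJ
   matrices, and the IS arguments carry unblocked (point) indices that are
   compressed to block indices internally.

     Mat      A;                    hypothetical assembled MPIBAIJ matrix
     IS       is;
     PetscInt idx[] = {0,1,2,3};    hypothetical local rows of a subdomain

     ierr = ISCreateGeneral(PETSC_COMM_SELF,4,idx,PETSC_COPY_VALUES,&is);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(A,1,&is,2);CHKERRQ(ierr);   grow the subdomain by overlap 2
     ierr = ISDestroy(is);CHKERRQ(ierr);
*/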

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5], the message is laid out as

  mesg[0] = 2   (no of index sets in the mesg)
  -----------
  mesg[1] = 1   => is[1]
  mesg[2] = sizeof(is[1]);
  -----------
  mesg[3] = 5   => is[5]
  mesg[4] = sizeof(is[5]);
  -----------
  mesg[5]
  ...
  mesg[n]   data(is[1])
  -----------
  mesg[n+1]
  ...
  mesg[m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
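/*
   A concrete (hypothetical) trace of the format above: if processor A asks
   processor B to process 3 block rows {10,11,12} of is[1] and 2 block rows
   {20,21} of is[5], the outgoing buffer is

     mesg = { 2,  1,3,  5,2,  10,11,12,  20,21 }

   a header of 1 + 2*(no of IS) = 5 entries followed by the row data, 10
   entries in all; this is the length accumulated into w1[] below, where the
   data counts are summed first and w1[j] += w2[j] + 2*w3[j] adds the header.
*/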
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*w3,*w4,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,*w2,*w1,nrqr;
  PetscInt       Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscMPIInt    *onodes1,*olengths1,*onodes2,*olengths2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  Mbs    = c->Mbs;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax+1,const PetscInt*,&idx,imax,PetscInt,&n,Mbs,PetscInt,&rtable);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rangebs[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and the from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should change to use PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt  *)(data  + imax);
    d_p  = (PetscInt  *)(isz   + imax);
    t_p  = (char *)(d_p   + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }
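  /*
     A sketch of the single allocation carved up above (for orientation, not
     additional code): the block is laid out as

       table[0..imax-1] | data[0..imax-1] | isz[0..imax-1] | d_p: imax*Mbs ints | t_p: imax bit tables

     so table[i] gets its own (Mbs/PETSC_BITS_PER_BYTE+1)-char bit table inside
     t_p, and data[i] its own Mbs-entry slice of d_p; one PetscMalloc/PetscFree
     pair serves all imax index sets.
  */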

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and the from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* Receive work done on other processors */
  {
    PetscMPIInt idex;
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }


  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIBAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = Mbs bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
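/*
   In effect, for every block row already present in index set i this routine
   adds all block columns of that row: the diagonal part A contributes
   cstart+aj[k], the off-diagonal part B contributes garray[bj[k]].  A sketch
   of the per-row step (names as in the code below):

     for each r in data[i]:  data[i] U= { cstart + aj[k] : ai[r] <= k < ai[r+1] }
                             data[i] U= { garray[bj[k]]  : bi[r] <= k < bi[r+1] }

   with the bit table table[i] suppressing duplicates.
*/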
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  PetscInt    start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt    *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT     table_i;

  PetscFunctionBegin;
  rstart = c->rstartbs;
  cstart = c->cstartbs;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
/*
      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now, where a single large chunk of
memory is used.

*/
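/*
   A minimal sketch of the alternative suggested above (not implemented here):
   give each xdata[i] its own buffer, so growing one message copies only that
   message.  Names mirror this routine; cap0 (the per-message estimate) and
   the 1.5x growth factor are assumptions.

     PetscInt cap,*tmp;
     for (i=0; i<nrqr; i++) {
       cap  = cap0;
       ierr = PetscMalloc(cap*sizeof(PetscInt),&xdata[i]);CHKERRQ(ierr);
       for (ct2=0; more entries to append to message i; ) {
         if (ct2 >= cap) {            grow only xdata[i], not the others
           cap  = (PetscInt)(1.5*cap)+1;
           ierr = PetscMalloc(cap*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
           ierr = PetscMemcpy(tmp,xdata[i],ct2*sizeof(PetscInt));CHKERRQ(ierr);
           ierr = PetscFree(xdata[i]);CHKERRQ(ierr);
           xdata[i] = tmp;
         }
         xdata[i][ct2++] = next entry;
       }
       isz1[i] = ct2;
     }
*/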
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  Mbs    = c->Mbs;
  rstart = c->rstartbs;
  cstart = c->cstartbs;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++)  {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  IS             *isrow_new,*iscol_new;
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided. They do not catch
     errors and may change the indices, hence are buggy */

  ierr = PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);CHKERRQ(ierr);
  ierr = ISCompressIndicesGeneral(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
  ierr = ISCompressIndicesGeneral(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
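  /* For example (hypothetical sizes): with Nbs = 10^6 block columns and an
     8-byte PetscInt, nmax = 20000000/8000000 = 2, so ismax = 5 index sets
     would be processed in nstages_local = 5/2 + 1 = 3 stages. */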

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(isrow_new,iscol_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
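/*
   A caller-side sketch (hypothetical sizes and indices) of reaching the
   routine above through the public API; each process extracts its own
   sequential submatrix of the parallel matrix C:

     IS  isrow,iscol;
     Mat *submat;

     ierr = ISCreateGeneral(PETSC_COMM_SELF,n,rows,PETSC_COPY_VALUES,&isrow);CHKERRQ(ierr);
     ierr = ISCreateGeneral(PETSC_COMM_SELF,m,cols,PETSC_COPY_VALUES,&iscol);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
*/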

#if defined (PETSC_USE_CTABLE)
#undef __FUNCT__
#define __FUNCT__ "PetscGetProc"
PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
{
  PetscInt    nGlobalNd = proc_gnode[size];
  PetscMPIInt fproc = PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));

  PetscFunctionBegin;
  if (fproc > size) fproc = size;
  while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
    if (row < proc_gnode[fproc]) fproc--;
    else                         fproc++;
  }
  *rank = fproc;
  PetscFunctionReturn(0);
}
#endif
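/*
   PetscGetProc() guesses the owning process by linear interpolation into the
   block-row ranges and then walks to the correct one.  A hypothetical example:
   size = 4, proc_gnode = {0,10,20,30,40}, row = 27 gives the initial guess
   fproc = (int)(27*4/40 + 0.5) = 3; since proc_gnode[3] = 30 > 27 the loop
   decrements once and returns rank 2 (range [20,30)).
*/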

/* -------------------------------------------------------------------------*/
/* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
  const PetscInt **irow,**icol,*irow_i;
  PetscInt       *nrow,*ncol,*w3,*w4,start;
  PetscErrorCode ierr;
  PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
  PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  PetscInt       bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
  PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
  PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
  MatScalar      *a_a=a->a,*b_a=b->a;
  PetscBool      flag;
  PetscMPIInt    *onodes1,*olengths1;

#if defined (PETSC_USE_CTABLE)
  PetscInt       tt;
  PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
#else
  PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
#endif

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
#else
  ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = c->rowners[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }
#endif

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and the from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr           = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols  = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals  = sbuf_aa_i+ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
            }
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          for (l=imark; l<nzB; l++) {
            ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    /* Create col map */
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
    }
#else
    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
#endif
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
#else
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
#endif
    }
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
  lens[0] = (PetscInt*)(lens + ismax);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lcol1_gcol1 = colmaps[i];
#else
    cmap_i = cmap[i];
#endif
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row    = row - rstart;
        nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
#if defined (PETSC_USE_CTABLE)
        for (k=0; k<nzA; k++) {
          ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
#else
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
#endif
      }
    }
  }
#if defined (PETSC_USE_CTABLE)
  /* Create row map */
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
  for (i=0; i<ismax; i++){
    ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
  }
#else
  /* Create row map */
  ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
  rmap[0] = (PetscInt*)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
#endif
  for (i=0; i<ismax; i++) {
    irow_i = irow[i];
    jmax   = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lrow1_grow1 = rowmaps[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
    }
#else
    rmap_i = rmap[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
#endif
  }

  /* Update lens from offproc data */
  {
    PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
    PetscMPIInt ii;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[ii];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[ii];
      rbuf3_i = rbuf3[ii];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
#if defined (PETSC_USE_CTABLE)
        lcol1_gcol1 = colmaps[is_no];
        lrow1_grow1 = rowmaps[is_no];
#else
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
#endif
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
            if (tt) {
              lens_i[row]++;
            }
#else
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
#endif
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
      ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt  ilen_row,*imat_ilen,*imat_j,*imat_i;
    MatScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      lrow1_grow1 = rowmaps[i];
#else
      cmap_i = cmap[i];
      rmap_i = rmap[i];
#endif
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
        ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
#else
        proc = rtable[row];
#endif
        if (proc == rank) {
          row      = row - rstart;
          nzA      = a_i[row+1] - a_i[row];
          nzB      = b_i[row+1] - b_i[row];
          cworkA   = a_j + a_i[row];
          cworkB   = b_j + b_i[row];
          vworkA   = a_a + a_i[row]*bs2;
          vworkB   = b_a + b_i[row]*bs2;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row      = rmap_i[row + rstart];
#endif
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
              if (tcol) {
#else
              if ((tcol = cmap_i[ctmp])) {
#endif
                *mat_j++ = tcol - 1;
                ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
                mat_a   += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }

    }
  }

  /* Now assemble the off proc rows */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    MatScalar   *imat_a,*rbuf4_i;
    PetscMPIInt ii;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[ii];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[ii];
      rbuf3_i = rbuf3[ii];
      rbuf4_i = rbuf4[ii];
      for (j=1; j<=jmax; j++) {
        is_no = sbuf1_i[2*j-1];
#if defined (PETSC_USE_CTABLE)
        lrow1_grow1 = rowmaps[is_no];
        lcol1_gcol1 = colmaps[is_no];
#else
        rmap_i = rmap[is_no];
        cmap_i = cmap[is_no];
#endif
        mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
#endif
              *mat_j++ = tcol - 1;
              /* *mat_a++= rbuf4_i[ct2]; */
              ierr     = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
#else
  ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
#endif
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++){
    ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(colmaps);CHKERRQ(ierr);
  ierr = PetscFree(rowmaps);CHKERRQ(ierr);
#else
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1375