xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision 5d5aaa0ee72c8f47e5b2a2b8f343869bed14e256)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that are shared across processors.
7 */
8 #include "../src/mat/impls/baij/mpi/mpibaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
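/*
   MatIncreaseOverlap_MPIBAIJ - the point-index sets in is[] are first compressed
   to block indices (ISCompressIndicesGeneral), the overlap is grown ov times in
   terms of block rows, and the result is expanded back to point indices
   (ISExpandIndicesGeneral).

   Illustrative example (not from the code): with bs=2, the point indices {4,5,8}
   compress to the block indices {2,4}; if the overlap later adds block 3, the
   expansion turns it back into the point indices {6,7}.
*/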
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
23   IS             *is_new;
24 
25   PetscFunctionBegin;
26   ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
27   /* Convert the indices into block format */
28   ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
29   if (ov < 0) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");}
30   for (i=0; i<ov; ++i) {
31     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
32   }
33   for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
34   ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
35   for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
36   ierr = PetscFree(is_new);CHKERRQ(ierr);
37   PetscFunctionReturn(0);
38 }
39 
40 /*
41   Sample message format:
42   If a processor A wants processor B to process some elements corresponding
43   to index sets is[1], is[5]
44   mesg [0] = 2   (no of index sets in the mesg)
45   -----------
46   mesg [1] = 1 => is[1]
47   mesg [2] = sizeof(is[1]);
48   -----------
49   mesg [3] = 5  => is[5]
50   mesg [4] = sizeof(is[5]);
51   -----------
52   mesg [5]
53   mesg [n]  data(is[1])
54   -----------
55   mesg[n+1]
56   mesg[m]  data(is[5])
57   -----------
58 
59   Notes:
60   nrqs - no of requests sent (or to be sent out)
61   nrqr - no of requests received (which have to be, or which have been, processed)
62 */
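/*
   Illustrative example of the format above (not taken from a run): with two index
   sets, is[1] contributing rows {3,7} and is[5] contributing rows {4,9,11}, the
   message sent to processor B is

      mesg[0]    = 2          number of index sets in the message
      mesg[1]    = 1          first IS is is[1]
      mesg[2]    = 2          is[1] contributes 2 rows
      mesg[3]    = 5          second IS is is[5]
      mesg[4]    = 3          is[5] contributes 3 rows
      mesg[5..6] = 3,7        the rows for is[1]
      mesg[7..9] = 4,9,11     the rows for is[5]

   for a total length of 1 + 2*(no of IS) + data = 10.
*/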
63 #undef __FUNCT__
64 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
65 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
66 {
67   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
68   const PetscInt **idx,*idx_i;
69   PetscInt       *n,*w3,*w4,*rtable,**data,len;
70   PetscErrorCode ierr;
71   PetscMPIInt    size,rank,tag1,tag2,*w2,*w1,nrqr;
72   PetscInt       Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
73   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
74   PetscMPIInt    *onodes1,*olengths1,*onodes2,*olengths2;
75   PetscBT        *table;
76   MPI_Comm       comm;
77   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
78   MPI_Status     *s_status,*recv_status;
79 
80   PetscFunctionBegin;
81   comm   = ((PetscObject)C)->comm;
82   size   = c->size;
83   rank   = c->rank;
84   Mbs    = c->Mbs;
85 
86   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
87   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
88 
89   ierr = PetscMalloc3(imax+1,const PetscInt*,&idx,imax,PetscInt,&n,Mbs,PetscInt,&rtable);CHKERRQ(ierr);
90   for (i=0; i<imax; i++) {
91     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
92     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
93   }
94 
95   /* Create hash table for the mapping: row -> proc */
96   for (i=0,j=0; i<size; i++) {
97     len = c->rangebs[i+1];
98     for (; j<len; j++) {
99       rtable[j] = i;
100     }
101   }
102 
103   /* evaluate communication - mesg to whom, length of mesg, and buffer space
104      required. Based on this, buffers are allocated, and data copied into them */
105   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
106   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
107   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
108   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
109   for (i=0; i<imax; i++) {
110     ierr  = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
111     idx_i = idx[i];
112     len   = n[i];
113     for (j=0; j<len; j++) {
114       row  = idx_i[j];
115       if (row < 0) {
116         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
117       }
118       proc = rtable[row];
119       w4[proc]++;
120     }
121     for (j=0; j<size; j++){
122       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
123     }
124   }
125 
126   nrqs     = 0;              /* no of outgoing messages */
127   msz      = 0;              /* total mesg length (for all procs) */
128   w1[rank] = 0;              /* no mesg sent to itself */
129   w3[rank] = 0;
130   for (i=0; i<size; i++) {
131     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
132   }
133   /* pa - is list of processors to communicate with */
134   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
135   for (i=0,j=0; i<size; i++) {
136     if (w1[i]) {pa[j] = i; j++;}
137   }
138 
139   /* Each message would have a header = 1 + 2*(no of IS) + data */
140   for (i=0; i<nrqs; i++) {
141     j      = pa[i];
142     w1[j] += w2[j] + 2*w3[j];
143     msz   += w1[j];
144   }
145 
146   /* Determine the number of messages to expect, their lengths, and the from-ids */
147   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
148   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
149 
150   /* Now post the Irecvs corresponding to these messages */
151   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
152 
153   /* Allocate Memory for outgoing messages */
154   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
155   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
156   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
157   {
158     PetscInt *iptr = tmp,ict  = 0;
159     for (i=0; i<nrqs; i++) {
160       j         = pa[i];
161       iptr     +=  ict;
162       outdat[j] = iptr;
163       ict       = w1[j];
164     }
165   }
166 
167   /* Form the outgoing messages */
168   /*plug in the headers*/
169   for (i=0; i<nrqs; i++) {
170     j            = pa[i];
171     outdat[j][0] = 0;
172     ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
173     ptr[j]       = outdat[j] + 2*w3[j] + 1;
174   }
175 
176   /* Memory for doing local proc's work*/
177   {
178     PetscInt  *d_p;
179     char      *t_p;
180 
181     /* should change to use PetscMallocN() */
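    /* A single allocation is carved into: imax bit tables (table[]), imax data
       pointers (data[]), imax counters (isz[]), imax*Mbs integers of index
       workspace, and imax bitmaps of Mbs bits each backing the tables. */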
182     ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
183       (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);CHKERRQ(ierr);
184     ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
185       (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
186     data = (PetscInt **)(table + imax);
187     isz  = (PetscInt  *)(data  + imax);
188     d_p  = (PetscInt  *)(isz   + imax);
189     t_p  = (char *)(d_p   + Mbs*imax);
190     for (i=0; i<imax; i++) {
191       table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
192       data[i]  = d_p + (Mbs)*i;
193     }
194   }
195 
196   /* Parse the IS and update local tables and the outgoing buf with the data*/
197   {
198     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
199     PetscBT  table_i;
200 
201     for (i=0; i<imax; i++) {
202       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
203       n_i     = n[i];
204       table_i = table[i];
205       idx_i   = idx[i];
206       data_i  = data[i];
207       isz_i   = isz[i];
208       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
209         row  = idx_i[j];
210         proc = rtable[row];
211         if (proc != rank) { /* copy to the outgoing buffer */
212           ctr[proc]++;
213           *ptr[proc] = row;
214           ptr[proc]++;
215         } else { /* Update the local table */
216           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
217         }
218       }
219       /* Update the headers for the current IS */
220       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
221         if ((ctr_j = ctr[j])) {
222           outdat_j        = outdat[j];
223           k               = ++outdat_j[0];
224           outdat_j[2*k]   = ctr_j;
225           outdat_j[2*k-1] = i;
226         }
227       }
228       isz[i] = isz_i;
229     }
230   }
231 
232   /*  Now  post the sends */
233   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
234   for (i=0; i<nrqs; ++i) {
235     j    = pa[i];
236     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
237   }
238 
239   /* No longer need the original indices*/
240   for (i=0; i<imax; ++i) {
241     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
242   }
243   ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);
244 
245   for (i=0; i<imax; ++i) {
246     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
247   }
248 
249   /* Do Local work*/
250   ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
251 
252   /* Receive messages*/
253   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
254   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
255 
256   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
257   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
258 
259   /* Phase 1 sends are complete - deallocate buffers */
260   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
261   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
262 
263   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
264   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
265   ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
266   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
267   ierr = PetscFree(rbuf);CHKERRQ(ierr);
268 
269   /* Send the data back*/
270   /* Do a global reduction to know the buffer space req for incoming messages*/
271   {
272     PetscMPIInt *rw1;
273 
274     ierr = PetscMalloc(size*sizeof(PetscInt),&rw1);CHKERRQ(ierr);
275     ierr = PetscMemzero(rw1,size*sizeof(PetscInt));CHKERRQ(ierr);
276 
277     for (i=0; i<nrqr; ++i) {
278       proc      = recv_status[i].MPI_SOURCE;
279       if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
280       rw1[proc] = isz1[i];
281     }
282 
283     ierr = PetscFree(onodes1);CHKERRQ(ierr);
284     ierr = PetscFree(olengths1);CHKERRQ(ierr);
285 
286     /* Determine the number of messages to expect, their lengths, and the from-ids */
287     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
288     ierr = PetscFree(rw1);CHKERRQ(ierr);
289   }
290   /* Now post the Irecvs corresponding to these messages */
291   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
292 
293   /*  Now  post the sends */
294   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
295   for (i=0; i<nrqr; ++i) {
296     j    = recv_status[i].MPI_SOURCE;
297     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
298   }
299 
300   /* receive work done on other processors*/
301   {
302     PetscMPIInt idex;
303     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
304     PetscBT     table_i;
305     MPI_Status  *status2;
306 
307     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
308     for (i=0; i<nrqs; ++i) {
309       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
310       /* Process the message*/
311       rbuf2_i = rbuf2[idex];
312       ct1     = 2*rbuf2_i[0]+1;
313       jmax    = rbuf2[idex][0];
314       for (j=1; j<=jmax; j++) {
315         max     = rbuf2_i[2*j];
316         is_no   = rbuf2_i[2*j-1];
317         isz_i   = isz[is_no];
318         data_i  = data[is_no];
319         table_i = table[is_no];
320         for (k=0; k<max; k++,ct1++) {
321           row = rbuf2_i[ct1];
322           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
323         }
324         isz[is_no] = isz_i;
325       }
326     }
327     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
328     ierr = PetscFree(status2);CHKERRQ(ierr);
329   }
330 
331   for (i=0; i<imax; ++i) {
332     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
333   }
334 
335 
336   ierr = PetscFree(onodes2);CHKERRQ(ierr);
337   ierr = PetscFree(olengths2);CHKERRQ(ierr);
338 
339   ierr = PetscFree(pa);CHKERRQ(ierr);
340   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
341   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
342   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
343   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
344   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
345   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
346   ierr = PetscFree(table);CHKERRQ(ierr);
347   ierr = PetscFree(s_status);CHKERRQ(ierr);
348   ierr = PetscFree(recv_status);CHKERRQ(ierr);
349   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
350   ierr = PetscFree(xdata);CHKERRQ(ierr);
351   ierr = PetscFree(isz1);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
357 /*
358    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
359        the work on the local processor.
360 
361      Inputs:
362       C      - MAT_MPIBAIJ;
363       imax - total no of index sets processed at a time;
364       table  - an array of PetscBT bit tables, each of size Mbs bits.
365 
366      Output:
367       isz    - array containing the count of the solution elements corresponding
368                to each index set;
369       data   - pointer to the solutions
370 */
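/*
   For every block row already present in data[i], the routine below adds all
   block-column indices of that row: columns of the diagonal part A shifted by
   cstart, and columns of the off-diagonal part B translated through garray to
   global block numbering.  Each call therefore grows the overlap by one level.
*/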
371 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
372 {
373   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
374   Mat         A = c->A,B = c->B;
375   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
376   PetscInt    start,end,val,max,rstart,cstart,*ai,*aj;
377   PetscInt    *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
378   PetscBT     table_i;
379 
380   PetscFunctionBegin;
381   rstart = c->rstartbs;
382   cstart = c->cstartbs;
383   ai     = a->i;
384   aj     = a->j;
385   bi     = b->i;
386   bj     = b->j;
387   garray = c->garray;
388 
389 
390   for (i=0; i<imax; i++) {
391     data_i  = data[i];
392     table_i = table[i];
393     isz_i   = isz[i];
394     for (j=0,max=isz[i]; j<max; j++) {
395       row   = data_i[j] - rstart;
396       start = ai[row];
397       end   = ai[row+1];
398       for (k=start; k<end; k++) { /* Amat */
399         val = aj[k] + cstart;
400         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
401       }
402       start = bi[row];
403       end   = bi[row+1];
404       for (k=start; k<end; k++) { /* Bmat */
405         val = garray[bj[k]];
406         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
407       }
408     }
409     isz[i] = isz_i;
410   }
411   PetscFunctionReturn(0);
412 }
413 #undef __FUNCT__
414 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
415 /*
416       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
417          and return the output
418 
419          Input:
420            C    - the matrix
421            nrqr - no of messages being processed.
422            rbuf - an array of pointers to the received requests
423 
424          Output:
425            xdata - array of messages to be sent back
426            isz1  - size of each message
427 
428   For better efficiency perhaps we should malloc each xdata[i] separately;
429 then, if a remalloc is required, we need only copy the data for that one row
430 rather than for all previous rows, as happens now where a single large chunk of
431 memory is used.
432 
433 */
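/*
   The reply stored in xdata[i] mirrors the request format described for the
   first phase: xdata[i][0] is the number of index sets, followed by
   (IS number, count) pairs and then the expanded row lists; isz1[i] is the
   total length of the reply sent back to the requesting process.
*/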
434 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
435 {
436   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
437   Mat            A = c->A,B = c->B;
438   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
439   PetscErrorCode ierr;
440   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
441   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
442   PetscInt       val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
443   PetscInt       *rbuf_i,kmax,rbuf_0;
444   PetscBT        xtable;
445 
446   PetscFunctionBegin;
447   rank   = c->rank;
448   Mbs    = c->Mbs;
449   rstart = c->rstartbs;
450   cstart = c->cstartbs;
451   ai     = a->i;
452   aj     = a->j;
453   bi     = b->i;
454   bj     = b->j;
455   garray = c->garray;
456 
457 
458   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
459     rbuf_i  =  rbuf[i];
460     rbuf_0  =  rbuf_i[0];
461     ct     += rbuf_0;
462     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
463   }
464 
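  /* Estimate the reply buffer size as three times the larger of the total
     number of requested block rows and ct*(average nonzero blocks per block
     row), where ct is the total number of index sets received.  If the
     estimate is exceeded, the buffer is grown by a factor of 1.5 below and
     the xdata[] pointers are rebased into the new allocation. */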
465   if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
466   else        max1 = 1;
467   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
468   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
469   ++no_malloc;
470   ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
471   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
472 
473   ct3 = 0;
474   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
475     rbuf_i =  rbuf[i];
476     rbuf_0 =  rbuf_i[0];
477     ct1    =  2*rbuf_0+1;
478     ct2    =  ct1;
479     ct3    += ct1;
480     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
481       ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
482       oct2 = ct2;
483       kmax = rbuf_i[2*j];
484       for (k=0; k<kmax; k++,ct1++) {
485         row = rbuf_i[ct1];
486         if (!PetscBTLookupSet(xtable,row)) {
487           if (!(ct3 < mem_estimate)) {
488             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
489             ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
490             ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
491             ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
492             xdata[0]     = tmp;
493             mem_estimate = new_estimate; ++no_malloc;
494             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
495           }
496           xdata[i][ct2++] = row;
497           ct3++;
498         }
499       }
500       for (k=oct2,max2=ct2; k<max2; k++)  {
501         row   = xdata[i][k] - rstart;
502         start = ai[row];
503         end   = ai[row+1];
504         for (l=start; l<end; l++) {
505           val = aj[l] + cstart;
506           if (!PetscBTLookupSet(xtable,val)) {
507             if (!(ct3 < mem_estimate)) {
508               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
509               ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
510               ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
511               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
512               xdata[0]     = tmp;
513               mem_estimate = new_estimate; ++no_malloc;
514               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
515             }
516             xdata[i][ct2++] = val;
517             ct3++;
518           }
519         }
520         start = bi[row];
521         end   = bi[row+1];
522         for (l=start; l<end; l++) {
523           val = garray[bj[l]];
524           if (!PetscBTLookupSet(xtable,val)) {
525             if (!(ct3 < mem_estimate)) {
526               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
527               ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
528               ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
529               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
530               xdata[0]     = tmp;
531               mem_estimate = new_estimate; ++no_malloc;
532               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
533             }
534             xdata[i][ct2++] = val;
535             ct3++;
536           }
537         }
538       }
539       /* Update the header*/
540       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
541       xdata[i][2*j-1] = rbuf_i[2*j-1];
542     }
543     xdata[i][0] = rbuf_0;
544     xdata[i+1]  = xdata[i] + ct2;
545     isz1[i]     = ct2; /* size of each message */
546   }
547   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
548   ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
549   PetscFunctionReturn(0);
550 }
551 
552 static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
553 
554 #undef __FUNCT__
555 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
556 PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
557 {
558   IS             *isrow_new,*iscol_new;
559   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
560   PetscErrorCode ierr;
561   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
562 
563   PetscFunctionBegin;
564   /* The compression and expansion should be avoided. It does not point
565      out errors and might change the indices, hence it is buggy */
566 
567   ierr = PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);CHKERRQ(ierr);
568   ierr = ISCompressIndicesGeneral(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
569   ierr = ISCompressIndicesGeneral(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);
570 
571   /* Allocate memory to hold all the submatrices */
572   if (scall != MAT_REUSE_MATRIX) {
573     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
574   }
575   /* Determine the number of stages through which submatrices are done */
576   nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
577   if (!nmax) nmax = 1;
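  /* nmax is the number of index sets handled per stage; it is chosen so that
     the O(nmax*Nbs) integer workspace used per stage (for example the cmap
     workspace in the non-CTABLE path of MatGetSubMatrices_MPIBAIJ_local)
     stays around 20 MB. */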
578   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
579 
580   /* Make sure every processor loops through the nstages */
581   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
582   for (i=0,pos=0; i<nstages; i++) {
583     if (pos+nmax <= ismax) max_no = nmax;
584     else if (pos == ismax) max_no = 0;
585     else                   max_no = ismax-pos;
586     ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
587     pos += max_no;
588   }
589 
590   for (i=0; i<ismax; i++) {
591     ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
592     ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
593   }
594   ierr = PetscFree2(isrow_new,iscol_new);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 #if defined (PETSC_USE_CTABLE)
599 #undef __FUNCT__
600 #define __FUNCT__ "PetscGetProc"
601 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
602 {
603   PetscInt    nGlobalNd = proc_gnode[size];
604   PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
605 
606   PetscFunctionBegin;
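  /* Initial guess assuming block rows are distributed roughly uniformly over the
     processes; then walk up or down until row falls in the owning range
     [proc_gnode[fproc], proc_gnode[fproc+1]) */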
607   if (fproc > size) fproc = size;
608   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
609     if (row < proc_gnode[fproc]) fproc--;
610     else                         fproc++;
611   }
612   *rank = fproc;
613   PetscFunctionReturn(0);
614 }
615 #endif
616 
617 /* -------------------------------------------------------------------------*/
618 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
619 #undef __FUNCT__
620 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
621 static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
622 {
623   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
624   Mat            A = c->A;
625   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
626   const PetscInt **irow,**icol,*irow_i;
627   PetscInt       *nrow,*ncol,*w3,*w4,start;
628   PetscErrorCode ierr;
629   PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
630   PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
631   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
632   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
633   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
634   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
635   PetscInt       bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
636   PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
637   PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
638   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
639   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
640   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
641   MPI_Status     *r_status3,*r_status4,*s_status4;
642   MPI_Comm       comm;
643   MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
644   MatScalar      *a_a=a->a,*b_a=b->a;
645   PetscTruth     flag;
646   PetscMPIInt    *onodes1,*olengths1;
647 
648 #if defined (PETSC_USE_CTABLE)
649   PetscInt       tt;
650   PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
651 #else
652   PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
653 #endif
654 
655   PetscFunctionBegin;
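  /* Communication plan: tag0 carries the lists of requested block rows,
     tag1 returns the per-row column counts (so receive buffers can be sized),
     tag2 returns the column indices, and tag3 returns the bs2-sized blocks of
     values; the submatrices are then assembled from local and received data. */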
656   comm   = ((PetscObject)C)->comm;
657   tag0   = ((PetscObject)C)->tag;
658   size   = c->size;
659   rank   = c->rank;
660 
661   /* Get some new tags to keep the communication clean */
662   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
663   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
664   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
665 
666 #if defined(PETSC_USE_CTABLE)
667   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
668 #else
669   ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
670   /* Create hash table for the mapping: row -> proc */
671   for (i=0,j=0; i<size; i++) {
672     jmax = c->rowners[i+1];
673     for (; j<jmax; j++) {
674       rtable[j] = i;
675     }
676   }
677 #endif
678 
679   for (i=0; i<ismax; i++) {
680     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
681     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
682     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
683     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
684   }
685 
686   /* evaluate communication - mesg to whom, length of mesg, and buffer space
687      required. Based on this, buffers are allocated, and data copied into them */
688   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
689   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
690   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
691   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
692   for (i=0; i<ismax; i++) {
693     ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
694     jmax   = nrow[i];
695     irow_i = irow[i];
696     for (j=0; j<jmax; j++) {
697       row  = irow_i[j];
698 #if defined (PETSC_USE_CTABLE)
699       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
700 #else
701       proc = rtable[row];
702 #endif
703       w4[proc]++;
704     }
705     for (j=0; j<size; j++) {
706       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
707     }
708   }
709 
710   nrqs     = 0;              /* no of outgoing messages */
711   msz      = 0;              /* total mesg length for all procs */
712   w1[rank] = 0;              /* no mesg sent to itself */
713   w3[rank] = 0;
714   for (i=0; i<size; i++) {
715     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
716   }
717   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
718   for (i=0,j=0; i<size; i++) {
719     if (w1[i]) { pa[j] = i; j++; }
720   }
721 
722   /* Each message would have a header = 1 + 2*(no of IS) + data */
723   for (i=0; i<nrqs; i++) {
724     j     = pa[i];
725     w1[j] += w2[j] + 2* w3[j];
726     msz   += w1[j];
727   }
728 
729   /* Determine the number of messages to expect, their lengths, and the from-ids */
730   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
731   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
732 
733   /* Now post the Irecvs corresponding to these messages */
734   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
735 
736   ierr = PetscFree(onodes1);CHKERRQ(ierr);
737   ierr = PetscFree(olengths1);CHKERRQ(ierr);
738 
739   /* Allocate Memory for outgoing messages */
740   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
741   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
742   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
743   {
744     PetscInt *iptr = tmp,ict = 0;
745     for (i=0; i<nrqs; i++) {
746       j         = pa[i];
747       iptr     += ict;
748       sbuf1[j]  = iptr;
749       ict       = w1[j];
750     }
751   }
752 
753   /* Form the outgoing messages */
754   /* Initialise the header space */
755   for (i=0; i<nrqs; i++) {
756     j           = pa[i];
757     sbuf1[j][0] = 0;
758     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
759     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
760   }
761 
762   /* Parse the isrow and copy data into outbuf */
763   for (i=0; i<ismax; i++) {
764     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
765     irow_i = irow[i];
766     jmax   = nrow[i];
767     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
768       row  = irow_i[j];
769 #if defined (PETSC_USE_CTABLE)
770       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
771 #else
772       proc = rtable[row];
773 #endif
774       if (proc != rank) { /* copy to the outgoing buf*/
775         ctr[proc]++;
776         *ptr[proc] = row;
777         ptr[proc]++;
778       }
779     }
780     /* Update the headers for the current IS */
781     for (j=0; j<size; j++) { /* Can Optimise this loop too */
782       if ((ctr_j = ctr[j])) {
783         sbuf1_j        = sbuf1[j];
784         k              = ++sbuf1_j[0];
785         sbuf1_j[2*k]   = ctr_j;
786         sbuf1_j[2*k-1] = i;
787       }
788     }
789   }
790 
791   /*  Now  post the sends */
792   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
793   for (i=0; i<nrqs; ++i) {
794     j = pa[i];
795     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
796   }
797 
798   /* Post Receives to capture the buffer size */
799   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
800   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
801   rbuf2[0] = tmp + msz;
802   for (i=1; i<nrqs; ++i) {
803     j        = pa[i];
804     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
805   }
806   for (i=0; i<nrqs; ++i) {
807     j    = pa[i];
808     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
809   }
810 
811   /* Send to other procs the buf size they should allocate */
812 
813   /* Receive messages*/
814   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
815   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
816   ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
817   {
818     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
819     PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
820 
821     for (i=0; i<nrqr; ++i) {
822       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
823       req_size[idex] = 0;
824       rbuf1_i         = rbuf1[idex];
825       start           = 2*rbuf1_i[0] + 1;
826       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
827       ierr            = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
828       sbuf2_i         = sbuf2[idex];
829       for (j=start; j<end; j++) {
830         id               = rbuf1_i[j] - rstart;
831         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
832         sbuf2_i[j]       = ncols;
833         req_size[idex] += ncols;
834       }
835       req_source[idex] = r_status1[i].MPI_SOURCE;
836       /* form the header */
837       sbuf2_i[0]   = req_size[idex];
838       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
839       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
840     }
841   }
842   ierr = PetscFree(r_status1);CHKERRQ(ierr);
843   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
844 
845   /*  recv buffer sizes */
846   /* Receive messages*/
847 
848   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
849   ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
850   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
851   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
852   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
853 
854   for (i=0; i<nrqs; ++i) {
855     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
856     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
857     ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
858     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
859     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
860   }
861   ierr = PetscFree(r_status2);CHKERRQ(ierr);
862   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
863 
864   /* Wait on sends1 and sends2 */
865   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
866   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
867 
868   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
869   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
870   ierr = PetscFree(s_status1);CHKERRQ(ierr);
871   ierr = PetscFree(s_status2);CHKERRQ(ierr);
872   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
873   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
874 
875   /* Now allocate buffers for a->j, and send them off */
876   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
877   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
878   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
879   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
880 
881   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
882   {
883      for (i=0; i<nrqr; i++) {
884       rbuf1_i   = rbuf1[i];
885       sbuf_aj_i = sbuf_aj[i];
886       ct1       = 2*rbuf1_i[0] + 1;
887       ct2       = 0;
888       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
889         kmax = rbuf1[i][2*j];
890         for (k=0; k<kmax; k++,ct1++) {
891           row    = rbuf1_i[ct1] - rstart;
892           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
893           ncols  = nzA + nzB;
894           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
895 
896           /* load the column indices for this row into cols*/
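          /* bmap (= garray) is sorted, so emitting the B columns below cstart,
             then the A columns shifted by cstart, then the remaining B columns
             leaves cols[] in ascending global block order */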
897           cols  = sbuf_aj_i + ct2;
898           for (l=0; l<nzB; l++) {
899             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
900             else break;
901           }
902           imark = l;
903           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
904           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
905           ct2 += ncols;
906         }
907       }
908       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
909     }
910   }
911   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
912   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
913 
914   /* Allocate buffers for a->a, and send them off */
915   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
916   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
917   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
918   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
919 
920   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
921   {
922     for (i=0; i<nrqr; i++) {
923       rbuf1_i   = rbuf1[i];
924       sbuf_aa_i = sbuf_aa[i];
925       ct1       = 2*rbuf1_i[0]+1;
926       ct2       = 0;
927       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
928         kmax = rbuf1_i[2*j];
929         for (k=0; k<kmax; k++,ct1++) {
930           row    = rbuf1_i[ct1] - rstart;
931           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
932           ncols  = nzA + nzB;
933           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
934           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
935 
936           /* load the column values for this row into vals*/
937           vals  = sbuf_aa_i+ct2*bs2;
938           for (l=0; l<nzB; l++) {
939             if ((bmap[cworkB[l]]) < cstart) {
940               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
941             }
942             else break;
943           }
944           imark = l;
945           for (l=0; l<nzA; l++) {
946             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
947           }
948           for (l=imark; l<nzB; l++) {
949             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
950           }
951           ct2 += ncols;
952         }
953       }
954       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
955     }
956   }
957   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
958   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
959   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
960   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
961 
962   /* Form the matrix */
963   /* create col map */
964   {
965     const PetscInt *icol_i;
966 #if defined (PETSC_USE_CTABLE)
967     /* Create row map*/
968     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
969     for (i=0; i<ismax; i++) {
970       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
971     }
972 #else
973     ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
974     ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
975     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
976 #endif
977     for (i=0; i<ismax; i++) {
978       jmax   = ncol[i];
979       icol_i = icol[i];
980 #if defined (PETSC_USE_CTABLE)
981       lcol1_gcol1 = colmaps[i];
982       for (j=0; j<jmax; j++) {
983 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
984       }
985 #else
986       cmap_i = cmap[i];
987       for (j=0; j<jmax; j++) {
988         cmap_i[icol_i[j]] = j+1;
989       }
990 #endif
991     }
992   }
993 
994   /* Create lens which is required for MatCreate... */
995   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
996   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
997   lens[0] = (PetscInt*)(lens + ismax);
998   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
999   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1000 
1001   /* Update lens from local data */
1002   for (i=0; i<ismax; i++) {
1003     jmax   = nrow[i];
1004 #if defined (PETSC_USE_CTABLE)
1005     lcol1_gcol1 = colmaps[i];
1006 #else
1007     cmap_i = cmap[i];
1008 #endif
1009     irow_i = irow[i];
1010     lens_i = lens[i];
1011     for (j=0; j<jmax; j++) {
1012       row  = irow_i[j];
1013 #if defined (PETSC_USE_CTABLE)
1014       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
1015 #else
1016       proc = rtable[row];
1017 #endif
1018       if (proc == rank) {
1019         /* Get indices from matA and then from matB */
1020         row    = row - rstart;
1021         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1022         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
1023 #if defined (PETSC_USE_CTABLE)
1024        for (k=0; k<nzA; k++) {
1025 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
1026           if (tt) { lens_i[j]++; }
1027         }
1028         for (k=0; k<nzB; k++) {
1029 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
1030           if (tt) { lens_i[j]++; }
1031         }
1032 #else
1033         for (k=0; k<nzA; k++) {
1034           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1035         }
1036         for (k=0; k<nzB; k++) {
1037           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1038         }
1039 #endif
1040       }
1041     }
1042   }
1043 #if defined (PETSC_USE_CTABLE)
1044   /* Create row map*/
1045   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
1046   for (i=0; i<ismax; i++){
1047     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
1048   }
1049 #else
1050   /* Create row map*/
1051   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
1052   rmap[0] = (PetscInt*)(rmap + ismax);
1053   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
1054   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1055 #endif
1056   for (i=0; i<ismax; i++) {
1057     irow_i = irow[i];
1058     jmax   = nrow[i];
1059 #if defined (PETSC_USE_CTABLE)
1060     lrow1_grow1 = rowmaps[i];
1061     for (j=0; j<jmax; j++) {
1062       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
1063     }
1064 #else
1065     rmap_i = rmap[i];
1066     for (j=0; j<jmax; j++) {
1067       rmap_i[irow_i[j]] = j;
1068     }
1069 #endif
1070   }
1071 
1072   /* Update lens from offproc data */
1073   {
1074     PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
1075     PetscMPIInt ii;
1076 
1077     for (tmp2=0; tmp2<nrqs; tmp2++) {
1078       ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
1079       idex   = pa[ii];
1080       sbuf1_i = sbuf1[idex];
1081       jmax    = sbuf1_i[0];
1082       ct1     = 2*jmax+1;
1083       ct2     = 0;
1084       rbuf2_i = rbuf2[ii];
1085       rbuf3_i = rbuf3[ii];
1086       for (j=1; j<=jmax; j++) {
1087         is_no   = sbuf1_i[2*j-1];
1088         max1    = sbuf1_i[2*j];
1089         lens_i  = lens[is_no];
1090 #if defined (PETSC_USE_CTABLE)
1091 	lcol1_gcol1 = colmaps[is_no];
1092 	lrow1_grow1 = rowmaps[is_no];
1093 #else
1094         cmap_i  = cmap[is_no];
1095 	rmap_i  = rmap[is_no];
1096 #endif
1097         for (k=0; k<max1; k++,ct1++) {
1098 #if defined (PETSC_USE_CTABLE)
1099 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1100           row--;
1101           if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1102 #else
1103           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1104 #endif
1105           max2 = rbuf2_i[ct1];
1106           for (l=0; l<max2; l++,ct2++) {
1107 #if defined (PETSC_USE_CTABLE)
1108 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
1109 	    if (tt) {
1110               lens_i[row]++;
1111             }
1112 #else
1113             if (cmap_i[rbuf3_i[ct2]]) {
1114               lens_i[row]++;
1115             }
1116 #endif
1117           }
1118         }
1119       }
1120     }
1121   }
1122   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1123   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1124   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1125   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1126   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1127 
1128   /* Create the submatrices */
1129   if (scall == MAT_REUSE_MATRIX) {
1130     /*
1131         Assumes new rows are same length as the old rows, hence bug!
1132     */
1133     for (i=0; i<ismax; i++) {
1134       mat = (Mat_SeqBAIJ *)(submats[i]->data);
1135       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) {
1136         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1137       }
1138       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
1139       if (!flag) {
1140         SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1141       }
1142       /* Initialize the matrix as if it were empty */
1143       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
1144       submats[i]->factor = C->factor;
1145     }
1146   } else {
1147     for (i=0; i<ismax; i++) {
1148       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1149       ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
1150       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1151       ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
1152       ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
1153     }
1154   }
1155 
1156   /* Assemble the matrices */
1157   /* First assemble the local rows */
1158   {
1159     PetscInt       ilen_row,*imat_ilen,*imat_j,*imat_i;
1160     MatScalar *imat_a;
1161 
1162     for (i=0; i<ismax; i++) {
1163       mat       = (Mat_SeqBAIJ*)submats[i]->data;
1164       imat_ilen = mat->ilen;
1165       imat_j    = mat->j;
1166       imat_i    = mat->i;
1167       imat_a    = mat->a;
1168 
1169 #if defined (PETSC_USE_CTABLE)
1170       lcol1_gcol1 = colmaps[i];
1171       lrow1_grow1 = rowmaps[i];
1172 #else
1173       cmap_i    = cmap[i];
1174       rmap_i    = rmap[i];
1175 #endif
1176       irow_i    = irow[i];
1177       jmax      = nrow[i];
1178       for (j=0; j<jmax; j++) {
1179         row      = irow_i[j];
1180 #if defined (PETSC_USE_CTABLE)
1181 	ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
1182 #else
1183 	proc = rtable[row];
1184 #endif
1185         if (proc == rank) {
1186           row      = row - rstart;
1187           nzA      = a_i[row+1] - a_i[row];
1188           nzB      = b_i[row+1] - b_i[row];
1189           cworkA   = a_j + a_i[row];
1190           cworkB   = b_j + b_i[row];
1191           vworkA   = a_a + a_i[row]*bs2;
1192           vworkB   = b_a + b_i[row]*bs2;
1193 #if defined (PETSC_USE_CTABLE)
1194 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
1195           row--;
1196           if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1197 #else
1198           row      = rmap_i[row + rstart];
1199 #endif
1200           mat_i    = imat_i[row];
1201           mat_a    = imat_a + mat_i*bs2;
1202           mat_j    = imat_j + mat_i;
1203           ilen_row = imat_ilen[row];
1204 
1205           /* load the column indices for this row into cols*/
1206           for (l=0; l<nzB; l++) {
1207 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
1208 #if defined (PETSC_USE_CTABLE)
1209 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
1210 	      if (tcol) {
1211 #else
1212               if ((tcol = cmap_i[ctmp])) {
1213 #endif
1214                 *mat_j++ = tcol - 1;
1215                 ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1216                 mat_a   += bs2;
1217                 ilen_row++;
1218               }
1219             } else break;
1220           }
1221           imark = l;
1222           for (l=0; l<nzA; l++) {
1223 #if defined (PETSC_USE_CTABLE)
1224 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
1225 	    if (tcol) {
1226 #else
1227 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
1228 #endif
1229               *mat_j++ = tcol - 1;
1230               ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1231               mat_a   += bs2;
1232               ilen_row++;
1233             }
1234           }
1235           for (l=imark; l<nzB; l++) {
1236 #if defined (PETSC_USE_CTABLE)
1237 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
1238 	    if (tcol) {
1239 #else
1240             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1241 #endif
1242               *mat_j++ = tcol - 1;
1243               ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1244               mat_a   += bs2;
1245               ilen_row++;
1246             }
1247           }
1248           imat_ilen[row] = ilen_row;
1249         }
1250       }
1251 
1252     }
1253   }
1254 
1255   /*   Now assemble the off proc rows*/
1256   {
1257     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1258     PetscInt    *imat_j,*imat_i;
1259     MatScalar   *imat_a,*rbuf4_i;
1260     PetscMPIInt ii;
1261 
1262     for (tmp2=0; tmp2<nrqs; tmp2++) {
1263       ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
1264       idex   = pa[ii];
1265       sbuf1_i = sbuf1[idex];
1266       jmax    = sbuf1_i[0];
1267       ct1     = 2*jmax + 1;
1268       ct2     = 0;
1269       rbuf2_i = rbuf2[ii];
1270       rbuf3_i = rbuf3[ii];
1271       rbuf4_i = rbuf4[ii];
1272       for (j=1; j<=jmax; j++) {
1273         is_no     = sbuf1_i[2*j-1];
1274 #if defined (PETSC_USE_CTABLE)
1275 	lrow1_grow1 = rowmaps[is_no];
1276 	lcol1_gcol1 = colmaps[is_no];
1277 #else
1278         rmap_i    = rmap[is_no];
1279         cmap_i    = cmap[is_no];
1280 #endif
1281         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
1282         imat_ilen = mat->ilen;
1283         imat_j    = mat->j;
1284         imat_i    = mat->i;
1285         imat_a    = mat->a;
1286         max1      = sbuf1_i[2*j];
1287         for (k=0; k<max1; k++,ct1++) {
1288           row   = sbuf1_i[ct1];
1289 #if defined (PETSC_USE_CTABLE)
1290 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
1291           row--;
1292           if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1293 #else
1294           row   = rmap_i[row];
1295 #endif
1296           ilen  = imat_ilen[row];
1297           mat_i = imat_i[row];
1298           mat_a = imat_a + mat_i*bs2;
1299           mat_j = imat_j + mat_i;
1300           max2 = rbuf2_i[ct1];
1301           for (l=0; l<max2; l++,ct2++) {
1302 #if defined (PETSC_USE_CTABLE)
1303 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1304 	    if (tcol) {
1305 #else
1306 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1307 #endif
1308               *mat_j++    = tcol - 1;
1309               /* *mat_a++= rbuf4_i[ct2]; */
1310               ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1311               mat_a      += bs2;
1312               ilen++;
1313             }
1314           }
1315           imat_ilen[row] = ilen;
1316         }
1317       }
1318     }
1319   }
1320   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1321   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1322   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1323   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1324   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1325 
1326   /* Restore the indices */
1327   for (i=0; i<ismax; i++) {
1328     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1329     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1330   }
1331 
1332   /* Destroy allocated memory */
1333 #if defined(PETSC_USE_CTABLE)
1334   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1335 #else
1336   ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
1337 #endif
1338   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1339   ierr = PetscFree(pa);CHKERRQ(ierr);
1340 
1341   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1342   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1343   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1344   for (i=0; i<nrqr; ++i) {
1345     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1346   }
1347   for (i=0; i<nrqs; ++i) {
1348     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1349     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1350   }
1351   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1352   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1353   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1354   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1355   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1356   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1357   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1358 
1359 #if defined (PETSC_USE_CTABLE)
1360   for (i=0; i<ismax; i++){
1361     ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
1362     ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
1363   }
1364   ierr = PetscFree(colmaps);CHKERRQ(ierr);
1365   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
1366 #else
1367   ierr = PetscFree(rmap);CHKERRQ(ierr);
1368   ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
1369   ierr = PetscFree(cmap);CHKERRQ(ierr);
1370 #endif
1371   ierr = PetscFree(lens);CHKERRQ(ierr);
1372 
1373   for (i=0; i<ismax; i++) {
1374     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1375     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380