xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision fca92195c093e8a5faf0c844190ed016dcbb215a)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/baij/mpi/mpibaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
23   IS             *is_new;
24 
25   PetscFunctionBegin;
26   ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
27   /* Convert the indices into block format */
28   ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
29   if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
30   for (i=0; i<ov; ++i) {
31     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
32   }
33   for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
34   ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
35   for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
36   ierr = PetscFree(is_new);CHKERRQ(ierr);
37   PetscFunctionReturn(0);
38 }
39 
40 /*
41   Sample message format:
42   If a processor A wants processor B to process some elements corresponding
43   to index sets is[1], is[5]
44   mesg [0] = 2   (no of index sets in the mesg)
45   -----------
46   mesg [1] = 1 => is[1]
47   mesg [2] = sizeof(is[1]);
48   -----------
49   mesg [5] = 5  => is[5]
50   mesg [6] = sizeof(is[5]);
51   -----------
52   mesg [7]
53   mesg [n]  data(is[1])
54   -----------
55   mesg[n+1]
56   mesg[m]  data(is[5])
57   -----------
58 
59   Notes:
60   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
62 */
63 #undef __FUNCT__
64 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
/*
   MatIncreaseOverlap_MPIBAIJ_Once - grows each index set is[] by one level of
   overlap (in block-index space).  Rows owned locally are expanded by
   MatIncreaseOverlap_MPIBAIJ_Local(); rows owned elsewhere are packed into
   request messages (format described above), processed remotely by
   MatIncreaseOverlap_MPIBAIJ_Receive(), and the replies are merged back in.
   Each is[i] is destroyed and replaced by the enlarged set.
*/
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*w3,*w4,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,*w2,*w1,nrqr;
  PetscInt       Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscMPIInt    *onodes1,*olengths1,*onodes2,*olengths2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  Mbs    = c->Mbs;

  /* tag1: requests (phase 1), tag2: replies (phase 2) */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax+1,const PetscInt*,&idx,imax,PetscInt,&n,Mbs,PetscInt,&rtable);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc*/
  /* rtable[block row] = owning rank, built from the ownership ranges */
  for (i=0,j=0; i<size; i++) {
    len = c->rangebs[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total indices destined for proc p; w2[p]: 1 iff a message goes to p;
     w3[p]: number of index sets contributing to the message for p */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is one contiguous slab of length msz; outdat[p] points at proc p's
     slice of it, ptr[p] is the running fill pointer within that slice */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work*/
  /* A single allocation carved into: table[] (bit tables, one per IS),
     data[] (result index arrays, each sized Mbs), isz[] (result counts),
     followed by the actual d_p/t_p storage they point into */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should change to use PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (Mbs)*imax*sizeof(PetscInt)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt  *)(data  + imax);
    d_p  = (PetscInt  *)(isz   + imax);
    t_p  = (char *)(d_p   + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        }
        else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* The input sets are consumed here; new ones are created at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the received requests; xdata[i]/isz1[i] become the reply for
     requester i (xdata is one contiguous allocation anchored at xdata[0]) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    /* NOTE(review): rw1 is PetscMPIInt* but is allocated/zeroed with
       sizeof(PetscInt) - harmless over-allocation when PetscInt is wider,
       but the types should agree; verify against PetscGatherMessageLengths */
    ierr = PetscMalloc(size*sizeof(PetscInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  /* Replies have the same header layout as requests; merge every returned
     row into the matching local table/data, skipping duplicates */
  {
    PetscMPIInt idex;
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* data begins right after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Build the enlarged output sets from the merged data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
  }


  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
356 /*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
358        the work on the local processor.
359 
360      Inputs:
361       C      - MAT_MPIBAIJ;
362       imax - total no of index sets processed at a time;
363       table  - an array of char - size = Mbs bits.
364 
365      Output:
366       isz    - array containing the count of the solution elements corresponding
367                to each index set;
368       data   - pointer to the solutions
369 */
370 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
371 {
372   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
373   Mat         A = c->A,B = c->B;
374   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
375   PetscInt    start,end,val,max,rstart,cstart,*ai,*aj;
376   PetscInt    *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
377   PetscBT     table_i;
378 
379   PetscFunctionBegin;
380   rstart = c->rstartbs;
381   cstart = c->cstartbs;
382   ai     = a->i;
383   aj     = a->j;
384   bi     = b->i;
385   bj     = b->j;
386   garray = c->garray;
387 
388 
389   for (i=0; i<imax; i++) {
390     data_i  = data[i];
391     table_i = table[i];
392     isz_i   = isz[i];
393     for (j=0,max=isz[i]; j<max; j++) {
394       row   = data_i[j] - rstart;
395       start = ai[row];
396       end   = ai[row+1];
397       for (k=start; k<end; k++) { /* Amat */
398         val = aj[k] + cstart;
399         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
400       }
401       start = bi[row];
402       end   = bi[row+1];
403       for (k=start; k<end; k++) { /* Bmat */
404         val = garray[bj[k]];
405         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
406       }
407     }
408     isz[i] = isz_i;
409   }
410   PetscFunctionReturn(0);
411 }
412 #undef __FUNCT__
413 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
414 /*
      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
416          and return the output
417 
418          Input:
419            C    - the matrix
420            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
422 
423          Output:
424            xdata - array of messages to be sent back
425            isz1  - size of each message
426 
427   For better efficiency perhaps we should malloc separately each xdata[i],
428 then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
430 memory is used.
431 
432 */
static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstartbs;
  cstart = c->cstartbs;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct: total number of IS headers across all requests;
     total_sz: total number of requested rows across all requests */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* Estimate the reply size from the average row length; the buffer grows
     on demand below if the estimate is too small */
  if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* All replies share one contiguous allocation anchored at xdata[0];
     xdata[i] for i>0 are interior pointers set as each reply is finished */
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3: total entries written into the shared xdata slab so far (used to
     detect when the slab must be grown) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor into the request data */
    ct2    =  ct1;          /* write cursor into this reply */
    ct3    += ct1;          /* reserve room for this reply's header */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh duplicate-filter for every IS */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* remember where this IS's reply data starts */
      kmax = rbuf_i[2*j];
      /* First pass: copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* Grow the shared slab by 1.5x, then rebase every xdata[ctr]
               interior pointer onto the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Second pass: expand each copied row by its block-column indices,
         i.e. do one level of overlap on behalf of the requester */
      for (k=oct2,max2=ct2; k<max2; k++)  {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal portion: shift to global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal portion: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
550 
551 static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
552 
553 #undef __FUNCT__
554 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
555 PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
556 {
557   IS             *isrow_new,*iscol_new;
558   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
559   PetscErrorCode ierr;
560   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
561 
562   PetscFunctionBegin;
563   /* The compression and expansion should be avoided. Does'nt point
564      out errors might change the indices hence buggey */
565 
566   ierr = PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);CHKERRQ(ierr);
567   ierr = ISCompressIndicesGeneral(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
568   ierr = ISCompressIndicesGeneral(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);
569 
570   /* Allocate memory to hold all the submatrices */
571   if (scall != MAT_REUSE_MATRIX) {
572     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
573   }
574   /* Determine the number of stages through which submatrices are done */
575   nmax          = 20*1000000 / (c->Nbs * sizeof(PetscInt));
576   if (!nmax) nmax = 1;
577   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
578 
579   /* Make sure every processor loops through the nstages */
580   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
581   for (i=0,pos=0; i<nstages; i++) {
582     if (pos+nmax <= ismax) max_no = nmax;
583     else if (pos == ismax) max_no = 0;
584     else                   max_no = ismax-pos;
585     ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
586     pos += max_no;
587   }
588 
589   for (i=0; i<ismax; i++) {
590     ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
591     ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
592   }
593   ierr = PetscFree2(isrow_new,iscol_new);CHKERRQ(ierr);
594   PetscFunctionReturn(0);
595 }
596 
597 #if defined (PETSC_USE_CTABLE)
598 #undef __FUNCT__
599 #define __FUNCT__ "PetscGetProc"
600 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
601 {
602   PetscInt    nGlobalNd = proc_gnode[size];
603   PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
604 
605   PetscFunctionBegin;
606   if (fproc > size) fproc = size;
607   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
608     if (row < proc_gnode[fproc]) fproc--;
609     else                         fproc++;
610   }
611   *rank = fproc;
612   PetscFunctionReturn(0);
613 }
614 #endif
615 
616 /* -------------------------------------------------------------------------*/
617 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
618 #undef __FUNCT__
619 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
620 static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
621 {
622   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
623   Mat            A = c->A;
624   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
625   const PetscInt **irow,**icol,*irow_i;
626   PetscInt       *nrow,*ncol,*w3,*w4,start;
627   PetscErrorCode ierr;
628   PetscMPIInt    size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
629   PetscInt       **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
630   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
631   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
632   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
633   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
634   PetscInt       bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
635   PetscInt       cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
636   PetscInt       *bmap = c->garray,ctmp,rstart=c->rstartbs;
637   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
638   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
639   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
640   MPI_Status     *r_status3,*r_status4,*s_status4;
641   MPI_Comm       comm;
642   MatScalar      **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
643   MatScalar      *a_a=a->a,*b_a=b->a;
644   PetscTruth     flag;
645   PetscMPIInt    *onodes1,*olengths1;
646 
647 #if defined (PETSC_USE_CTABLE)
648   PetscInt       tt;
649   PetscTable     *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
650 #else
651   PetscInt       **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
652 #endif
653 
654   PetscFunctionBegin;
655   comm   = ((PetscObject)C)->comm;
656   tag0   = ((PetscObject)C)->tag;
657   size   = c->size;
658   rank   = c->rank;
659 
660   /* Get some new tags to keep the communication clean */
661   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
662   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
663   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
664 
665 #if defined(PETSC_USE_CTABLE)
666   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
667 #else
668   ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr);
669   /* Create hash table for the mapping :row -> proc*/
670   for (i=0,j=0; i<size; i++) {
671     jmax = c->rowners[i+1];
672     for (; j<jmax; j++) {
673       rtable[j] = i;
674     }
675   }
676 #endif
677 
678   for (i=0; i<ismax; i++) {
679     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
680     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
681     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
682     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
683   }
684 
685   /* evaluate communication - mesg to who,length of mesg,and buffer space
686      required. Based on this, buffers are allocated, and data copied into them*/
687   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
688   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
689   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
690   ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);
691   for (i=0; i<ismax; i++) {
692     ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/
693     jmax   = nrow[i];
694     irow_i = irow[i];
695     for (j=0; j<jmax; j++) {
696       row  = irow_i[j];
697 #if defined (PETSC_USE_CTABLE)
698       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
699 #else
700       proc = rtable[row];
701 #endif
702       w4[proc]++;
703     }
704     for (j=0; j<size; j++) {
705       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
706     }
707   }
708 
709   nrqs     = 0;              /* no of outgoing messages */
710   msz      = 0;              /* total mesg length for all proc */
711   w1[rank] = 0;              /* no mesg sent to intself */
712   w3[rank] = 0;
713   for (i=0; i<size; i++) {
714     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
715   }
716   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
717   for (i=0,j=0; i<size; i++) {
718     if (w1[i]) { pa[j] = i; j++; }
719   }
720 
721   /* Each message would have a header = 1 + 2*(no of IS) + data */
722   for (i=0; i<nrqs; i++) {
723     j     = pa[i];
724     w1[j] += w2[j] + 2* w3[j];
725     msz   += w1[j];
726   }
727 
728   /* Determine the number of messages to expect, their lengths, from from-ids */
729   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
730   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
731 
732   /* Now post the Irecvs corresponding to these messages */
733   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
734 
735   ierr = PetscFree(onodes1);CHKERRQ(ierr);
736   ierr = PetscFree(olengths1);CHKERRQ(ierr);
737 
738   /* Allocate Memory for outgoing messages */
739   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
740   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
741   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
742   {
743     PetscInt *iptr = tmp,ict = 0;
744     for (i=0; i<nrqs; i++) {
745       j         = pa[i];
746       iptr     += ict;
747       sbuf1[j]  = iptr;
748       ict       = w1[j];
749     }
750   }
751 
752   /* Form the outgoing messages */
753   /* Initialise the header space */
754   for (i=0; i<nrqs; i++) {
755     j           = pa[i];
756     sbuf1[j][0] = 0;
757     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
758     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
759   }
760 
761   /* Parse the isrow and copy data into outbuf */
762   for (i=0; i<ismax; i++) {
763     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
764     irow_i = irow[i];
765     jmax   = nrow[i];
766     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
767       row  = irow_i[j];
768 #if defined (PETSC_USE_CTABLE)
769       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
770 #else
771       proc = rtable[row];
772 #endif
773       if (proc != rank) { /* copy to the outgoing buf*/
774         ctr[proc]++;
775         *ptr[proc] = row;
776         ptr[proc]++;
777       }
778     }
779     /* Update the headers for the current IS */
780     for (j=0; j<size; j++) { /* Can Optimise this loop too */
781       if ((ctr_j = ctr[j])) {
782         sbuf1_j        = sbuf1[j];
783         k              = ++sbuf1_j[0];
784         sbuf1_j[2*k]   = ctr_j;
785         sbuf1_j[2*k-1] = i;
786       }
787     }
788   }
789 
790   /*  Now  post the sends */
791   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
792   for (i=0; i<nrqs; ++i) {
793     j = pa[i];
794     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
795   }
796 
797   /* Post Recieves to capture the buffer size */
798   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
799   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
800   rbuf2[0] = tmp + msz;
801   for (i=1; i<nrqs; ++i) {
802     j        = pa[i];
803     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
804   }
805   for (i=0; i<nrqs; ++i) {
806     j    = pa[i];
807     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
808   }
809 
810   /* Send to other procs the buf size they should allocate */
811 
812   /* Receive messages */
813   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
814   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
815   ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
816   {
817     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
818     PetscInt    *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
819 
820     for (i=0; i<nrqr; ++i) {
821       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
822       req_size[idex] = 0;
823       rbuf1_i         = rbuf1[idex];
824       start           = 2*rbuf1_i[0] + 1;
825       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
826       ierr            = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
827       sbuf2_i         = sbuf2[idex];
828       for (j=start; j<end; j++) {
829         id               = rbuf1_i[j] - rstart;
830         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
831         sbuf2_i[j]       = ncols;
832         req_size[idex] += ncols;
833       }
834       req_source[idex] = r_status1[i].MPI_SOURCE;
835       /* form the header */
836       sbuf2_i[0]   = req_size[idex];
837       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
838       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
839     }
840   }
841   ierr = PetscFree(r_status1);CHKERRQ(ierr);
842   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
843 
844   /*  recv buffer sizes */
845   /* Receive messages */
846 
847   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
848   ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
849   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
850   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
851   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
852 
853   for (i=0; i<nrqs; ++i) {
854     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
855     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
856     ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
857     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
858     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
859   }
860   ierr = PetscFree(r_status2);CHKERRQ(ierr);
861   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
862 
863   /* Wait on sends1 and sends2 */
864   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
865   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
866 
867   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
868   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
869   ierr = PetscFree(s_status1);CHKERRQ(ierr);
870   ierr = PetscFree(s_status2);CHKERRQ(ierr);
871   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
872   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
873 
874   /* Now allocate buffers for a->j, and send them off */
875   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
876   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
877   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
878   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
879 
880   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
881   {
882      for (i=0; i<nrqr; i++) {
883       rbuf1_i   = rbuf1[i];
884       sbuf_aj_i = sbuf_aj[i];
885       ct1       = 2*rbuf1_i[0] + 1;
886       ct2       = 0;
887       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
888         kmax = rbuf1[i][2*j];
889         for (k=0; k<kmax; k++,ct1++) {
890           row    = rbuf1_i[ct1] - rstart;
891           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
892           ncols  = nzA + nzB;
893           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
894 
895           /* load the column indices for this row into cols*/
896           cols  = sbuf_aj_i + ct2;
897           for (l=0; l<nzB; l++) {
898             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
899             else break;
900           }
901           imark = l;
902           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
903           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
904           ct2 += ncols;
905         }
906       }
907       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
908     }
909   }
910   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
911   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
912 
913   /* Allocate buffers for a->a, and send them off */
914   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
915   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
916   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
917   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
918 
919   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
920   {
921     for (i=0; i<nrqr; i++) {
922       rbuf1_i   = rbuf1[i];
923       sbuf_aa_i = sbuf_aa[i];
924       ct1       = 2*rbuf1_i[0]+1;
925       ct2       = 0;
926       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
927         kmax = rbuf1_i[2*j];
928         for (k=0; k<kmax; k++,ct1++) {
929           row    = rbuf1_i[ct1] - rstart;
930           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
931           ncols  = nzA + nzB;
932           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
933           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
934 
935           /* load the column values for this row into vals*/
936           vals  = sbuf_aa_i+ct2*bs2;
937           for (l=0; l<nzB; l++) {
938             if ((bmap[cworkB[l]]) < cstart) {
939               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
940             }
941             else break;
942           }
943           imark = l;
944           for (l=0; l<nzA; l++) {
945             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
946           }
947           for (l=imark; l<nzB; l++) {
948             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
949           }
950           ct2 += ncols;
951         }
952       }
953       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
954     }
955   }
956   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
957   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
958   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
959 
960   /* Form the matrix */
961   /* create col map */
962   {
963     const PetscInt *icol_i;
964 #if defined (PETSC_USE_CTABLE)
965     /* Create row map*/
966     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
967     for (i=0; i<ismax; i++) {
968       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
969     }
970 #else
971     ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
972     ierr    = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
973     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
974 #endif
975     for (i=0; i<ismax; i++) {
976       jmax   = ncol[i];
977       icol_i = icol[i];
978 #if defined (PETSC_USE_CTABLE)
979       lcol1_gcol1 = colmaps[i];
980       for (j=0; j<jmax; j++) {
981 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
982       }
983 #else
984       cmap_i = cmap[i];
985       for (j=0; j<jmax; j++) {
986         cmap_i[icol_i[j]] = j+1;
987       }
988 #endif
989     }
990   }
991 
992   /* Create lens which is required for MatCreate... */
993   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
994   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
995   lens[0] = (PetscInt*)(lens + ismax);
996   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
997   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
998 
999   /* Update lens from local data */
1000   for (i=0; i<ismax; i++) {
1001     jmax   = nrow[i];
1002 #if defined (PETSC_USE_CTABLE)
1003     lcol1_gcol1 = colmaps[i];
1004 #else
1005     cmap_i = cmap[i];
1006 #endif
1007     irow_i = irow[i];
1008     lens_i = lens[i];
1009     for (j=0; j<jmax; j++) {
1010       row  = irow_i[j];
1011 #if defined (PETSC_USE_CTABLE)
1012       ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
1013 #else
1014       proc = rtable[row];
1015 #endif
1016       if (proc == rank) {
1017         /* Get indices from matA and then from matB */
1018         row    = row - rstart;
1019         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1020         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
1021 #if defined (PETSC_USE_CTABLE)
1022        for (k=0; k<nzA; k++) {
1023 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
1024           if (tt) { lens_i[j]++; }
1025         }
1026         for (k=0; k<nzB; k++) {
1027 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
1028           if (tt) { lens_i[j]++; }
1029         }
1030 #else
1031         for (k=0; k<nzA; k++) {
1032           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1033         }
1034         for (k=0; k<nzB; k++) {
1035           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1036         }
1037 #endif
1038       }
1039     }
1040   }
1041 #if defined (PETSC_USE_CTABLE)
1042   /* Create row map*/
1043   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
1044   for (i=0; i<ismax; i++){
1045     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
1046   }
1047 #else
1048   /* Create row map*/
1049   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
1050   rmap[0] = (PetscInt*)(rmap + ismax);
1051   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr);
1052   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1053 #endif
1054   for (i=0; i<ismax; i++) {
1055     irow_i = irow[i];
1056     jmax   = nrow[i];
1057 #if defined (PETSC_USE_CTABLE)
1058     lrow1_grow1 = rowmaps[i];
1059     for (j=0; j<jmax; j++) {
1060       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
1061     }
1062 #else
1063     rmap_i = rmap[i];
1064     for (j=0; j<jmax; j++) {
1065       rmap_i[irow_i[j]] = j;
1066     }
1067 #endif
1068   }
1069 
1070   /* Update lens from offproc data */
1071   {
1072     PetscInt    *rbuf2_i,*rbuf3_i,*sbuf1_i;
1073     PetscMPIInt ii;
1074 
1075     for (tmp2=0; tmp2<nrqs; tmp2++) {
1076       ierr    = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr);
1077       idex   = pa[ii];
1078       sbuf1_i = sbuf1[idex];
1079       jmax    = sbuf1_i[0];
1080       ct1     = 2*jmax+1;
1081       ct2     = 0;
1082       rbuf2_i = rbuf2[ii];
1083       rbuf3_i = rbuf3[ii];
1084       for (j=1; j<=jmax; j++) {
1085         is_no   = sbuf1_i[2*j-1];
1086         max1    = sbuf1_i[2*j];
1087         lens_i  = lens[is_no];
1088 #if defined (PETSC_USE_CTABLE)
1089 	lcol1_gcol1 = colmaps[is_no];
1090 	lrow1_grow1 = rowmaps[is_no];
1091 #else
1092         cmap_i  = cmap[is_no];
1093 	rmap_i  = rmap[is_no];
1094 #endif
1095         for (k=0; k<max1; k++,ct1++) {
1096 #if defined (PETSC_USE_CTABLE)
1097 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1098           row--;
1099           if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1100 #else
1101           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1102 #endif
1103           max2 = rbuf2_i[ct1];
1104           for (l=0; l<max2; l++,ct2++) {
1105 #if defined (PETSC_USE_CTABLE)
1106 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
1107 	    if (tt) {
1108               lens_i[row]++;
1109             }
1110 #else
1111             if (cmap_i[rbuf3_i[ct2]]) {
1112               lens_i[row]++;
1113             }
1114 #endif
1115           }
1116         }
1117       }
1118     }
1119   }
1120   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1121   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1122   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1123   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1124   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1125 
1126   /* Create the submatrices */
1127   if (scall == MAT_REUSE_MATRIX) {
1128     /*
1129         Assumes new rows are same length as the old rows, hence bug!
1130     */
1131     for (i=0; i<ismax; i++) {
1132       mat = (Mat_SeqBAIJ *)(submats[i]->data);
1133       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) {
1134         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1135       }
1136       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
1137       if (!flag) {
1138         SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1139       }
1140       /* Initial matrix as if empty */
1141       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr);
1142       submats[i]->factor = C->factor;
1143     }
1144   } else {
1145     for (i=0; i<ismax; i++) {
1146       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1147       ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr);
1148       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1149       ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
1150       ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr);
1151     }
1152   }
1153 
1154   /* Assemble the matrices */
1155   /* First assemble the local rows */
1156   {
1157     PetscInt       ilen_row,*imat_ilen,*imat_j,*imat_i;
1158     MatScalar *imat_a;
1159 
1160     for (i=0; i<ismax; i++) {
1161       mat       = (Mat_SeqBAIJ*)submats[i]->data;
1162       imat_ilen = mat->ilen;
1163       imat_j    = mat->j;
1164       imat_i    = mat->i;
1165       imat_a    = mat->a;
1166 
1167 #if defined (PETSC_USE_CTABLE)
1168       lcol1_gcol1 = colmaps[i];
1169       lrow1_grow1 = rowmaps[i];
1170 #else
1171       cmap_i    = cmap[i];
1172       rmap_i    = rmap[i];
1173 #endif
1174       irow_i    = irow[i];
1175       jmax      = nrow[i];
1176       for (j=0; j<jmax; j++) {
1177         row      = irow_i[j];
1178 #if defined (PETSC_USE_CTABLE)
1179 	ierr = PetscGetProc(row,size,c->rangebs,&proc);CHKERRQ(ierr);
1180 #else
1181 	proc = rtable[row];
1182 #endif
1183         if (proc == rank) {
1184           row      = row - rstart;
1185           nzA      = a_i[row+1] - a_i[row];
1186           nzB      = b_i[row+1] - b_i[row];
1187           cworkA   = a_j + a_i[row];
1188           cworkB   = b_j + b_i[row];
1189           vworkA   = a_a + a_i[row]*bs2;
1190           vworkB   = b_a + b_i[row]*bs2;
1191 #if defined (PETSC_USE_CTABLE)
1192 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
1193           row--;
1194           if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1195 #else
1196           row      = rmap_i[row + rstart];
1197 #endif
1198           mat_i    = imat_i[row];
1199           mat_a    = imat_a + mat_i*bs2;
1200           mat_j    = imat_j + mat_i;
1201           ilen_row = imat_ilen[row];
1202 
1203           /* load the column indices for this row into cols*/
1204           for (l=0; l<nzB; l++) {
1205 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
1206 #if defined (PETSC_USE_CTABLE)
1207 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
1208 	      if (tcol) {
1209 #else
1210               if ((tcol = cmap_i[ctmp])) {
1211 #endif
1212                 *mat_j++ = tcol - 1;
1213                 ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1214                 mat_a   += bs2;
1215                 ilen_row++;
1216               }
1217             } else break;
1218           }
1219           imark = l;
1220           for (l=0; l<nzA; l++) {
1221 #if defined (PETSC_USE_CTABLE)
1222 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
1223 	    if (tcol) {
1224 #else
1225 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
1226 #endif
1227               *mat_j++ = tcol - 1;
1228               ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1229               mat_a   += bs2;
1230               ilen_row++;
1231             }
1232           }
1233           for (l=imark; l<nzB; l++) {
1234 #if defined (PETSC_USE_CTABLE)
1235 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
1236 	    if (tcol) {
1237 #else
1238             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1239 #endif
1240               *mat_j++ = tcol - 1;
1241               ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1242               mat_a   += bs2;
1243               ilen_row++;
1244             }
1245           }
1246           imat_ilen[row] = ilen_row;
1247         }
1248       }
1249 
1250     }
1251   }
1252 
1253   /*   Now assemble the off proc rows*/
1254   {
1255     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1256     PetscInt    *imat_j,*imat_i;
1257     MatScalar   *imat_a,*rbuf4_i;
1258     PetscMPIInt ii;
1259 
1260     for (tmp2=0; tmp2<nrqs; tmp2++) {
1261       ierr    = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr);
1262       idex   = pa[ii];
1263       sbuf1_i = sbuf1[idex];
1264       jmax    = sbuf1_i[0];
1265       ct1     = 2*jmax + 1;
1266       ct2     = 0;
1267       rbuf2_i = rbuf2[ii];
1268       rbuf3_i = rbuf3[ii];
1269       rbuf4_i = rbuf4[ii];
1270       for (j=1; j<=jmax; j++) {
1271         is_no     = sbuf1_i[2*j-1];
1272 #if defined (PETSC_USE_CTABLE)
1273 	lrow1_grow1 = rowmaps[is_no];
1274 	lcol1_gcol1 = colmaps[is_no];
1275 #else
1276         rmap_i    = rmap[is_no];
1277         cmap_i    = cmap[is_no];
1278 #endif
1279         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
1280         imat_ilen = mat->ilen;
1281         imat_j    = mat->j;
1282         imat_i    = mat->i;
1283         imat_a    = mat->a;
1284         max1      = sbuf1_i[2*j];
1285         for (k=0; k<max1; k++,ct1++) {
1286           row   = sbuf1_i[ct1];
1287 #if defined (PETSC_USE_CTABLE)
1288 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
1289           row--;
1290           if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1291 #else
1292           row   = rmap_i[row];
1293 #endif
1294           ilen  = imat_ilen[row];
1295           mat_i = imat_i[row];
1296           mat_a = imat_a + mat_i*bs2;
1297           mat_j = imat_j + mat_i;
1298           max2 = rbuf2_i[ct1];
1299           for (l=0; l<max2; l++,ct2++) {
1300 #if defined (PETSC_USE_CTABLE)
1301 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1302 	    if (tcol) {
1303 #else
1304 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1305 #endif
1306               *mat_j++    = tcol - 1;
1307               /* *mat_a++= rbuf4_i[ct2]; */
1308               ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1309               mat_a      += bs2;
1310               ilen++;
1311             }
1312           }
1313           imat_ilen[row] = ilen;
1314         }
1315       }
1316     }
1317   }
1318   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1319   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1320   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1321   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1322   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1323 
1324   /* Restore the indices */
1325   for (i=0; i<ismax; i++) {
1326     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1327     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1328   }
1329 
1330   /* Destroy allocated memory */
1331 #if defined(PETSC_USE_CTABLE)
1332   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1333 #else
1334   ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
1335 #endif
1336   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1337   ierr = PetscFree(pa);CHKERRQ(ierr);
1338 
1339   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1340   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1341   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1342   for (i=0; i<nrqr; ++i) {
1343     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1344   }
1345   for (i=0; i<nrqs; ++i) {
1346     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1347     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1348   }
1349   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1350   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1351   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1352   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1353   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1354   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1355   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1356 
1357 #if defined (PETSC_USE_CTABLE)
1358   for (i=0; i<ismax; i++){
1359     ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr);
1360     ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr);
1361   }
1362   ierr = PetscFree(colmaps);CHKERRQ(ierr);
1363   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
1364 #else
1365   ierr = PetscFree(rmap);CHKERRQ(ierr);
1366   ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
1367   ierr = PetscFree(cmap);CHKERRQ(ierr);
1368 #endif
1369   ierr = PetscFree(lens);CHKERRQ(ierr);
1370 
1371   for (i=0; i<ismax; i++) {
1372     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1373     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1374   }
1375   PetscFunctionReturn(0);
1376 }
1377 
1378