xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision 4482741e5b2e2bbc854fb1f8dba65221386520f2)
1 /*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/
2 
3 /*
4    Routines to compute overlapping regions of a parallel MPI matrix
5    and to find submatrices that are shared across processors.
6 */
7 #include "src/mat/impls/baij/mpi/mpibaij.h"
8 #include "petscbt.h"
9 
10 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
11 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
12 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
13 EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
14 EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
18 int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
19 {
20   Mat_MPIBAIJ  *c = (Mat_MPIBAIJ*)C->data;
21   int          i,ierr,N=C->N, bs=c->bs;
22   IS           *is_new;
23 
24   PetscFunctionBegin;
25   ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
26   /* Convert the indices into block format */
27   ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
28   if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
31   }
32   for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
33   ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
34   for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
35   ierr = PetscFree(is_new);CHKERRQ(ierr);
36   PetscFunctionReturn(0);
37 }
38 
39 /*
40   Sample message format:
41   If a processor A wants processor B to process some elements corresponding
42   to index sets is[1] and is[5], the message looks like:
43   mesg [0] = 2   (no of index sets in the mesg)
44   -----------
45   mesg [1] = 1 => is[1]
46   mesg [2] = sizeof(is[1]);
47   -----------
48   mesg [3] = 5  => is[5]
49   mesg [4] = sizeof(is[5]);
50   -----------
51   mesg [5]
52   mesg [n]  data(is[1])
53   -----------
54   mesg[n+1]
55   mesg[m]  data(is[5])
56   -----------
57 
58   Notes:
59   nrqs - no of requests sent (or to be sent out)
60   nrqr - no of requests received (which have to be, or which have been, processed)
61 */
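/*
   For illustration only (hypothetical values, not produced by any particular run):
   suppose is[1] contributes the block rows {10,11,12} and is[5] the block rows
   {40,41} destined for processor B.  The outgoing buffer then contains the
   10 integers

       mesg = { 2,  1, 3,  5, 2,  10, 11, 12,  40, 41 };

   i.e. a header of 1 + 2*(no of IS) = 5 integers followed by the row data,
   which matches the message length computed as w1[j] += w2[j] + 2*w3[j] below.
*/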
62 #undef __FUNCT__
63 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
64 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
65 {
66   Mat_MPIBAIJ  *c = (Mat_MPIBAIJ*)C->data;
67   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
68   int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
69   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
70   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
71   PetscBT     *table;
72   MPI_Comm    comm;
73   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
74   MPI_Status  *s_status,*recv_status;
75 
76   PetscFunctionBegin;
77   comm   = C->comm;
78   size   = c->size;
79   rank   = c->rank;
80   Mbs    = c->Mbs;
81 
82   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
83   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
84 
85   len    = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
86   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
87   n      = (int*)(idx + imax);
88   rtable = n + imax;
89 
90   for (i=0; i<imax; i++) {
91     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
92     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
93   }
94 
95   /* Create a lookup table for the mapping: block row -> owning proc */
96   for (i=0,j=0; i<size; i++) {
97     len = c->rowners[i+1];
98     for (; j<len; j++) {
99       rtable[j] = i;
100     }
101   }
102 
103   /* Evaluate communication - mesg to who, length of mesg, and buffer space
104      required. Based on this, buffers are allocated and data is copied into them */
105   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
106   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
107   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
108   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
109   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
110   for (i=0; i<imax; i++) {
111     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
112     idx_i = idx[i];
113     len   = n[i];
114     for (j=0; j<len; j++) {
115       row  = idx_i[j];
116       if (row < 0) {
117         SETERRQ(1,"Index set cannot have negative entries");
118       }
119       proc = rtable[row];
120       w4[proc]++;
121     }
122     for (j=0; j<size; j++){
123       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
124     }
125   }
126 
127   nrqs     = 0;              /* no of outgoing messages */
128   msz      = 0;              /* total mesg length (for all procs) */
129   w1[rank] = 0;              /* no mesg sent to itself */
130   w3[rank] = 0;
131   for (i=0; i<size; i++) {
132     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
133   }
134   /* pa - is list of processors to communicate with */
135   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
136   for (i=0,j=0; i<size; i++) {
137     if (w1[i]) {pa[j] = i; j++;}
138   }
139 
140   /* Each message would have a header = 1 + 2*(no of IS) + data */
141   for (i=0; i<nrqs; i++) {
142     j      = pa[i];
143     w1[j] += w2[j] + 2*w3[j];
144     msz   += w1[j];
145   }
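  /*
     Hypothetical example (for illustration only): if two index sets contribute
     7 and 3 rows respectively to the message for proc j, then w3[j] = 2, the
     data part is 10 integers, and
         w1[j] = 10 + w2[j] + 2*w3[j] = 10 + 1 + 4 = 15 integers in total.
  */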
146 
147   /* Determine the number of messages to expect, their lengths, and their from-ids */
148   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
149   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
150 
151   /* Now post the Irecvs corresponding to these messages */
152   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
153 
154   /* Allocate Memory for outgoing messages */
155   len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
156   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
157   ptr    = outdat + size;     /* Pointers to the data in outgoing buffers */
158   ierr   = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
159   tmp    = (int*)(outdat + 2*size);
160   ctr    = tmp + msz;
161 
162   {
163     int *iptr = tmp,ict  = 0;
164     for (i=0; i<nrqs; i++) {
165       j         = pa[i];
166       iptr     +=  ict;
167       outdat[j] = iptr;
168       ict       = w1[j];
169     }
170   }
171 
172   /* Form the outgoing messages */
173   /*plug in the headers*/
174   for (i=0; i<nrqs; i++) {
175     j            = pa[i];
176     outdat[j][0] = 0;
177     ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
178     ptr[j]       = outdat[j] + 2*w3[j] + 1;
179   }
180 
181   /* Memory for doing local proc's work*/
182   {
183     int  *d_p;
184     char *t_p;
185 
186     len  = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
187       (Mbs)*imax*sizeof(int)  + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
188     ierr = PetscMalloc(len,&table);CHKERRQ(ierr);
189     ierr = PetscMemzero(table,len);CHKERRQ(ierr);
190     data = (int **)(table + imax);
191     isz  = (int  *)(data  + imax);
192     d_p  = (int  *)(isz   + imax);
193     t_p  = (char *)(d_p   + Mbs*imax);
194     for (i=0; i<imax; i++) {
195       table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
196       data[i]  = d_p + (Mbs)*i;
197     }
198   }
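  /*
     Layout of the single allocation above (one contiguous block):
       table[0..imax-1]  - PetscBT bit tables, one per index set
       data[0..imax-1]   - integer arrays, one per index set
       isz[0..imax-1]    - current length of each data[i]
       d_p               - Mbs*imax ints of storage backing the data[] arrays
       t_p               - the bitmap storage backing the table[] entries
  */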
199 
200   /* Parse the IS and update local tables and the outgoing buf with the data*/
201   {
202     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
203     PetscBT table_i;
204 
205     for (i=0; i<imax; i++) {
206       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
207       n_i     = n[i];
208       table_i = table[i];
209       idx_i   = idx[i];
210       data_i  = data[i];
211       isz_i   = isz[i];
212       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
213         row  = idx_i[j];
214         proc = rtable[row];
215         if (proc != rank) { /* copy to the outgoing buffer */
216           ctr[proc]++;
217           *ptr[proc] = row;
218           ptr[proc]++;
219         }
220         else { /* Update the local table */
221           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
222         }
223       }
224       /* Update the headers for the current IS */
225       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
226         if ((ctr_j = ctr[j])) {
227           outdat_j        = outdat[j];
228           k               = ++outdat_j[0];
229           outdat_j[2*k]   = ctr_j;
230           outdat_j[2*k-1] = i;
231         }
232       }
233       isz[i] = isz_i;
234     }
235   }
236 
237   /*  Now  post the sends */
238   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
239   for (i=0; i<nrqs; ++i) {
240     j    = pa[i];
241     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
242   }
243 
244   /* No longer need the original indices*/
245   for (i=0; i<imax; ++i) {
246     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
247   }
248   ierr = PetscFree(idx);CHKERRQ(ierr);
249 
250   for (i=0; i<imax; ++i) {
251     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
252   }
253 
254   /* Do Local work*/
255   ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
256 
257   /* Receive messages*/
258   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
259   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
260 
261   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
262   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
263 
264   /* Phase 1 sends are complete - deallocate buffers */
265   ierr = PetscFree(outdat);CHKERRQ(ierr);
266   ierr = PetscFree(w1);CHKERRQ(ierr);
267 
268   ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr);
269   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
270   ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
271   ierr = PetscFree(rbuf);CHKERRQ(ierr);
272 
273   /* Send the data back*/
274   /* Do a global reduction to know the buffer space req for incoming messages*/
275   {
276     int *rw1;
277 
278     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
279     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
280 
281     for (i=0; i<nrqr; ++i) {
282       proc      = recv_status[i].MPI_SOURCE;
283       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
284       rw1[proc] = isz1[i];
285     }
286 
287     ierr = PetscFree(onodes1);CHKERRQ(ierr);
288     ierr = PetscFree(olengths1);CHKERRQ(ierr);
289 
290     /* Determine the number of messages to expect, their lengths, and their from-ids */
291     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
292     PetscFree(rw1);
293   }
294   /* Now post the Irecvs corresponding to these messages */
295   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
296 
297   /*  Now  post the sends */
298   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
299   for (i=0; i<nrqr; ++i) {
300     j    = recv_status[i].MPI_SOURCE;
301     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
302   }
303 
304   /* receive work done on other processors*/
305   {
306     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
307     PetscBT     table_i;
308     MPI_Status  *status2;
309 
310     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
311 
312     for (i=0; i<nrqs; ++i) {
313       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
314       /* Process the message*/
315       rbuf2_i = rbuf2[idex];
316       ct1     = 2*rbuf2_i[0]+1;
317       jmax    = rbuf2[idex][0];
318       for (j=1; j<=jmax; j++) {
319         max     = rbuf2_i[2*j];
320         is_no   = rbuf2_i[2*j-1];
321         isz_i   = isz[is_no];
322         data_i  = data[is_no];
323         table_i = table[is_no];
324         for (k=0; k<max; k++,ct1++) {
325           row = rbuf2_i[ct1];
326           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
327         }
328         isz[is_no] = isz_i;
329       }
330     }
331     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
332     ierr = PetscFree(status2);CHKERRQ(ierr);
333   }
334 
335   for (i=0; i<imax; ++i) {
336     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
337   }
338 
339 
340   ierr = PetscFree(onodes2);CHKERRQ(ierr);
341   ierr = PetscFree(olengths2);CHKERRQ(ierr);
342 
343   ierr = PetscFree(pa);CHKERRQ(ierr);
344   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
345   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
346   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
347   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
348   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
349   ierr = PetscFree(table);CHKERRQ(ierr);
350   ierr = PetscFree(s_status);CHKERRQ(ierr);
351   ierr = PetscFree(recv_status);CHKERRQ(ierr);
352   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
353   ierr = PetscFree(xdata);CHKERRQ(ierr);
354   ierr = PetscFree(isz1);CHKERRQ(ierr);
355   PetscFunctionReturn(0);
356 }
357 
358 #undef __FUNCT__
359 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
360 /*
361    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once, to do
362        the work on the local processor.
363 
364      Inputs:
365       C      - the MPIBAIJ matrix;
366       imax   - total no of index sets processed at a time;
367       table  - an array of PetscBT bit tables, one per index set, each Mbs bits long.
368 
369      Output:
370       isz    - array containing the count of the solution elements corresponding
371                to each index set;
372       data   - pointer to the solutions
373 */
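/*
   Hypothetical example (for illustration only, assuming rstart = 0): suppose is[0]
   currently holds the local block rows {3,7}, block row 3 has nonzero blocks in
   (block) columns {3,5} and block row 7 in {2,7}.  After the routine below,
   data[0] = {3,7,5,2} and isz[0] = 4; the bit table prevents columns that are
   already in the set from being added twice.
*/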
374 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
375 {
376   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
377   Mat         A = c->A,B = c->B;
378   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
379   int         start,end,val,max,rstart,cstart,*ai,*aj;
380   int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
381   PetscBT     table_i;
382 
383   PetscFunctionBegin;
384   rstart = c->rstart;
385   cstart = c->cstart;
386   ai     = a->i;
387   aj     = a->j;
388   bi     = b->i;
389   bj     = b->j;
390   garray = c->garray;
391 
392 
393   for (i=0; i<imax; i++) {
394     data_i  = data[i];
395     table_i = table[i];
396     isz_i   = isz[i];
397     for (j=0,max=isz[i]; j<max; j++) {
398       row   = data_i[j] - rstart;
399       start = ai[row];
400       end   = ai[row+1];
401       for (k=start; k<end; k++) { /* Amat */
402         val = aj[k] + cstart;
403         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
404       }
405       start = bi[row];
406       end   = bi[row+1];
407       for (k=start; k<end; k++) { /* Bmat */
408         val = garray[bj[k]];
409         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
410       }
411     }
412     isz[i] = isz_i;
413   }
414   PetscFunctionReturn(0);
415 }
416 #undef __FUNCT__
417 #define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
418 /*
419       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
420          and return the output
421 
422          Input:
423            C    - the matrix
424            nrqr - no of messages being processed.
425            rbuf - an array of pointers to the received requests
426 
427          Output:
428            xdata - array of messages to be sent back
429            isz1  - size of each message
430 
431   For better efficiency perhaps we should malloc each xdata[i] separately;
432 then if a re-allocation is required we would only need to copy the data for
433 that one message rather than for all previous messages, as happens now where
434 a single large chunk of memory is used.
435 
436 */
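/*
   The reply placed in xdata[i] mirrors the header of the request in rbuf[i]:
   xdata[i][0] is the number of index sets, and each pair xdata[i][2*j-1],
   xdata[i][2*j] gives the index-set number and the (possibly enlarged) number
   of rows returned for it, followed by the row data itself.
   Hypothetical example (for illustration only): a request { 1,  5, 2,  10, 11 }
   asking to grow is[5] from the rows {10,11} might be answered with
   { 1,  5, 5,  10, 11, 4, 12, 30 }.
*/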
437 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
438 {
439   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
440   Mat         A = c->A,B = c->B;
441   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
442   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
443   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
444   int         val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
445   int         *rbuf_i,kmax,rbuf_0,ierr;
446   PetscBT     xtable;
447 
448   PetscFunctionBegin;
449   rank   = c->rank;
450   Mbs    = c->Mbs;
451   rstart = c->rstart;
452   cstart = c->cstart;
453   ai     = a->i;
454   aj     = a->j;
455   bi     = b->i;
456   bj     = b->j;
457   garray = c->garray;
458 
459 
460   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
461     rbuf_i  =  rbuf[i];
462     rbuf_0  =  rbuf_i[0];
463     ct     += rbuf_0;
464     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
465   }
466 
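  /*
     Estimate the size of the reply buffer: max1 approximates (no of requested
     rows) * (average no of nonzero blocks per local block row).  Hypothetical
     example (for illustration only): ct = 10 requested rows and
     a->nz + b->nz = 500 nonzero blocks over Mbs = 100 block rows give
     max1 = 10*500/100 = 50, and mem_estimate = 3*(max(total_sz,max1)+1).
     If the estimate proves too small the buffer is grown by a factor of 1.5 below.
  */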
467   if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
468   else        max1 = 1;
469   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
470   ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
471   ++no_malloc;
472   ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
473   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
474 
475   ct3 = 0;
476   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
477     rbuf_i =  rbuf[i];
478     rbuf_0 =  rbuf_i[0];
479     ct1    =  2*rbuf_0+1;
480     ct2    =  ct1;
481     ct3    += ct1;
482     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
483       ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
484       oct2 = ct2;
485       kmax = rbuf_i[2*j];
486       for (k=0; k<kmax; k++,ct1++) {
487         row = rbuf_i[ct1];
488         if (!PetscBTLookupSet(xtable,row)) {
489           if (!(ct3 < mem_estimate)) {
490             new_estimate = (int)(1.5*mem_estimate)+1;
491             ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
492             ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
493             ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
494             xdata[0]     = tmp;
495             mem_estimate = new_estimate; ++no_malloc;
496             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
497           }
498           xdata[i][ct2++] = row;
499           ct3++;
500         }
501       }
502       for (k=oct2,max2=ct2; k<max2; k++)  {
503         row   = xdata[i][k] - rstart;
504         start = ai[row];
505         end   = ai[row+1];
506         for (l=start; l<end; l++) {
507           val = aj[l] + cstart;
508           if (!PetscBTLookupSet(xtable,val)) {
509             if (!(ct3 < mem_estimate)) {
510               new_estimate = (int)(1.5*mem_estimate)+1;
511               ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
512               ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
513               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
514               xdata[0]     = tmp;
515               mem_estimate = new_estimate; ++no_malloc;
516               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517             }
518             xdata[i][ct2++] = val;
519             ct3++;
520           }
521         }
522         start = bi[row];
523         end   = bi[row+1];
524         for (l=start; l<end; l++) {
525           val = garray[bj[l]];
526           if (!PetscBTLookupSet(xtable,val)) {
527             if (!(ct3 < mem_estimate)) {
528               new_estimate = (int)(1.5*mem_estimate)+1;
529               ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr);
530               ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
531               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
532               xdata[0]     = tmp;
533               mem_estimate = new_estimate; ++no_malloc;
534               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
535             }
536             xdata[i][ct2++] = val;
537             ct3++;
538           }
539         }
540       }
541       /* Update the header*/
542       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
543       xdata[i][2*j-1] = rbuf_i[2*j-1];
544     }
545     xdata[i][0] = rbuf_0;
546     xdata[i+1]  = xdata[i] + ct2;
547     isz1[i]     = ct2; /* size of each message */
548   }
549   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
550   PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
551   PetscFunctionReturn(0);
552 }
553 
554 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);
555 
556 #undef __FUNCT__
557 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
558 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
559 {
560   IS          *isrow_new,*iscol_new;
561   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
562   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,N=C->N,bs=c->bs;
563 
564   PetscFunctionBegin;
565   /* The compression and expansion should be avoided. It does not point
566      out errors, and might change the indices, hence it is potentially buggy */
567 
568   ierr = PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);CHKERRQ(ierr);
569   iscol_new = isrow_new + ismax;
570   ierr = ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
571   ierr = ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);
572 
573   /* Allocate memory to hold all the submatrices */
574   if (scall != MAT_REUSE_MATRIX) {
575     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
576   }
577   /* Determine the number of stages through which submatrices are done */
578   nmax          = 20*1000000 / (c->Nbs * sizeof(int));
579   if (!nmax) nmax = 1;
580   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
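  /*
     nmax caps the number of submatrices handled per stage so that roughly
     20 MB of integer workspace (c->Nbs ints per index set) is used at a time.
     Hypothetical example (for illustration only): with Nbs = 1,000,000 block
     columns and 4-byte ints, nmax = 20,000,000/4,000,000 = 5, so ismax = 12
     requested submatrices are processed in nstages_local = 3 stages.
  */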
581 
582   /* Make sure every processor loops through the nstages */
583   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
584   for (i=0,pos=0; i<nstages; i++) {
585     if (pos+nmax <= ismax) max_no = nmax;
586     else if (pos == ismax) max_no = 0;
587     else                   max_no = ismax-pos;
588     ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
589     pos += max_no;
590   }
591 
592   for (i=0; i<ismax; i++) {
593     ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
594     ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
595   }
596   ierr = PetscFree(isrow_new);CHKERRQ(ierr);
597   PetscFunctionReturn(0);
598 }
599 
600 #if defined (PETSC_USE_CTABLE)
601 #undef __FUNCT__
602 #define __FUNCT__ "PetscGetProc"
603 int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
604 {
605   int nGlobalNd = proc_gnode[numprocs];
606   int fproc = (int) ((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);
607 
608   PetscFunctionBegin;
609   /* if(fproc < 0) SETERRQ(1,"fproc < 0");*/
610   if (fproc > numprocs) fproc = numprocs;
611   while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
612     if (gid < proc_gnode[fproc]) fproc--;
613     else                         fproc++;
614   }
615   /* if(fproc<0 || fproc>=numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); }*/
616   *proc = fproc;
617   PetscFunctionReturn(0);
618 }
619 #endif
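/*
   PetscGetProc maps a global block row to its owning process by first guessing
   the owner (assuming a roughly uniform distribution of rows) and then walking
   to the correct ownership range.  Hypothetical example (for illustration only):
   with proc_gnode = {0,4,8,12} and numprocs = 3, gid = 9 gives the initial guess
   fproc = (int)(9.0*3.0/12.0 + 0.5) = 2, and since proc_gnode[2] <= 9 < proc_gnode[3]
   the while loop exits immediately with *proc = 2.
*/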
620 
621 /* -------------------------------------------------------------------------*/
622 #undef __FUNCT__
623 #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
624 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
625 {
626   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
627   Mat         A = c->A;
628   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
629   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
630   int         **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
631   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
632   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
633   int         **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
634   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
635   int         bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
636   int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
637   int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
638   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
639   MPI_Request *r_waits4,*s_waits3,*s_waits4;
640   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
641   MPI_Status  *r_status3,*r_status4,*s_status4;
642   MPI_Comm    comm;
643   MatScalar   **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
644   MatScalar   *a_a=a->a,*b_a=b->a;
645   PetscTruth  flag;
646   int         *onodes1,*olengths1;
647 
648 #if defined (PETSC_USE_CTABLE)
649   int tt;
650   PetscTable  *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
651 #else
652   int         **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
653 #endif
654 
655   PetscFunctionBegin;
656   comm   = C->comm;
657   tag0   = C->tag;
658   size   = c->size;
659   rank   = c->rank;
660 
661   /* Get some new tags to keep the communication clean */
662   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
663   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
664   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
665 
666   /* Check if the col indices are sorted */
667   for (i=0; i<ismax; i++) {
668     ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr);
669     if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
670   }
671 
672   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int));
673 #if !defined (PETSC_USE_CTABLE)
674   len    += (Mbs+1)*sizeof(int);
675 #endif
676   ierr = PetscMalloc(len,&irow);CHKERRQ(ierr);
677   icol = irow + ismax;
678   nrow = (int*)(icol + ismax);
679   ncol = nrow + ismax;
680 #if !defined (PETSC_USE_CTABLE)
681   rtable = ncol + ismax;
682   /* Create a lookup table for the mapping: block row -> owning proc */
683   for (i=0,j=0; i<size; i++) {
684     jmax = c->rowners[i+1];
685     for (; j<jmax; j++) {
686       rtable[j] = i;
687     }
688   }
689 #endif
690 
691   for (i=0; i<ismax; i++) {
692     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
693     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
694     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
695     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
696   }
697 
698   /* Evaluate communication - mesg to who, length of mesg, and buffer space
699      required. Based on this, buffers are allocated and data is copied into them */
700   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
701   w2   = w1 + size;      /* if w2[i] marked, then a message to proc i*/
702   w3   = w2 + size;      /* no of IS that needs to be sent to proc i */
703   w4   = w3 + size;      /* temp work space used in determining w1, w2, w3 */
704   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
705   for (i=0; i<ismax; i++) {
706     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
707     jmax   = nrow[i];
708     irow_i = irow[i];
709     for (j=0; j<jmax; j++) {
710       row  = irow_i[j];
711 #if defined (PETSC_USE_CTABLE)
712       ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
713 #else
714       proc = rtable[row];
715 #endif
716       w4[proc]++;
717     }
718     for (j=0; j<size; j++) {
719       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
720     }
721   }
722 
723   nrqs     = 0;              /* no of outgoing messages */
724   msz      = 0;              /* total mesg length for all procs */
725   w1[rank] = 0;              /* no mesg sent to itself */
726   w3[rank] = 0;
727   for (i=0; i<size; i++) {
728     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
729   }
730   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
731   for (i=0,j=0; i<size; i++) {
732     if (w1[i]) { pa[j] = i; j++; }
733   }
734 
735   /* Each message would have a header = 1 + 2*(no of IS) + data */
736   for (i=0; i<nrqs; i++) {
737     j     = pa[i];
738     w1[j] += w2[j] + 2* w3[j];
739     msz   += w1[j];
740   }
741 
742   /* Determine the number of messages to expect, their lengths, and their from-ids */
743   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
744   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
745 
746   /* Now post the Irecvs corresponding to these messages */
747   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
748 
749   ierr = PetscFree(onodes1);CHKERRQ(ierr);
750   ierr = PetscFree(olengths1);CHKERRQ(ierr);
751 
752   /* Allocate Memory for outgoing messages */
753   len  = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
754   ierr = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
755   ptr  = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
756   ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
757   /* allocate memory for outgoing data + buf to receive the first reply */
758   tmp  = (int*)(ptr + size);
759   ctr  = tmp + 2*msz;
760 
761   {
762     int *iptr = tmp,ict = 0;
763     for (i=0; i<nrqs; i++) {
764       j         = pa[i];
765       iptr     += ict;
766       sbuf1[j]  = iptr;
767       ict       = w1[j];
768     }
769   }
770 
771   /* Form the outgoing messages */
772   /* Initialise the header space */
773   for (i=0; i<nrqs; i++) {
774     j           = pa[i];
775     sbuf1[j][0] = 0;
776     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
777     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
778   }
779 
780   /* Parse the isrow and copy data into outbuf */
781   for (i=0; i<ismax; i++) {
782     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
783     irow_i = irow[i];
784     jmax   = nrow[i];
785     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
786       row  = irow_i[j];
787 #if defined (PETSC_USE_CTABLE)
788       ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
789 #else
790       proc = rtable[row];
791 #endif
792       if (proc != rank) { /* copy to the outgoing buf*/
793         ctr[proc]++;
794         *ptr[proc] = row;
795         ptr[proc]++;
796       }
797     }
798     /* Update the headers for the current IS */
799     for (j=0; j<size; j++) { /* Can Optimise this loop too */
800       if ((ctr_j = ctr[j])) {
801         sbuf1_j        = sbuf1[j];
802         k              = ++sbuf1_j[0];
803         sbuf1_j[2*k]   = ctr_j;
804         sbuf1_j[2*k-1] = i;
805       }
806     }
807   }
808 
809   /*  Now  post the sends */
810   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
811   for (i=0; i<nrqs; ++i) {
812     j = pa[i];
813     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
814   }
815 
816   /* Post Receives to capture the buffer size */
817   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
818   ierr     = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr);
819   rbuf2[0] = tmp + msz;
820   for (i=1; i<nrqs; ++i) {
821     j        = pa[i];
822     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
823   }
824   for (i=0; i<nrqs; ++i) {
825     j    = pa[i];
826     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
827   }
828 
829   /* Send to other procs the buf size they should allocate */
830 
831   /* Receive messages*/
832   ierr       = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
833   ierr       = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
834   len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
835   ierr       = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
836   req_size   = (int*)(sbuf2 + nrqr);
837   req_source = req_size + nrqr;
838 
839   {
840     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
841     int        *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
842 
843     for (i=0; i<nrqr; ++i) {
844       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
845       req_size[idex] = 0;
846       rbuf1_i         = rbuf1[idex];
847       start           = 2*rbuf1_i[0] + 1;
848       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
849       ierr            = PetscMalloc(end*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
850       sbuf2_i         = sbuf2[idex];
851       for (j=start; j<end; j++) {
852         id               = rbuf1_i[j] - rstart;
853         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
854         sbuf2_i[j]       = ncols;
855         req_size[idex] += ncols;
856       }
857       req_source[idex] = r_status1[i].MPI_SOURCE;
858       /* form the header */
859       sbuf2_i[0]   = req_size[idex];
860       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
861       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
862     }
863   }
864   ierr = PetscFree(r_status1);CHKERRQ(ierr);
865   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
866 
867   /* Receive the buffer sizes sent back by the other procs, then post the
868      receives for the column indices and values that will follow */
869 
870   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
871   ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
872   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
873   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
874   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
875 
876   for (i=0; i<nrqs; ++i) {
877     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
878     ierr = PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
879     ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
880     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
881                      r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
882     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
883                      r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
884   }
885   ierr = PetscFree(r_status2);CHKERRQ(ierr);
886   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
887 
888   /* Wait on sends1 and sends2 */
889   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
890   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
891 
892   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
893   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
894   ierr = PetscFree(s_status1);CHKERRQ(ierr);
895   ierr = PetscFree(s_status2);CHKERRQ(ierr);
896   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
897   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
898 
899   /* Now allocate buffers for a->j, and send them off */
900   ierr = PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);CHKERRQ(ierr);
901   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
902   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
903   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
904 
905   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
906   {
907      for (i=0; i<nrqr; i++) {
908       rbuf1_i   = rbuf1[i];
909       sbuf_aj_i = sbuf_aj[i];
910       ct1       = 2*rbuf1_i[0] + 1;
911       ct2       = 0;
912       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
913         kmax = rbuf1[i][2*j];
914         for (k=0; k<kmax; k++,ct1++) {
915           row    = rbuf1_i[ct1] - rstart;
916           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
917           ncols  = nzA + nzB;
918           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
919 
920           /* load the column indices for this row into cols*/
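          /* The merge below produces globally sorted block column indices:
             first the off-diagonal (B) columns whose global index is below
             cstart, then the diagonal (A) columns shifted by cstart, then the
             remaining off-diagonal columns (bmap = garray is kept sorted). */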
921           cols  = sbuf_aj_i + ct2;
922           for (l=0; l<nzB; l++) {
923             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
924             else break;
925           }
926           imark = l;
927           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
928           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
929           ct2 += ncols;
930         }
931       }
932       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
933     }
934   }
935   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
936   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
937 
938   /* Allocate buffers for a->a, and send them off */
939   ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
940   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
941   ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
942   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
943 
944   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
945   {
946     for (i=0; i<nrqr; i++) {
947       rbuf1_i   = rbuf1[i];
948       sbuf_aa_i = sbuf_aa[i];
949       ct1       = 2*rbuf1_i[0]+1;
950       ct2       = 0;
951       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
952         kmax = rbuf1_i[2*j];
953         for (k=0; k<kmax; k++,ct1++) {
954           row    = rbuf1_i[ct1] - rstart;
955           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
956           ncols  = nzA + nzB;
957           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
958           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
959 
960           /* load the column values for this row into vals*/
961           vals  = sbuf_aa_i+ct2*bs2;
962           for (l=0; l<nzB; l++) {
963             if ((bmap[cworkB[l]]) < cstart) {
964               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
965             }
966             else break;
967           }
968           imark = l;
969           for (l=0; l<nzA; l++) {
970             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
971           }
972           for (l=imark; l<nzB; l++) {
973             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
974           }
975           ct2 += ncols;
976         }
977       }
978       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
979     }
980   }
981   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
982   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
983   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
984 
985   /* Form the matrix */
986   /* create col map */
987   {
988     int *icol_i;
989 #if defined (PETSC_USE_CTABLE)
990     /* Create row map*/
991     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
992     for (i=0; i<ismax; i++) {
993       ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
994     }
995 #else
996     len     = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
997     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
998     cmap[0] = (int *)(cmap + ismax);
999     ierr    = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr);
1000     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1001 #endif
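    /* Both variants store (local column index + 1) for every global block
       column that appears in iscol[i]; a lookup result of 0 therefore means
       "column not in this submatrix", which is why later code tests the value
       and subtracts 1 before using it as a column index. */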
1002     for (i=0; i<ismax; i++) {
1003       jmax   = ncol[i];
1004       icol_i = icol[i];
1005 #if defined (PETSC_USE_CTABLE)
1006       lcol1_gcol1 = colmaps[i];
1007       for (j=0; j<jmax; j++) {
1008 	ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
1009       }
1010 #else
1011       cmap_i = cmap[i];
1012       for (j=0; j<jmax; j++) {
1013         cmap_i[icol_i[j]] = j+1;
1014       }
1015 #endif
1016     }
1017   }
1018 
1019   /* Create lens which is required for MatCreate... */
1020   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1021   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1022   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1023   lens[0] = (int *)(lens + ismax);
1024   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1025   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1026 
1027   /* Update lens from local data */
1028   for (i=0; i<ismax; i++) {
1029     jmax   = nrow[i];
1030 #if defined (PETSC_USE_CTABLE)
1031     lcol1_gcol1 = colmaps[i];
1032 #else
1033     cmap_i = cmap[i];
1034 #endif
1035     irow_i = irow[i];
1036     lens_i = lens[i];
1037     for (j=0; j<jmax; j++) {
1038       row  = irow_i[j];
1039 #if defined (PETSC_USE_CTABLE)
1040       ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
1041 #else
1042       proc = rtable[row];
1043 #endif
1044       if (proc == rank) {
1045         /* Get indices from matA and then from matB */
1046         row    = row - rstart;
1047         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1048         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
1049 #if defined (PETSC_USE_CTABLE)
1050        for (k=0; k<nzA; k++) {
1051 	 ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
1052           if (tt) { lens_i[j]++; }
1053         }
1054         for (k=0; k<nzB; k++) {
1055 	  ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
1056           if (tt) { lens_i[j]++; }
1057         }
1058 #else
1059         for (k=0; k<nzA; k++) {
1060           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1061         }
1062         for (k=0; k<nzB; k++) {
1063           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1064         }
1065 #endif
1066       }
1067     }
1068   }
1069 #if defined (PETSC_USE_CTABLE)
1070   /* Create row map*/
1071   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
1072   for (i=0; i<ismax; i++){
1073     ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
1074   }
1075 #else
1076   /* Create row map*/
1077   len     = (1+ismax)*sizeof(int*)+ ismax*Mbs*sizeof(int);
1078   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1079   rmap[0] = (int *)(rmap + ismax);
1080   ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));CHKERRQ(ierr);
1081   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1082 #endif
1083   for (i=0; i<ismax; i++) {
1084     irow_i = irow[i];
1085     jmax   = nrow[i];
1086 #if defined (PETSC_USE_CTABLE)
1087     lrow1_grow1 = rowmaps[i];
1088     for (j=0; j<jmax; j++) {
1089       ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
1090     }
1091 #else
1092     rmap_i = rmap[i];
1093     for (j=0; j<jmax; j++) {
1094       rmap_i[irow_i[j]] = j;
1095     }
1096 #endif
1097   }
1098 
1099   /* Update lens from offproc data */
1100   {
1101     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1102 
1103     for (tmp2=0; tmp2<nrqs; tmp2++) {
1104       ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1105       idex   = pa[i];
1106       sbuf1_i = sbuf1[idex];
1107       jmax    = sbuf1_i[0];
1108       ct1     = 2*jmax+1;
1109       ct2     = 0;
1110       rbuf2_i = rbuf2[i];
1111       rbuf3_i = rbuf3[i];
1112       for (j=1; j<=jmax; j++) {
1113         is_no   = sbuf1_i[2*j-1];
1114         max1    = sbuf1_i[2*j];
1115         lens_i  = lens[is_no];
1116 #if defined (PETSC_USE_CTABLE)
1117 	lcol1_gcol1 = colmaps[is_no];
1118 	lrow1_grow1 = rowmaps[is_no];
1119 #else
1120         cmap_i  = cmap[is_no];
1121 	rmap_i  = rmap[is_no];
1122 #endif
1123         for (k=0; k<max1; k++,ct1++) {
1124 #if defined (PETSC_USE_CTABLE)
1125 	  ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1126           row--;
1127           if(row < 0) { SETERRQ(1,"row not found in table"); }
1128 #else
1129           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1130 #endif
1131           max2 = rbuf2_i[ct1];
1132           for (l=0; l<max2; l++,ct2++) {
1133 #if defined (PETSC_USE_CTABLE)
1134 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
1135 	    if (tt) {
1136               lens_i[row]++;
1137             }
1138 #else
1139             if (cmap_i[rbuf3_i[ct2]]) {
1140               lens_i[row]++;
1141             }
1142 #endif
1143           }
1144         }
1145       }
1146     }
1147   }
1148   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1149   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1150   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1151   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1152   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1153 
1154   /* Create the submatrices */
1155   if (scall == MAT_REUSE_MATRIX) {
1156     /*
1157         Assumes new rows are same length as the old rows, hence bug!
1158     */
1159     for (i=0; i<ismax; i++) {
1160       mat = (Mat_SeqBAIJ *)(submats[i]->data);
1161       if (mat->mbs != nrow[i] || mat->nbs != ncol[i] || mat->bs != bs) {
1162         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1163       }
1164       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);CHKERRQ(ierr);
1165       if (flag == PETSC_FALSE) {
1166         SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1167       }
1168       /* Initialize the matrix as if it were empty */
1169       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr);
1170       submats[i]->factor = C->factor;
1171     }
1172   } else {
1173     for (i=0; i<ismax; i++) {
1174       ierr = MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);CHKERRQ(ierr);
1175       ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
1176       ierr = MatSeqBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
1177       ierr = MatSeqSBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
1178     }
1179   }
1180 
1181   /* Assemble the matrices */
1182   /* First assemble the local rows */
1183   {
1184     int       ilen_row,*imat_ilen,*imat_j,*imat_i;
1185     MatScalar *imat_a;
1186 
1187     for (i=0; i<ismax; i++) {
1188       mat       = (Mat_SeqBAIJ*)submats[i]->data;
1189       imat_ilen = mat->ilen;
1190       imat_j    = mat->j;
1191       imat_i    = mat->i;
1192       imat_a    = mat->a;
1193 
1194 #if defined (PETSC_USE_CTABLE)
1195       lcol1_gcol1 = colmaps[i];
1196       lrow1_grow1 = rowmaps[i];
1197 #else
1198       cmap_i    = cmap[i];
1199       rmap_i    = rmap[i];
1200 #endif
1201       irow_i    = irow[i];
1202       jmax      = nrow[i];
1203       for (j=0; j<jmax; j++) {
1204         row      = irow_i[j];
1205 #if defined (PETSC_USE_CTABLE)
1206 	ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
1207 #else
1208 	proc = rtable[row];
1209 #endif
1210         if (proc == rank) {
1211           row      = row - rstart;
1212           nzA      = a_i[row+1] - a_i[row];
1213           nzB      = b_i[row+1] - b_i[row];
1214           cworkA   = a_j + a_i[row];
1215           cworkB   = b_j + b_i[row];
1216           vworkA   = a_a + a_i[row]*bs2;
1217           vworkB   = b_a + b_i[row]*bs2;
1218 #if defined (PETSC_USE_CTABLE)
1219 	  ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
1220           row--;
1221           if (row < 0) { SETERRQ(1,"row not found in table"); }
1222 #else
1223           row      = rmap_i[row + rstart];
1224 #endif
1225           mat_i    = imat_i[row];
1226           mat_a    = imat_a + mat_i*bs2;
1227           mat_j    = imat_j + mat_i;
1228           ilen_row = imat_ilen[row];
1229 
1230           /* load the column indices for this row into cols*/
1231           for (l=0; l<nzB; l++) {
1232 	    if ((ctmp = bmap[cworkB[l]]) < cstart) {
1233 #if defined (PETSC_USE_CTABLE)
1234 	      ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
1235 	      if (tcol) {
1236 #else
1237               if ((tcol = cmap_i[ctmp])) {
1238 #endif
1239                 *mat_j++ = tcol - 1;
1240                 ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1241                 mat_a   += bs2;
1242                 ilen_row++;
1243               }
1244             } else break;
1245           }
1246           imark = l;
1247           for (l=0; l<nzA; l++) {
1248 #if defined (PETSC_USE_CTABLE)
1249 	    ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
1250 	    if (tcol) {
1251 #else
1252 	    if ((tcol = cmap_i[cstart + cworkA[l]])) {
1253 #endif
1254               *mat_j++ = tcol - 1;
1255               ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1256               mat_a   += bs2;
1257               ilen_row++;
1258             }
1259           }
1260           for (l=imark; l<nzB; l++) {
1261 #if defined (PETSC_USE_CTABLE)
1262 	    ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
1263 	    if (tcol) {
1264 #else
1265             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1266 #endif
1267               *mat_j++ = tcol - 1;
1268               ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1269               mat_a   += bs2;
1270               ilen_row++;
1271             }
1272           }
1273           imat_ilen[row] = ilen_row;
1274         }
1275       }
1276 
1277     }
1278   }
1279 
1280   /*   Now assemble the off proc rows*/
1281   {
1282     int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1283     int       *imat_j,*imat_i;
1284     MatScalar *imat_a,*rbuf4_i;
1285 
1286     for (tmp2=0; tmp2<nrqs; tmp2++) {
1287       ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1288       idex   = pa[i];
1289       sbuf1_i = sbuf1[idex];
1290       jmax    = sbuf1_i[0];
1291       ct1     = 2*jmax + 1;
1292       ct2     = 0;
1293       rbuf2_i = rbuf2[i];
1294       rbuf3_i = rbuf3[i];
1295       rbuf4_i = rbuf4[i];
1296       for (j=1; j<=jmax; j++) {
1297         is_no     = sbuf1_i[2*j-1];
1298 #if defined (PETSC_USE_CTABLE)
1299 	lrow1_grow1 = rowmaps[is_no];
1300 	lcol1_gcol1 = colmaps[is_no];
1301 #else
1302         rmap_i    = rmap[is_no];
1303         cmap_i    = cmap[is_no];
1304 #endif
1305         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
1306         imat_ilen = mat->ilen;
1307         imat_j    = mat->j;
1308         imat_i    = mat->i;
1309         imat_a    = mat->a;
1310         max1      = sbuf1_i[2*j];
1311         for (k=0; k<max1; k++,ct1++) {
1312           row   = sbuf1_i[ct1];
1313 #if defined (PETSC_USE_CTABLE)
1314 	  ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
1315           row--;
1316           if(row < 0) { SETERRQ(1,"row not found in table"); }
1317 #else
1318           row   = rmap_i[row];
1319 #endif
1320           ilen  = imat_ilen[row];
1321           mat_i = imat_i[row];
1322           mat_a = imat_a + mat_i*bs2;
1323           mat_j = imat_j + mat_i;
1324           max2 = rbuf2_i[ct1];
1325           for (l=0; l<max2; l++,ct2++) {
1326 #if defined (PETSC_USE_CTABLE)
1327 	    ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1328 	    if (tcol) {
1329 #else
1330 	    if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1331 #endif
1332               *mat_j++    = tcol - 1;
1333               /* *mat_a++= rbuf4_i[ct2]; */
1334               ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1335               mat_a      += bs2;
1336               ilen++;
1337             }
1338           }
1339           imat_ilen[row] = ilen;
1340         }
1341       }
1342     }
1343   }
1344   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1345   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1346   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1347   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1348   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1349 
1350   /* Restore the indices */
1351   for (i=0; i<ismax; i++) {
1352     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1353     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1354   }
1355 
1356   /* Destroy allocated memory */
1357   ierr = PetscFree(irow);CHKERRQ(ierr);
1358   ierr = PetscFree(w1);CHKERRQ(ierr);
1359   ierr = PetscFree(pa);CHKERRQ(ierr);
1360 
1361   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1362   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1363   for (i=0; i<nrqr; ++i) {
1364     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1365   }
1366   for (i=0; i<nrqs; ++i) {
1367     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1368     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1369   }
1370 
1371   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1372   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1373   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1374   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1375   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1376   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1377   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1378 
1379 #if defined (PETSC_USE_CTABLE)
1380   for (i=0; i<ismax; i++){
1381     ierr = PetscTableDelete(rowmaps[i]);CHKERRQ(ierr);
1382     ierr = PetscTableDelete(colmaps[i]);CHKERRQ(ierr);
1383   }
1384   ierr = PetscFree(colmaps);CHKERRQ(ierr);
1385   ierr = PetscFree(rowmaps);CHKERRQ(ierr);
1386 #else
1387   ierr = PetscFree(rmap);CHKERRQ(ierr);
1388   ierr = PetscFree(cmap);CHKERRQ(ierr);
1389 #endif
1390   ierr = PetscFree(lens);CHKERRQ(ierr);
1391 
1392   for (i=0; i<ismax; i++) {
1393     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1394     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1395   }
1396 
1397   PetscFunctionReturn(0);
1398 }
1399 
1400