xref: /petsc/src/mat/impls/baij/mpi/baijov.c (revision 95d5f7c29374efcfd2ca44c2fe93981fbc2b4454)
1 /*$Id: baijov.c,v 1.44 2000/04/09 04:36:25 bsmith Exp bsmith $*/
2 
3 /*
4    Routines to compute overlapping regions of a parallel MPI matrix
5   and to find submatrices that are shared across processors.
6 */
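/*
   These routines implement MatIncreaseOverlap() and MatGetSubMatrices() for
   the MPIBAIJ format.  A minimal usage sketch through the public interface
   (purely illustrative; C is assumed to be an assembled MPIBAIJ matrix, and
   for the submatrix call the index sets must be sorted and aligned to the
   block size):

     IS  is[1];
     Mat *submat;
     ierr = ISCreateStride(PETSC_COMM_SELF,10,0,1,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,2);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,is,is,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   MatIncreaseOverlap() grows is[0] by two levels of overlap;
   MatGetSubMatrices() then extracts the corresponding sequential submatrix
   on each process.
*/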
7 #include "src/mat/impls/baij/mpi/mpibaij.h"
8 #include "bitarray.h"
9 
10 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
11 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
12 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
13 extern int MatGetRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);
14 extern int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,Scalar**);
15 
16 #undef __FUNC__
17 #define __FUNC__ /*<a name=""></a>*/"MatCompressIndicesGeneral_MPIBAIJ"
18 static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
19 {
20   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ*)C->data;
21   int          ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
22   PetscBT      table;
23 
24   PetscFunctionBegin;
25   Nbs   = baij->Nbs;
26   nidx  = (int*)PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx);
27   ierr  = PetscBTCreate(Nbs,table);CHKERRQ(ierr);
28 
29   for (i=0; i<imax; i++) {
30     isz  = 0;
31     ierr = PetscBTMemzero(Nbs,table);CHKERRQ(ierr);
32     ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
33     ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
34     for (j=0; j<n ; j++) {
35       ival = idx[j]/bs; /* convert the indices into block indices */
36       if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
37       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
38     }
39     ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
40     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr);
41   }
42   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
43   ierr = PetscFree(nidx);CHKERRQ(ierr);
44   PetscFunctionReturn(0);
45 }
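/*
   Example (illustrative): with bs = 2, a point-index set {3,4,5,8} compresses
   to the block-index set {1,2,4}, since 3/2 = 1, 4/2 = 5/2 = 2 and 8/2 = 4;
   the bit table removes the duplicate block 2.
*/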
46 
47 #undef __FUNC__
48 #define __FUNC__ /*<a name=""></a>*/"MatCompressIndicesSorted_MPIBAIJ"
49 static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
50 {
51   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ*)C->data;
52   int          ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local;
53   PetscTruth   flg;
54 
55   PetscFunctionBegin;
56   for (i=0; i<imax; i++) {
57     ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr);
58     if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Indices are not sorted");
59   }
60   nidx  = (int*)PetscMalloc((Nbs+1)*sizeof(int));CHKPTRQ(nidx);
61   /* Now check if the indices are in block order */
62   for (i=0; i<imax; i++) {
63     ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
64     ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
65     if (n%bs !=0) SETERRA(1,0,"Indices are not block ordered");
66 
67     n = n/bs; /* The reduced index size */
68     idx_local = idx;
69     for (j=0; j<n ; j++) {
70       val = idx_local[0];
71       if (val%bs != 0) SETERRA(1,0,"Indices are not block ordered");
72       for (k=0; k<bs; k++) {
73         if (val+k != idx_local[k]) SETERRA(1,0,"Indices are not block ordered");
74       }
75       nidx[j] = val/bs;
76       idx_local +=bs;
77     }
78     ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
79     ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr);
80   }
81   ierr = PetscFree(nidx);CHKERRQ(ierr);
82   PetscFunctionReturn(0);
83 }
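/*
   Example (illustrative): with bs = 2 the sorted point-index set {4,5,10,11}
   is block ordered (each block {4,5} and {10,11} is complete and starts on a
   multiple of bs) and compresses to the block-index set {2,5}; inputs such as
   {4,5,10} or {5,6} would trigger the "not block ordered" error above.
*/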
84 
85 #undef __FUNC__
86 #define __FUNC__ /*<a name=""></a>*/"MatExpandIndices_MPIBAIJ"
87 static int MatExpandIndices_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
88 {
89   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ*)C->data;
90   int          ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;
91 
92   PetscFunctionBegin;
93   Nbs   = baij->Nbs;
94 
95   nidx  = (int*)PetscMalloc((Nbs*bs+1)*sizeof(int));CHKPTRQ(nidx);
96 
97   for (i=0; i<imax; i++) {
98     ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr);
99     ierr = ISGetSize(is_in[i],&n);CHKERRQ(ierr);
100     for (j=0; j<n ; ++j){
101       for (k=0; k<bs; k++)
102         nidx[j*bs+k] = idx[j]*bs+k;
103     }
104     ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr);
105     ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);CHKERRQ(ierr);
106   }
107   ierr = PetscFree(nidx);CHKERRQ(ierr);
108   PetscFunctionReturn(0);
109 }
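/*
   Example (illustrative): this is the inverse of the compression above; with
   bs = 2 the block-index set {1,3} expands to the point-index set {2,3,6,7}.
*/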
110 
111 
112 #undef __FUNC__
113 #define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ"
114 int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS *is,int ov)
115 {
116   int i,ierr;
117   IS  *is_new;
118 
119   PetscFunctionBegin;
120   is_new = (IS *)PetscMalloc(imax*sizeof(IS));CHKPTRQ(is_new);
121   /* Convert the indices into block format */
122   ierr = MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new);CHKERRQ(ierr);
123   if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified\n");}
124   for (i=0; i<ov; ++i) {
125     ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
126   }
127   for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);}
128   ierr = MatExpandIndices_MPIBAIJ(C,imax,is_new,is);CHKERRQ(ierr);
129   for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);}
130   ierr = PetscFree(is_new);CHKERRQ(ierr);
131   PetscFunctionReturn(0);
132 }
133 
134 /*
135   Sample message format:
136   If processor A wants processor B to process some elements corresponding
137   to index sets is[1], is[5]
138   mesg [0] = 2   (no of index sets in the mesg)
139   -----------
140   mesg [1] = 1 => is[1]
141   mesg [2] = size of is[1];
142   -----------
143   mesg [3] = 5  => is[5]
144   mesg [4] = size of is[5];
145   -----------
146   mesg [5] ...
147   mesg [n]   data for is[1]
148   -----------
149   mesg [n+1] ...
150   mesg [m]   data for is[5]
151   -----------
152 
153   Notes:
154   nrqs - no of requests sent (or to be sent out)
155   nrqr - no of requests received (which have to be, or which have been, processed)
156 */
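/*
   Concrete example (illustrative): if this process asks a neighbour to work on
   block rows {7,9,12} of is[1] and block rows {3,4} of is[5], the packed
   request is the integer buffer

     { 2,  1,3,  5,2,  7,9,12,  3,4 }

   i.e. a header of 1 + 2*(no of IS) entries followed by the row data, matching
   the layout built in MatIncreaseOverlap_MPIBAIJ_Once() below.
*/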
157 #undef __FUNC__
158 #define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Once"
159 static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS *is)
160 {
161   Mat_MPIBAIJ  *c = (Mat_MPIBAIJ*)C->data;
162   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
163   int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
164   int         *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2;
165   PetscBT     *table;
166   MPI_Comm    comm;
167   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
168   MPI_Status  *s_status,*recv_status;
169 
170   PetscFunctionBegin;
171   comm   = C->comm;
172   tag    = C->tag;
173   size   = c->size;
174   rank   = c->rank;
175   Mbs    = c->Mbs;
176 
177   len    = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
178   idx    = (int**)PetscMalloc(len);CHKPTRQ(idx);
179   n      = (int*)(idx + imax);
180   rtable = n + imax;
181 
182   for (i=0; i<imax; i++) {
183     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
184     ierr = ISGetSize(is[i],&n[i]);CHKERRQ(ierr);
185   }
186 
187   /* Create hash table for the mapping :row -> proc*/
188   for (i=0,j=0; i<size; i++) {
189     len = c->rowners[i+1];
190     for (; j<len; j++) {
191       rtable[j] = i;
192     }
193   }
194 
195   /* evaluate communication - mesg to who, length of mesg, and buffer space
196      required. Based on this, buffers are allocated, and data copied into them */
197   w1   = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/*  mesg size */
198   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
199   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
200   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
201   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
202   for (i=0; i<imax; i++) {
203     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
204     idx_i = idx[i];
205     len   = n[i];
206     for (j=0; j<len; j++) {
207       row  = idx_i[j];
208       if (row < 0) {
209         SETERRQ(1,1,"Index set cannot have negative entries");
210       }
211       proc = rtable[row];
212       w4[proc]++;
213     }
214     for (j=0; j<size; j++){
215       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
216     }
217   }
218 
219   nrqs     = 0;              /* no of outgoing messages */
220   msz      = 0;              /* total mesg length (for all procs) */
221   w1[rank] = 0;              /* no mesg sent to itself */
222   w3[rank] = 0;
223   for (i=0; i<size; i++) {
224     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
225   }
226   /* pa - is list of processors to communicate with */
227   pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa);
228   for (i=0,j=0; i<size; i++) {
229     if (w1[i]) {pa[j] = i; j++;}
230   }
231 
232   /* Each message would have a header = 1 + 2*(no of IS) + data */
233   for (i=0; i<nrqs; i++) {
234     j      = pa[i];
235     w1[j] += w2[j] + 2*w3[j];
236     msz   += w1[j];
237   }
238 
239 
240   /* Do a global reduction to determine how many messages to expect*/
241   {
242     int *rw1;
243     rw1   = (int*)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
244     ierr  = MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
245     bsz   = rw1[rank];
246     nrqr  = rw1[size+rank];
247     ierr  = PetscFree(rw1);CHKERRQ(ierr);
248   }
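  /* After the PetscMaxSum_Op reduction, rw1[rank] holds the length of the
     longest single message destined for this process (bsz) and
     rw1[size+rank] the number of processes that will send to it (nrqr),
     so every receive buffer below can be bsz ints long. */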
249 
250   /* Allocate memory for recv buffers (probably none needed if nrqr = 0) */
251   len     = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
252   rbuf    = (int**)PetscMalloc(len);CHKPTRQ(rbuf);
253   rbuf[0] = (int*)(rbuf + nrqr);
254   for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz;
255 
256   /* Post the receives */
257   r_waits1 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
258   for (i=0; i<nrqr; ++i) {
259     ierr = MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);CHKERRQ(ierr);
260   }
261 
262   /* Allocate Memory for outgoing messages */
263   len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
264   outdat = (int **)PetscMalloc(len);CHKPTRQ(outdat);
265   ptr    = outdat + size;     /* Pointers to the data in outgoing buffers */
266   ierr   = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
267   tmp    = (int*)(outdat + 2*size);
268   ctr    = tmp + msz;
269 
270   {
271     int *iptr = tmp,ict  = 0;
272     for (i=0; i<nrqs; i++) {
273       j         = pa[i];
274       iptr     +=  ict;
275       outdat[j] = iptr;
276       ict       = w1[j];
277     }
278   }
279 
280   /* Form the outgoing messages */
281   /*plug in the headers*/
282   for (i=0; i<nrqs; i++) {
283     j            = pa[i];
284     outdat[j][0] = 0;
285     ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
286     ptr[j]       = outdat[j] + 2*w3[j] + 1;
287   }
288 
289   /* Memory for doing local proc's work*/
290   {
291     int  *d_p;
292     char *t_p;
293 
294     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
295       (Mbs)*imax*sizeof(int)  + (Mbs/BITSPERBYTE+1)*imax*sizeof(char) + 1;
296     table = (PetscBT *)PetscMalloc(len);CHKPTRQ(table);
297     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
298     data  = (int **)(table + imax);
299     isz   = (int  *)(data  + imax);
300     d_p   = (int  *)(isz   + imax);
301     t_p   = (char *)(d_p   + Mbs*imax);
302     for (i=0; i<imax; i++) {
303       table[i] = t_p + (Mbs/BITSPERBYTE+1)*i;
304       data[i]  = d_p + (Mbs)*i;
305     }
306   }
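  /* For each index set i: table[i] is a bit array over the Mbs block rows,
     data[i] accumulates the (unique) block rows gathered so far, and isz[i]
     is the current count in data[i]; all are carved out of one allocation. */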
307 
308   /* Parse the IS and update local tables and the outgoing buf with the data*/
309   {
310     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
311     PetscBT table_i;
312 
313     for (i=0; i<imax; i++) {
314       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
315       n_i     = n[i];
316       table_i = table[i];
317       idx_i   = idx[i];
318       data_i  = data[i];
319       isz_i   = isz[i];
320       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
321         row  = idx_i[j];
322         proc = rtable[row];
323         if (proc != rank) { /* copy to the outgoing buffer */
324           ctr[proc]++;
325           *ptr[proc] = row;
326           ptr[proc]++;
327         }
328         else { /* Update the local table */
329           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
330         }
331       }
332       /* Update the headers for the current IS */
333       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
334         if ((ctr_j = ctr[j])) {
335           outdat_j        = outdat[j];
336           k               = ++outdat_j[0];
337           outdat_j[2*k]   = ctr_j;
338           outdat_j[2*k-1] = i;
339         }
340       }
341       isz[i] = isz_i;
342     }
343   }
344 
345   /*  Now  post the sends */
346   s_waits1 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
347   for (i=0; i<nrqs; ++i) {
348     j    = pa[i];
349     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag,comm,s_waits1+i);CHKERRQ(ierr);
350   }
351 
352   /* No longer need the original indices*/
353   for (i=0; i<imax; ++i) {
354     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
355   }
356   ierr = PetscFree(idx);CHKERRQ(ierr);
357 
358   for (i=0; i<imax; ++i) {
359     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
360   }
361 
362   /* Do Local work*/
363   ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
364 
365   /* Receive messages*/
366   {
367     int        index;
368 
369     recv_status = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(recv_status);
370     for (i=0; i<nrqr; ++i) {
371       ierr = MPI_Waitany(nrqr,r_waits1,&index,recv_status+i);CHKERRQ(ierr);
372     }
373 
374     s_status = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status);
375     ierr     = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
376   }
377 
378   /* Phase 1 sends are complete - deallocate buffers */
379   ierr = PetscFree(outdat);CHKERRQ(ierr);
380   ierr = PetscFree(w1);CHKERRQ(ierr);
381 
382   xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(xdata);
383   isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int));CHKPTRQ(isz1);
384   ierr  = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
385   ierr  = PetscFree(rbuf);CHKERRQ(ierr);
386 
387   /* Send the data back*/
388   /* Do a global reduction to know the buffer space req for incoming messages*/
389   {
390     int *rw1,*rw2;
391 
392     rw1  = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
393     ierr = PetscMemzero(rw1,2*size*sizeof(int));CHKERRQ(ierr);
394     rw2  = rw1+size;
395     for (i=0; i<nrqr; ++i) {
396       proc      = recv_status[i].MPI_SOURCE;
397       rw1[proc] = isz1[i];
398     }
399 
400     ierr   = MPI_Allreduce(rw1,rw2,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
401     bsz1   = rw2[rank];
402     ierr = PetscFree(rw1);CHKERRQ(ierr);
403   }
404 
405   /* Allocate buffers*/
406 
407   /* Allocate memory for recv buffers (probably none needed if nrqs = 0) */
408   len      = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int);
409   rbuf2    = (int**)PetscMalloc(len);CHKPTRQ(rbuf2);
410   rbuf2[0] = (int*)(rbuf2 + nrqs);
411   for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;
412 
413   /* Post the receives */
414   r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
415   for (i=0; i<nrqs; ++i) {
416     ierr = MPI_Irecv(rbuf2[i],bsz1,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits2+i);CHKERRQ(ierr);
417   }
418 
419   /*  Now  post the sends */
420   s_waits2 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
421   for (i=0; i<nrqr; ++i) {
422     j    = recv_status[i].MPI_SOURCE;
423     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag,comm,s_waits2+i);CHKERRQ(ierr);
424   }
425 
426   /* receive work done on other processors*/
427   {
428     int         index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
429     PetscBT     table_i;
430     MPI_Status  *status2;
431 
432     status2 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2);
433 
434     for (i=0; i<nrqs; ++i) {
435       ierr = MPI_Waitany(nrqs,r_waits2,&index,status2+i);CHKERRQ(ierr);
436       /* Process the message*/
437       rbuf2_i = rbuf2[index];
438       ct1     = 2*rbuf2_i[0]+1;
439       jmax    = rbuf2[index][0];
440       for (j=1; j<=jmax; j++) {
441         max     = rbuf2_i[2*j];
442         is_no   = rbuf2_i[2*j-1];
443         isz_i   = isz[is_no];
444         data_i  = data[is_no];
445         table_i = table[is_no];
446         for (k=0; k<max; k++,ct1++) {
447           row = rbuf2_i[ct1];
448           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
449         }
450         isz[is_no] = isz_i;
451       }
452     }
453     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
454     ierr = PetscFree(status2);CHKERRQ(ierr);
455   }
456 
457   for (i=0; i<imax; ++i) {
458     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
459   }
460 
461   ierr = PetscFree(pa);CHKERRQ(ierr);
462   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
463   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
464   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
465   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
466   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
467   ierr = PetscFree(table);CHKERRQ(ierr);
468   ierr = PetscFree(s_status);CHKERRQ(ierr);
469   ierr = PetscFree(recv_status);CHKERRQ(ierr);
470   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
471   ierr = PetscFree(xdata);CHKERRQ(ierr);
472   ierr = PetscFree(isz1);CHKERRQ(ierr);
473   PetscFunctionReturn(0);
474 }
475 
476 #undef __FUNC__
477 #define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Local"
478 /*
479    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
480        the work on the local processor.
481 
482      Inputs:
483       C      - MAT_MPIBAIJ;
484       imax - total no of index sets processed at a time;
485       table  - an array of char - size = Mbs bits.
486 
487      Output:
488       isz    - array containing the count of the solution elements corresponding
489                to each index set;
490       data   - pointer to the solutions
491 */
492 static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
493 {
494   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
495   Mat         A = c->A,B = c->B;
496   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
497   int         start,end,val,max,rstart,cstart,*ai,*aj;
498   int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
499   PetscBT     table_i;
500 
501   PetscFunctionBegin;
502   rstart = c->rstart;
503   cstart = c->cstart;
504   ai     = a->i;
505   aj     = a->j;
506   bi     = b->i;
507   bj     = b->j;
508   garray = c->garray;
509 
510 
511   for (i=0; i<imax; i++) {
512     data_i  = data[i];
513     table_i = table[i];
514     isz_i   = isz[i];
515     for (j=0,max=isz[i]; j<max; j++) {
516       row   = data_i[j] - rstart;
517       start = ai[row];
518       end   = ai[row+1];
519       for (k=start; k<end; k++) { /* Amat */
520         val = aj[k] + cstart;
521         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
522       }
523       start = bi[row];
524       end   = bi[row+1];
525       for (k=start; k<end; k++) { /* Bmat */
526         val = garray[bj[k]];
527         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
528       }
529     }
530     isz[i] = isz_i;
531   }
532   PetscFunctionReturn(0);
533 }
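/*
   Illustrative effect of the routine above: for every block row r already in
   data[i], all block columns of r are added, i.e. the aj[] entries shifted by
   cstart for the diagonal part and garray[bj[]] for the off-diagonal part,
   each inserted only once thanks to the bit table.  This grows the index set
   by one level of overlap for the locally owned rows.
*/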
534 #undef __FUNC__
535 #define __FUNC__ /*<a name=""></a>*/"MatIncreaseOverlap_MPIBAIJ_Receive"
536 /*
537       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
538          and return the output
539 
540          Input:
541            C    - the matrix
542            nrqr - no of messages being processed.
543            rbuf - an array of pointers to the received requests
544 
545          Output:
546            xdata - array of messages to be sent back
547            isz1  - size of each message
548 
549   For better efficiency perhaps we should malloc separately each xdata[i],
550 then if a realloc is required we need only copy the data for that one row
551 rather than all previous rows as it is now where a single large chunk of
552 memory is used.
553 
554 */
555 static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
556 {
557   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
558   Mat         A = c->A,B = c->B;
559   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
560   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
561   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
562   int         val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
563   int         *rbuf_i,kmax,rbuf_0,ierr;
564   PetscBT     xtable;
565 
566   PetscFunctionBegin;
567   rank   = c->rank;
568   Mbs    = c->Mbs;
569   rstart = c->rstart;
570   cstart = c->cstart;
571   ai     = a->i;
572   aj     = a->j;
573   bi     = b->i;
574   bj     = b->j;
575   garray = c->garray;
576 
577 
578   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
579     rbuf_i  =  rbuf[i];
580     rbuf_0  =  rbuf_i[0];
581     ct     += rbuf_0;
582     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
583   }
584 
585   if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
586   else        max1 = 1;
587   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
588   xdata[0]     = (int*)PetscMalloc(mem_estimate*sizeof(int));CHKPTRQ(xdata[0]);
589   ++no_malloc;
590   ierr         = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
591   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
592 
593   ct3 = 0;
594   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
595     rbuf_i =  rbuf[i];
596     rbuf_0 =  rbuf_i[0];
597     ct1    =  2*rbuf_0+1;
598     ct2    =  ct1;
599     ct3    += ct1;
600     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
601       ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
602       oct2 = ct2;
603       kmax = rbuf_i[2*j];
604       for (k=0; k<kmax; k++,ct1++) {
605         row = rbuf_i[ct1];
606         if (!PetscBTLookupSet(xtable,row)) {
607           if (!(ct3 < mem_estimate)) {
608             new_estimate = (int)(1.5*mem_estimate)+1;
609             tmp          = (int*)PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
610             ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
611             ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
612             xdata[0]     = tmp;
613             mem_estimate = new_estimate; ++no_malloc;
614             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
615           }
616           xdata[i][ct2++] = row;
617           ct3++;
618         }
619       }
620       for (k=oct2,max2=ct2; k<max2; k++)  {
621         row   = xdata[i][k] - rstart;
622         start = ai[row];
623         end   = ai[row+1];
624         for (l=start; l<end; l++) {
625           val = aj[l] + cstart;
626           if (!PetscBTLookupSet(xtable,val)) {
627             if (!(ct3 < mem_estimate)) {
628               new_estimate = (int)(1.5*mem_estimate)+1;
629               tmp          = (int*)PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
630               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
631               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
632               xdata[0]     = tmp;
633               mem_estimate = new_estimate; ++no_malloc;
634               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
635             }
636             xdata[i][ct2++] = val;
637             ct3++;
638           }
639         }
640         start = bi[row];
641         end   = bi[row+1];
642         for (l=start; l<end; l++) {
643           val = garray[bj[l]];
644           if (!PetscBTLookupSet(xtable,val)) {
645             if (!(ct3 < mem_estimate)) {
646               new_estimate = (int)(1.5*mem_estimate)+1;
647               tmp          = (int*)PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp);
648               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
649               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
650               xdata[0]     = tmp;
651               mem_estimate = new_estimate; ++no_malloc;
652               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
653             }
654             xdata[i][ct2++] = val;
655             ct3++;
656           }
657         }
658       }
659       /* Update the header*/
660       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
661       xdata[i][2*j-1] = rbuf_i[2*j-1];
662     }
663     xdata[i][0] = rbuf_0;
664     xdata[i+1]  = xdata[i] + ct2;
665     isz1[i]     = ct2; /* size of each message */
666   }
667   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
668   PLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
669   PetscFunctionReturn(0);
670 }
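/*
   Reply format (mirrors the request): xdata[i][0] repeats the number of index
   sets, each (is_no, count) header pair is copied back with count replaced by
   the number of block columns found, and the column data follows; isz1[i] is
   the total length of the reply to requester i.
*/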
671 
672 static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *);
673 
674 #undef __FUNC__
675 #define __FUNC__ /*<a name=""></a>*/"MatGetSubMatrices_MPIBAIJ"
676 int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
677 {
678   IS          *isrow_new,*iscol_new;
679   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
680   int         nmax,nstages_local,nstages,i,pos,max_no,ierr;
681 
682   PetscFunctionBegin;
683   /* The compression and expansion should be avoided. It doesn't point
684      out errors and might change the indices, hence buggy */
685 
686   isrow_new = (IS *)PetscMalloc(2*(ismax+1)*sizeof(IS));CHKPTRQ(isrow_new);
687   iscol_new = isrow_new + ismax;
688   ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new);CHKERRQ(ierr);
689   ierr = MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new);CHKERRQ(ierr);
690 
691   /* Allocate memory to hold all the submatrices */
692   if (scall != MAT_REUSE_MATRIX) {
693     *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat);
694   }
695   /* Determine the number of stages through which submatrices are done */
696   nmax          = 20*1000000 / (c->Nbs * sizeof(int));
697   if (!nmax) nmax = 1;
698   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
699 
700   /* Make sure every processor loops through the nstages */
701   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
702 
703   for (i=0,pos=0; i<nstages; i++) {
704     if (pos+nmax <= ismax) max_no = nmax;
705     else if (pos == ismax) max_no = 0;
706     else                   max_no = ismax-pos;
707     ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
708     pos += max_no;
709   }
710 
711   for (i=0; i<ismax; i++) {
712     ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
713     ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
714   }
715   ierr = PetscFree(isrow_new);CHKERRQ(ierr);
716   PetscFunctionReturn(0);
717 }
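/*
   Staging example (illustrative numbers): with c->Nbs = 1,000,000 block
   columns and 4-byte ints, nmax = 20*1000000/(1000000*4) = 5, so a request
   for 12 submatrices is processed in 3 stages of 5, 5 and 2; the Allreduce
   above guarantees that a process with fewer local stages still participates
   in the collective calls of every stage.
*/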
718 
719 /* -------------------------------------------------------------------------*/
720 #undef __FUNC__
721 #define __FUNC__ /*<a name=""></a>*/"MatGetSubMatrices_MPIBAIJ_local"
722 static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
723 {
724   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
725   Mat         A = c->A;
726   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
727   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
728   int         **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
729   int         nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
730   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
731   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
732   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
733   int         *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
734   int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
735   int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
736   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
737   MPI_Request *r_waits4,*s_waits3,*s_waits4;
738   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
739   MPI_Status  *r_status3,*r_status4,*s_status4;
740   MPI_Comm    comm;
741   MatScalar   **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
742   MatScalar   *a_a=a->a,*b_a=b->a;
743   PetscTruth  flag;
744 
745   PetscFunctionBegin;
746   comm   = C->comm;
747   tag0   = C->tag;
748   size   = c->size;
749   rank   = c->rank;
750   Mbs    = c->Mbs;
751 
752   /* Get some new tags to keep the communication clean */
753   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
754   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
755   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
756 
757   /* Check if the col indices are sorted */
758   for (i=0; i<ismax; i++) {
759     ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr);
760     if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
761   }
762 
763   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (Mbs+1)*sizeof(int);
764   irow   = (int **)PetscMalloc(len);CHKPTRQ(irow);
765   icol   = irow + ismax;
766   nrow   = (int*)(icol + ismax);
767   ncol   = nrow + ismax;
768   rtable = ncol + ismax;
769 
770   for (i=0; i<ismax; i++) {
771     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
772     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
773     ierr = ISGetSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
774     ierr = ISGetSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
775   }
776 
777   /* Create hash table for the mapping :row -> proc*/
778   for (i=0,j=0; i<size; i++) {
779     jmax = c->rowners[i+1];
780     for (; j<jmax; j++) {
781       rtable[j] = i;
782     }
783   }
784 
785   /* evaluate communication - mesg to who, length of mesg, and buffer space
786      required. Based on this, buffers are allocated, and data copied into them */
787   w1     = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */
788   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
789   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
790   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
791   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
792   for (i=0; i<ismax; i++) {
793     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
794     jmax   = nrow[i];
795     irow_i = irow[i];
796     for (j=0; j<jmax; j++) {
797       row  = irow_i[j];
798       proc = rtable[row];
799       w4[proc]++;
800     }
801     for (j=0; j<size; j++) {
802       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
803     }
804   }
805 
806   nrqs     = 0;              /* no of outgoing messages */
807   msz      = 0;              /* total mesg length for all procs */
808   w1[rank] = 0;              /* no mesg sent to itself */
809   w3[rank] = 0;
810   for (i=0; i<size; i++) {
811     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
812   }
813   pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/
814   for (i=0,j=0; i<size; i++) {
815     if (w1[i]) { pa[j] = i; j++; }
816   }
817 
818   /* Each message would have a header = 1 + 2*(no of IS) + data */
819   for (i=0; i<nrqs; i++) {
820     j     = pa[i];
821     w1[j] += w2[j] + 2* w3[j];
822     msz   += w1[j];
823   }
824   /* Do a global reduction to determine how many messages to expect*/
825   {
826     int *rw1;
827     rw1  = (int *)PetscMalloc(2*size*sizeof(int));CHKPTRQ(rw1);
828     ierr = MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
829     bsz  = rw1[rank];
830     nrqr = rw1[size+rank];
831     ierr = PetscFree(rw1);CHKERRQ(ierr);
832   }
833 
834   /* Allocate memory for recv buffers (probably none needed if nrqr = 0) */
835   len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
836   rbuf1    = (int**)PetscMalloc(len);CHKPTRQ(rbuf1);
837   rbuf1[0] = (int*)(rbuf1 + nrqr);
838   for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;
839 
840   /* Post the receives */
841   r_waits1 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
842   for (i=0; i<nrqr; ++i) {
843     ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr);
844   }
845 
846   /* Allocate Memory for outgoing messages */
847   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
848   sbuf1    = (int **)PetscMalloc(len);CHKPTRQ(sbuf1);
849   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
850   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
851   /* allocate memory for outgoing data + buf to receive the first reply */
852   tmp      = (int*)(ptr + size);
853   ctr      = tmp + 2*msz;
854 
855   {
856     int *iptr = tmp,ict = 0;
857     for (i=0; i<nrqs; i++) {
858       j         = pa[i];
859       iptr     += ict;
860       sbuf1[j]  = iptr;
861       ict       = w1[j];
862     }
863   }
864 
865   /* Form the outgoing messages */
866   /* Initialise the header space */
867   for (i=0; i<nrqs; i++) {
868     j           = pa[i];
869     sbuf1[j][0] = 0;
870     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
871     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
872   }
873 
874   /* Parse the isrow and copy data into outbuf */
875   for (i=0; i<ismax; i++) {
876     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
877     irow_i = irow[i];
878     jmax   = nrow[i];
879     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
880       row  = irow_i[j];
881       proc = rtable[row];
882       if (proc != rank) { /* copy to the outgoing buf*/
883         ctr[proc]++;
884         *ptr[proc] = row;
885         ptr[proc]++;
886       }
887     }
888     /* Update the headers for the current IS */
889     for (j=0; j<size; j++) { /* Can Optimise this loop too */
890       if ((ctr_j = ctr[j])) {
891         sbuf1_j        = sbuf1[j];
892         k              = ++sbuf1_j[0];
893         sbuf1_j[2*k]   = ctr_j;
894         sbuf1_j[2*k-1] = i;
895       }
896     }
897   }
898 
899   /*  Now  post the sends */
900   s_waits1 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
901   for (i=0; i<nrqs; ++i) {
902     j = pa[i];
903     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
904   }
905 
906   /* Post Receives to capture the buffer size */
907   r_waits2 = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
908   rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2);
909   rbuf2[0] = tmp + msz;
910   for (i=1; i<nrqs; ++i) {
911     j        = pa[i];
912     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
913   }
914   for (i=0; i<nrqs; ++i) {
915     j    = pa[i];
916     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
917   }
918 
919   /* Send to other procs the buf size they should allocate */
920 
921 
922   /* Receive messages*/
923   s_waits2   = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
924   r_status1  = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1);
925   len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
926   sbuf2      = (int**)PetscMalloc(len);CHKPTRQ(sbuf2);
927   req_size   = (int*)(sbuf2 + nrqr);
928   req_source = req_size + nrqr;
929 
930   {
931     Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
932     int        *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
933 
934     for (i=0; i<nrqr; ++i) {
935       ierr = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);CHKERRQ(ierr);
936       req_size[index] = 0;
937       rbuf1_i         = rbuf1[index];
938       start           = 2*rbuf1_i[0] + 1;
939       ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
940       sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]);
941       sbuf2_i      = sbuf2[index];
942       for (j=start; j<end; j++) {
943         id               = rbuf1_i[j] - rstart;
944         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
945         sbuf2_i[j]       = ncols;
946         req_size[index] += ncols;
947       }
948       req_source[index] = r_status1[i].MPI_SOURCE;
949       /* form the header */
950       sbuf2_i[0]   = req_size[index];
951       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
952       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);CHKERRQ(ierr);
953     }
954   }
955   ierr = PetscFree(r_status1);CHKERRQ(ierr);
956   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
957 
958   /*  recv buffer sizes */
959   /* Receive messages*/
960 
961   rbuf3     = (int**)PetscMalloc((nrqs+1)*sizeof(int*));CHKPTRQ(rbuf3);
962   rbuf4     = (MatScalar**)PetscMalloc((nrqs+1)*sizeof(MatScalar*));CHKPTRQ(rbuf4);
963   r_waits3  = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3);
964   r_waits4  = (MPI_Request*)PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4);
965   r_status2 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2);
966 
967   for (i=0; i<nrqs; ++i) {
968     ierr = MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);CHKERRQ(ierr);
969     rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));CHKPTRQ(rbuf3[index]);
970     rbuf4[index] = (MatScalar *)PetscMalloc(rbuf2[index][0]*bs2*sizeof(MatScalar));CHKPTRQ(rbuf4[index]);
971     ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,
972                      r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);CHKERRQ(ierr);
973     ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_MATSCALAR,
974                      r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);CHKERRQ(ierr);
975   }
976   ierr = PetscFree(r_status2);CHKERRQ(ierr);
977   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
978 
979   /* Wait on sends1 and sends2 */
980   s_status1 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1);
981   s_status2 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2);
982 
983   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
984   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
985   ierr = PetscFree(s_status1);CHKERRQ(ierr);
986   ierr = PetscFree(s_status2);CHKERRQ(ierr);
987   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
988   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
989 
990   /* Now allocate buffers for a->j, and send them off */
991   sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj);
992   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
993   sbuf_aj[0] = (int*)PetscMalloc((j+1)*sizeof(int));CHKPTRQ(sbuf_aj[0]);
994   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
995 
996   s_waits3 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3);
997   {
998      for (i=0; i<nrqr; i++) {
999       rbuf1_i   = rbuf1[i];
1000       sbuf_aj_i = sbuf_aj[i];
1001       ct1       = 2*rbuf1_i[0] + 1;
1002       ct2       = 0;
1003       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1004         kmax = rbuf1[i][2*j];
1005         for (k=0; k<kmax; k++,ct1++) {
1006           row    = rbuf1_i[ct1] - rstart;
1007           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1008           ncols  = nzA + nzB;
1009           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1010 
1011           /* load the column indices for this row into cols*/
1012           cols  = sbuf_aj_i + ct2;
1013           for (l=0; l<nzB; l++) {
1014             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1015             else break;
1016           }
1017           imark = l;
1018           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1019           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1020           ct2 += ncols;
1021         }
1022       }
1023       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1024     }
1025   }
1026   r_status3 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3);
1027   s_status3 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3);
1028 
1029   /* Allocate buffers for a->a, and send them off */
1030   sbuf_aa = (MatScalar **)PetscMalloc((nrqr+1)*sizeof(MatScalar *));CHKPTRQ(sbuf_aa);
1031   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1032   sbuf_aa[0] = (MatScalar*)PetscMalloc((j+1)*bs2*sizeof(MatScalar));CHKPTRQ(sbuf_aa[0]);
1033   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1034 
1035   s_waits4 = (MPI_Request*)PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4);
1036   {
1037     for (i=0; i<nrqr; i++) {
1038       rbuf1_i   = rbuf1[i];
1039       sbuf_aa_i = sbuf_aa[i];
1040       ct1       = 2*rbuf1_i[0]+1;
1041       ct2       = 0;
1042       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1043         kmax = rbuf1_i[2*j];
1044         for (k=0; k<kmax; k++,ct1++) {
1045           row    = rbuf1_i[ct1] - rstart;
1046           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1047           ncols  = nzA + nzB;
1048           cworkA = a_j + a_i[row];     cworkB = b_j + b_i[row];
1049           vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
1050 
1051           /* load the column values for this row into vals*/
1052           vals  = sbuf_aa_i+ct2*bs2;
1053           for (l=0; l<nzB; l++) {
1054             if ((bmap[cworkB[l]]) < cstart) {
1055               ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1056             }
1057             else break;
1058           }
1059           imark = l;
1060           for (l=0; l<nzA; l++) {
1061             ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1062           }
1063           for (l=imark; l<nzB; l++) {
1064             ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1065           }
1066           ct2 += ncols;
1067         }
1068       }
1069       ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1070     }
1071   }
1072   r_status4 = (MPI_Status*)PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4);
1073   s_status4 = (MPI_Status*)PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4);
1074   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1075 
1076   /* Form the matrix */
1077   /* create col map */
1078   {
1079     int *icol_i;
1080 
1081     len     = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
1082     cmap    = (int **)PetscMalloc(len);CHKPTRQ(cmap);
1083     cmap[0] = (int *)(cmap + ismax);
1084     ierr    = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr);
1085     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1086     for (i=0; i<ismax; i++) {
1087       jmax   = ncol[i];
1088       icol_i = icol[i];
1089       cmap_i = cmap[i];
1090       for (j=0; j<jmax; j++) {
1091         cmap_i[icol_i[j]] = j+1;
1092       }
1093     }
1094   }
1095 
1096 
1097   /* Create lens which is required for MatCreate... */
1098   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1099   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1100   lens    = (int **)PetscMalloc(len);CHKPTRQ(lens);
1101   lens[0] = (int *)(lens + ismax);
1102   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1103   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1104 
1105   /* Update lens from local data */
1106   for (i=0; i<ismax; i++) {
1107     jmax   = nrow[i];
1108     cmap_i = cmap[i];
1109     irow_i = irow[i];
1110     lens_i = lens[i];
1111     for (j=0; j<jmax; j++) {
1112       row  = irow_i[j];
1113       proc = rtable[row];
1114       if (proc == rank) {
1115         /* Get indices from matA and then from matB */
1116         row    = row - rstart;
1117         nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1118         cworkA =  a_j + a_i[row]; cworkB = b_j + b_i[row];
1119         for (k=0; k<nzA; k++) {
1120           if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;}
1121         }
1122         for (k=0; k<nzB; k++) {
1123           if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;}
1124         }
1125       }
1126     }
1127   }
1128 
1129   /* Create row map*/
1130   len     = (1+ismax)*sizeof(int*)+ ismax*c->Mbs*sizeof(int);
1131   rmap    = (int **)PetscMalloc(len);CHKPTRQ(rmap);
1132   rmap[0] = (int *)(rmap + ismax);
1133   ierr    = PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));CHKERRQ(ierr);
1134   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;}
1135   for (i=0; i<ismax; i++) {
1136     rmap_i = rmap[i];
1137     irow_i = irow[i];
1138     jmax   = nrow[i];
1139     for (j=0; j<jmax; j++) {
1140       rmap_i[irow_i[j]] = j;
1141     }
1142   }
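  /* cmap[i][c] is 1 + the local column of global block column c in submatrix i
     (0 means the column is not requested); rmap[i][r] is the local row of
     global block row r.  Both are used below to translate received global
     indices into positions in the new sequential matrices. */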
1143 
1144   /* Update lens from offproc data */
1145   {
1146     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1147 
1148     for (tmp2=0; tmp2<nrqs; tmp2++) {
1149       ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1150       index   = pa[i];
1151       sbuf1_i = sbuf1[index];
1152       jmax    = sbuf1_i[0];
1153       ct1     = 2*jmax+1;
1154       ct2     = 0;
1155       rbuf2_i = rbuf2[i];
1156       rbuf3_i = rbuf3[i];
1157       for (j=1; j<=jmax; j++) {
1158         is_no   = sbuf1_i[2*j-1];
1159         max1    = sbuf1_i[2*j];
1160         lens_i  = lens[is_no];
1161         cmap_i  = cmap[is_no];
1162         rmap_i  = rmap[is_no];
1163         for (k=0; k<max1; k++,ct1++) {
1164           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1165           max2 = rbuf2_i[ct1];
1166           for (l=0; l<max2; l++,ct2++) {
1167             if (cmap_i[rbuf3_i[ct2]]) {
1168               lens_i[row]++;
1169             }
1170           }
1171         }
1172       }
1173     }
1174   }
1175   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1176   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1177   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1178   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1179   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1180 
1181   /* Create the submatrices */
1182   if (scall == MAT_REUSE_MATRIX) {
1183     /*
1184         Assumes new rows are same length as the old rows, hence bug!
1185     */
1186     for (i=0; i<ismax; i++) {
1187       mat = (Mat_SeqBAIJ *)(submats[i]->data);
1188       if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
1189         SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size");
1190       }
1191       ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);CHKERRQ(ierr);
1192       if (flag == PETSC_FALSE) {
1193         SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Cannot reuse matrix. wrong no of nonzeros");
1194       }
1195       /* Initialize the matrix as if empty */
1196       ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr);
1197       submats[i]->factor = C->factor;
1198     }
1199   } else {
1200     for (i=0; i<ismax; i++) {
1201       ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);CHKERRQ(ierr);
1202     }
1203   }
1204 
1205   /* Assemble the matrices */
1206   /* First assemble the local rows */
1207   {
1208     int       ilen_row,*imat_ilen,*imat_j,*imat_i;
1209     MatScalar *imat_a;
1210 
1211     for (i=0; i<ismax; i++) {
1212       mat       = (Mat_SeqBAIJ*)submats[i]->data;
1213       imat_ilen = mat->ilen;
1214       imat_j    = mat->j;
1215       imat_i    = mat->i;
1216       imat_a    = mat->a;
1217       cmap_i    = cmap[i];
1218       rmap_i    = rmap[i];
1219       irow_i    = irow[i];
1220       jmax      = nrow[i];
1221       for (j=0; j<jmax; j++) {
1222         row      = irow_i[j];
1223         proc     = rtable[row];
1224         if (proc == rank) {
1225           row      = row - rstart;
1226           nzA      = a_i[row+1] - a_i[row];
1227           nzB      = b_i[row+1] - b_i[row];
1228           cworkA   = a_j + a_i[row];
1229           cworkB   = b_j + b_i[row];
1230           vworkA   = a_a + a_i[row]*bs2;
1231           vworkB   = b_a + b_i[row]*bs2;
1232 
1233           row      = rmap_i[row + rstart];
1234           mat_i    = imat_i[row];
1235           mat_a    = imat_a + mat_i*bs2;
1236           mat_j    = imat_j + mat_i;
1237           ilen_row = imat_ilen[row];
1238 
1239           /* load the column indices for this row into cols*/
1240           for (l=0; l<nzB; l++) {
1241             if ((ctmp = bmap[cworkB[l]]) < cstart) {
1242               if ((tcol = cmap_i[ctmp])) {
1243                 *mat_j++ = tcol - 1;
1244                 ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1245                 mat_a   += bs2;
1246                 ilen_row++;
1247               }
1248             } else break;
1249           }
1250           imark = l;
1251           for (l=0; l<nzA; l++) {
1252             if ((tcol = cmap_i[cstart + cworkA[l]])) {
1253               *mat_j++ = tcol - 1;
1254               ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1255               mat_a   += bs2;
1256               ilen_row++;
1257             }
1258           }
1259           for (l=imark; l<nzB; l++) {
1260             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1261               *mat_j++ = tcol - 1;
1262               ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1263               mat_a   += bs2;
1264               ilen_row++;
1265             }
1266           }
1267           imat_ilen[row] = ilen_row;
1268         }
1269       }
1270 
1271     }
1272   }
1273 
1274   /*   Now assemble the off proc rows*/
1275   {
1276     int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1277     int       *imat_j,*imat_i;
1278     MatScalar *imat_a,*rbuf4_i;
1279 
1280     for (tmp2=0; tmp2<nrqs; tmp2++) {
1281       ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1282       index   = pa[i];
1283       sbuf1_i = sbuf1[index];
1284       jmax    = sbuf1_i[0];
1285       ct1     = 2*jmax + 1;
1286       ct2     = 0;
1287       rbuf2_i = rbuf2[i];
1288       rbuf3_i = rbuf3[i];
1289       rbuf4_i = rbuf4[i];
1290       for (j=1; j<=jmax; j++) {
1291         is_no     = sbuf1_i[2*j-1];
1292         rmap_i    = rmap[is_no];
1293         cmap_i    = cmap[is_no];
1294         mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
1295         imat_ilen = mat->ilen;
1296         imat_j    = mat->j;
1297         imat_i    = mat->i;
1298         imat_a    = mat->a;
1299         max1      = sbuf1_i[2*j];
1300         for (k=0; k<max1; k++,ct1++) {
1301           row   = sbuf1_i[ct1];
1302           row   = rmap_i[row];
1303           ilen  = imat_ilen[row];
1304           mat_i = imat_i[row];
1305           mat_a = imat_a + mat_i*bs2;
1306           mat_j = imat_j + mat_i;
1307           max2 = rbuf2_i[ct1];
1308           for (l=0; l<max2; l++,ct2++) {
1309             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1310               *mat_j++    = tcol - 1;
1311               /* *mat_a++= rbuf4_i[ct2]; */
1312               ierr        = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1313               mat_a      += bs2;
1314               ilen++;
1315             }
1316           }
1317           imat_ilen[row] = ilen;
1318         }
1319       }
1320     }
1321   }
1322   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1323   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1324   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1325   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1326   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1327 
1328   /* Restore the indices */
1329   for (i=0; i<ismax; i++) {
1330     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1331     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1332   }
1333 
1334   /* Destroy allocated memory */
1335   ierr = PetscFree(irow);CHKERRQ(ierr);
1336   ierr = PetscFree(w1);CHKERRQ(ierr);
1337   ierr = PetscFree(pa);CHKERRQ(ierr);
1338 
1339   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1340   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1341   for (i=0; i<nrqr; ++i) {
1342     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1343   }
1344   for (i=0; i<nrqs; ++i) {
1345     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1346     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1347   }
1348 
1349   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1350   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1351   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1352   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1353   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1354   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1355   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1356 
1357   ierr = PetscFree(cmap);CHKERRQ(ierr);
1358   ierr = PetscFree(rmap);CHKERRQ(ierr);
1359   ierr = PetscFree(lens);CHKERRQ(ierr);
1360 
1361   for (i=0; i<ismax; i++) {
1362     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1363     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1364   }
1365 
1366   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1367   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1368   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1369 
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 
1374