xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 1d79065fde699397c4a1eb4514b06e24fdc073d8)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that are shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
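/*
   Illustrative usage of the public interface (a sketch with hypothetical
   sizes and index data, not taken from this file):

      IS is[1];
      ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,first,1,&is[0]);CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);
      ierr = ISDestroy(is[0]);CHKERRQ(ierr);

   Here the overlap of 2 means two passes of MatIncreaseOverlap_MPIAIJ_Once();
   MatIncreaseOverlap() dispatches to MatIncreaseOverlap_MPIAIJ() above for
   MPIAIJ matrices, and each pass widens every index set by the rows coupled
   to it through the nonzero structure of the matrix.
*/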
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (number of index sets in the message)
37   -----------
38   mesg [1] = 1  => is[1]
39   mesg [2] = size of is[1]
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = size of is[5]
43   -----------
44   mesg [5]
45    ...  mesg [n]   = data for is[1]
46   -----------
47   mesg [n+1]
48    ...  mesg [m]   = data for is[5]
49   -----------
50 
51   Notes:
52   nrqs - number of requests sent (or to be sent out)
53   nrqr - number of requests received (which have to be, or have been, processed)
54 */
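/*
   A minimal worked example (illustrative only): if this process must ask
   processor B for 3 rows of is[1] and 2 rows of is[5], the outgoing buffer
   sent to B is

      [ 2, 1, 3, 5, 2, r0, r1, r2, s0, s1 ]

   i.e. the count of index sets, then an (index-set number, row count) pair
   per index set, then the requested global row numbers.  Its length is
   therefore 1 + 2*(number of index sets) + total number of rows, which is
   exactly what the w1[]/w3[] bookkeeping below computes.
*/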
55 #undef __FUNCT__
56 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
57 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58 {
59   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
60   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61   const PetscInt **idx,*idx_i;
62   PetscInt       *n,*rtable,**data,len;
63   PetscErrorCode ierr;
64   PetscMPIInt    size,rank,tag1,tag2;
65   PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67   PetscBT        *table;
68   MPI_Comm       comm;
69   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70   MPI_Status     *s_status,*recv_status;
71 
72   PetscFunctionBegin;
73   comm   = ((PetscObject)C)->comm;
74   size   = c->size;
75   rank   = c->rank;
76   m      = C->rmap->N;
77 
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
79   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
80 
81   ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);
82 
83   for (i=0; i<imax; i++) {
84     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
85     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
86   }
87 
88   /* Create a lookup table for the mapping: row -> owning process */
89   for (i=0,j=0; i<size; i++) {
90     len = C->rmap->range[i+1];
91     for (; j<len; j++) {
92       rtable[j] = i;
93     }
94   }
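  /* For example (illustration only): with two processes owning rows [0,5) and
     [5,9), rtable = {0,0,0,0,0,1,1,1,1}, so rtable[row] is the owner of row. */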
95 
96   /* Evaluate communication - message to whom, length of message, and buffer space
97      required. Based on this, buffers are allocated, and data copied into them */
98   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
99   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
100   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
101   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
102   for (i=0; i<imax; i++) {
103     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
104     idx_i = idx[i];
105     len   = n[i];
106     for (j=0; j<len; j++) {
107       row  = idx_i[j];
108       if (row < 0) {
109         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
110       }
111       proc = rtable[row];
112       w4[proc]++;
113     }
114     for (j=0; j<size; j++){
115       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
116     }
117   }
118 
119   nrqs     = 0;              /* no of outgoing messages */
120   msz      = 0;              /* total mesg length (for all procs) */
121   w1[rank] = 0;              /* no mesg sent to self */
122   w3[rank] = 0;
123   for (i=0; i<size; i++) {
124     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
125   }
126   /* pa - the list of processors to communicate with */
127   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
128   for (i=0,j=0; i<size; i++) {
129     if (w1[i]) {pa[j] = i; j++;}
130   }
131 
132   /* Each message would have a header = 1 + 2*(no of IS) + data */
133   for (i=0; i<nrqs; i++) {
134     j      = pa[i];
135     w1[j] += w2[j] + 2*w3[j];
136     msz   += w1[j];
137   }
138 
139   /* Determine the number of messages to expect, their lengths, and the from-ids */
140   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
141   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
142 
143   /* Now post the Irecvs corresponding to these messages */
144   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
145 
146   /* Allocate Memory for outgoing messages */
147   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
148   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
149   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
150 
151   {
152     PetscInt *iptr = tmp,ict  = 0;
153     for (i=0; i<nrqs; i++) {
154       j         = pa[i];
155       iptr     +=  ict;
156       outdat[j] = iptr;
157       ict       = w1[j];
158     }
159   }
160 
161   /* Form the outgoing messages */
162   /*plug in the headers*/
163   for (i=0; i<nrqs; i++) {
164     j            = pa[i];
165     outdat[j][0] = 0;
166     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
167     ptr[j]       = outdat[j] + 2*w3[j] + 1;
168   }
169 
170   /* Memory for doing local proc's work*/
171   {
172     PetscInt  *d_p;
173     char      *t_p;
174 
175     ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
176       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
177     ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
178       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
179     data  = (PetscInt **)(table + imax);
180     isz   = (PetscInt  *)(data  + imax);
181     d_p   = (PetscInt  *)(isz   + imax);
182     t_p   = (char *)(d_p   + m*imax);
183     for (i=0; i<imax; i++) {
184       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
185       data[i]  = d_p + (m)*i;
186     }
187   }
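  /*
     Layout of the single allocation above (a sketch for orientation):
        table[0..imax)  - one PetscBT bit table per index set, each of m bits
        data[0..imax)   - one PetscInt buffer of length m per index set
        isz[0..imax)    - current fill count of data[i]
     Packing all of these into one PetscMalloc means the single PetscFree(table)
     at the end of the routine releases everything.
  */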
188 
189   /* Parse the ISs and update the local tables and the outgoing buffers with the data */
190   {
191     PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
192     PetscBT table_i;
193 
194     for (i=0; i<imax; i++) {
195       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
196       n_i     = n[i];
197       table_i = table[i];
198       idx_i   = idx[i];
199       data_i  = data[i];
200       isz_i   = isz[i];
201       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
202         row  = idx_i[j];
203         proc = rtable[row];
204         if (proc != rank) { /* copy to the outgoing buffer */
205           ctr[proc]++;
206           *ptr[proc] = row;
207           ptr[proc]++;
208         } else { /* Update the local table */
209           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
210         }
211       }
212       /* Update the headers for the current IS */
213       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
214         if ((ctr_j = ctr[j])) {
215           outdat_j        = outdat[j];
216           k               = ++outdat_j[0];
217           outdat_j[2*k]   = ctr_j;
218           outdat_j[2*k-1] = i;
219         }
220       }
221       isz[i] = isz_i;
222     }
223   }
224 
225   /*  Now  post the sends */
226   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
227   for (i=0; i<nrqs; ++i) {
228     j    = pa[i];
229     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
230   }
231 
232   /* No longer need the original indices*/
233   for (i=0; i<imax; ++i) {
234     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
235   }
236   ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);
237 
238   for (i=0; i<imax; ++i) {
239     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
240   }
241 
242   /* Do Local work*/
243   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
244 
245   /* Receive messages*/
246   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
247   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
248 
249   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
250   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
251 
252   /* Phase 1 sends are complete - deallocate buffers */
253   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
254   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
255 
256   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
257   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
258   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
259   ierr = PetscFree(rbuf);CHKERRQ(ierr);
260 
261 
262   /* Send the data back */
263   /* Determine the buffer space required for the incoming (reply) messages */
264   {
265     PetscMPIInt *rw1;
266 
267     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
268     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
269 
270     for (i=0; i<nrqr; ++i) {
271       proc      = recv_status[i].MPI_SOURCE;
272       if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
273       rw1[proc] = isz1[i];
274     }
275     ierr = PetscFree(onodes1);CHKERRQ(ierr);
276     ierr = PetscFree(olengths1);CHKERRQ(ierr);
277 
278     /* Determine the number of messages to expect, their lengths, and the from-ids */
279     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
280     ierr = PetscFree(rw1);CHKERRQ(ierr);
281   }
282   /* Now post the Irecvs corresponding to these messages */
283   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
284 
285   /*  Now  post the sends */
286   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
287   for (i=0; i<nrqr; ++i) {
288     j    = recv_status[i].MPI_SOURCE;
289     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
290   }
291 
292   /* receive work done on other processors*/
293   {
294     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
295     PetscMPIInt idex;
296     PetscBT     table_i;
297     MPI_Status  *status2;
298 
299     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
300     for (i=0; i<nrqs; ++i) {
301       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
302       /* Process the message*/
303       rbuf2_i = rbuf2[idex];
304       ct1     = 2*rbuf2_i[0]+1;
305       jmax    = rbuf2[idex][0];
306       for (j=1; j<=jmax; j++) {
307         max     = rbuf2_i[2*j];
308         is_no   = rbuf2_i[2*j-1];
309         isz_i   = isz[is_no];
310         data_i  = data[is_no];
311         table_i = table[is_no];
312         for (k=0; k<max; k++,ct1++) {
313           row = rbuf2_i[ct1];
314           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
315         }
316         isz[is_no] = isz_i;
317       }
318     }
319 
320     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
321     ierr = PetscFree(status2);CHKERRQ(ierr);
322   }
323 
324   for (i=0; i<imax; ++i) {
325     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
326   }
327 
328   ierr = PetscFree(onodes2);CHKERRQ(ierr);
329   ierr = PetscFree(olengths2);CHKERRQ(ierr);
330 
331   ierr = PetscFree(pa);CHKERRQ(ierr);
332   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
333   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
334   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
335   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
336   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
337   ierr = PetscFree(table);CHKERRQ(ierr);
338   ierr = PetscFree(s_status);CHKERRQ(ierr);
339   ierr = PetscFree(recv_status);CHKERRQ(ierr);
340   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
341   ierr = PetscFree(xdata);CHKERRQ(ierr);
342   ierr = PetscFree(isz1);CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
348 /*
349    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
350        the work on the local processor.
351 
352      Inputs:
353       C      - MAT_MPIAIJ;
354       imax   - total no of index sets processed at a time;
355       table  - an array of char - size = m bits.
356 
357      Output:
358       isz    - array containing the count of the solution elements corresponding
359                to each index set;
360       data   - pointer to the solutions
361 */
362 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
363 {
364   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
365   Mat        A = c->A,B = c->B;
366   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
367   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
368   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
369   PetscBT    table_i;
370 
371   PetscFunctionBegin;
372   rstart = C->rmap->rstart;
373   cstart = C->cmap->rstart;
374   ai     = a->i;
375   aj     = a->j;
376   bi     = b->i;
377   bj     = b->j;
378   garray = c->garray;
379 
380 
381   for (i=0; i<imax; i++) {
382     data_i  = data[i];
383     table_i = table[i];
384     isz_i   = isz[i];
385     for (j=0,max=isz[i]; j<max; j++) {
386       row   = data_i[j] - rstart;
387       start = ai[row];
388       end   = ai[row+1];
389       for (k=start; k<end; k++) { /* Amat */
390         val = aj[k] + cstart;
391         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
392       }
393       start = bi[row];
394       end   = bi[row+1];
395       for (k=start; k<end; k++) { /* Bmat */
396         val = garray[bj[k]];
397         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
398       }
399     }
400     isz[i] = isz_i;
401   }
402   PetscFunctionReturn(0);
403 }
404 
405 #undef __FUNCT__
406 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
407 /*
408       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
409          and return the output
410 
411          Input:
412            C    - the matrix
413            nrqr - no of messages being processed.
414            rbuf - an array of pointers to the received requests
415 
416          Output:
417            xdata - array of messages to be sent back
418            isz1  - size of each message
419 
420   For better efficiency perhaps we should malloc each xdata[i] separately;
421 then, if a realloc is required, we need only copy the data for that one
422 message rather than for all previous messages, as happens now where a single
423 large chunk of memory is used.
424 
425 */
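/*
   The reply stored in xdata[i] mirrors the request layout (sketch):
      xdata[i][0]       - number of index sets in the reply
      xdata[i][2*j-1]   - index-set number (copied from the request header)
      xdata[i][2*j]     - number of indices returned for that index set
      remaining entries - the global indices gathered for each index set
                          (the requested rows plus the columns they couple to)
   isz1[i] is the total length of the reply for message i.
*/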
426 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
427 {
428   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
429   Mat            A = c->A,B = c->B;
430   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
431   PetscErrorCode ierr;
432   PetscMPIInt    rank;
433   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
434   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
435   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
436   PetscInt       *rbuf_i,kmax,rbuf_0;
437   PetscBT        xtable;
438 
439   PetscFunctionBegin;
440   rank   = c->rank;
441   m      = C->rmap->N;
442   rstart = C->rmap->rstart;
443   cstart = C->cmap->rstart;
444   ai     = a->i;
445   aj     = a->j;
446   bi     = b->i;
447   bj     = b->j;
448   garray = c->garray;
449 
450 
451   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
452     rbuf_i  =  rbuf[i];
453     rbuf_0  =  rbuf_i[0];
454     ct     += rbuf_0;
455     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
456   }
457 
458   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
459   else            max1 = 1;
460   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
461   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
462   ++no_malloc;
463   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
464   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
465 
466   ct3 = 0;
467   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
468     rbuf_i =  rbuf[i];
469     rbuf_0 =  rbuf_i[0];
470     ct1    =  2*rbuf_0+1;
471     ct2    =  ct1;
472     ct3    += ct1;
473     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
474       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
475       oct2 = ct2;
476       kmax = rbuf_i[2*j];
477       for (k=0; k<kmax; k++,ct1++) {
478         row = rbuf_i[ct1];
479         if (!PetscBTLookupSet(xtable,row)) {
480           if (!(ct3 < mem_estimate)) {
481             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
482             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
483             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
484             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
485             xdata[0]     = tmp;
486             mem_estimate = new_estimate; ++no_malloc;
487             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
488           }
489           xdata[i][ct2++] = row;
490           ct3++;
491         }
492       }
493       for (k=oct2,max2=ct2; k<max2; k++) {
494         row   = xdata[i][k] - rstart;
495         start = ai[row];
496         end   = ai[row+1];
497         for (l=start; l<end; l++) {
498           val = aj[l] + cstart;
499           if (!PetscBTLookupSet(xtable,val)) {
500             if (!(ct3 < mem_estimate)) {
501               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
502               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
503               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
504               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
505               xdata[0]     = tmp;
506               mem_estimate = new_estimate; ++no_malloc;
507               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
508             }
509             xdata[i][ct2++] = val;
510             ct3++;
511           }
512         }
513         start = bi[row];
514         end   = bi[row+1];
515         for (l=start; l<end; l++) {
516           val = garray[bj[l]];
517           if (!PetscBTLookupSet(xtable,val)) {
518             if (!(ct3 < mem_estimate)) {
519               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
520               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
521               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
522               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
523               xdata[0]     = tmp;
524               mem_estimate = new_estimate; ++no_malloc;
525               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
526             }
527             xdata[i][ct2++] = val;
528             ct3++;
529           }
530         }
531       }
532       /* Update the header*/
533       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
534       xdata[i][2*j-1] = rbuf_i[2*j-1];
535     }
536     xdata[i][0] = rbuf_0;
537     xdata[i+1]  = xdata[i] + ct2;
538     isz1[i]     = ct2; /* size of each message */
539   }
540   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
541   ierr = PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
542   PetscFunctionReturn(0);
543 }
544 /* -------------------------------------------------------------------------*/
545 EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
546 EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
547 /*
548     Every processor gets the entire matrix
549 */
550 #undef __FUNCT__
551 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
552 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
553 {
554   Mat            B;
555   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
556   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
557   PetscErrorCode ierr;
558   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
559   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
560   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
561   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
562 
563   PetscFunctionBegin;
564   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
565   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
566 
567   if (scall == MAT_INITIAL_MATRIX) {
568     /* ----------------------------------------------------------------
569          Tell every processor the number of nonzeros per row
570     */
571     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
572     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
573       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
574     }
575     sendcount = A->rmap->rend - A->rmap->rstart;
576     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
577     for (i=0; i<size; i++) {
578       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
579       displs[i]     = A->rmap->range[i];
580     }
581 #if defined(PETSC_HAVE_MPI_IN_PLACE)
582     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
583 #else
584     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
585 #endif
586     /* ---------------------------------------------------------------
587          Create the sequential matrix of the same type as the local block diagonal
588     */
589     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
590     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
591     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
592     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
593     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
594     **Bin = B;
595     b = (Mat_SeqAIJ *)B->data;
596 
597     /*--------------------------------------------------------------------
598        Copy my part of matrix column indices over
599     */
600     sendcount  = ad->nz + bd->nz;
601     jsendbuf   = b->j + b->i[rstarts[rank]];
602     a_jsendbuf = ad->j;
603     b_jsendbuf = bd->j;
604     n          = A->rmap->rend - A->rmap->rstart;
605     cnt        = 0;
606     for (i=0; i<n; i++) {
607 
608       /* put in lower diagonal portion */
609       m = bd->i[i+1] - bd->i[i];
610       while (m > 0) {
611         /* is it above diagonal (in bd (compressed) numbering) */
612         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
613         jsendbuf[cnt++] = garray[*b_jsendbuf++];
614         m--;
615       }
616 
617       /* put in diagonal portion */
618       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
619         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
620       }
621 
622       /* put in upper diagonal portion */
623       while (m-- > 0) {
624         jsendbuf[cnt++] = garray[*b_jsendbuf++];
625       }
626     }
627     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
628 
629     /*--------------------------------------------------------------------
630        Gather all column indices to all processors
631     */
632     for (i=0; i<size; i++) {
633       recvcounts[i] = 0;
634       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
635         recvcounts[i] += lens[j];
636       }
637     }
638     displs[0]  = 0;
639     for (i=1; i<size; i++) {
640       displs[i] = displs[i-1] + recvcounts[i-1];
641     }
642 #if defined(PETSC_HAVE_MPI_IN_PLACE)
643     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
644 #else
645     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
646 #endif
647     /*--------------------------------------------------------------------
648         Assemble the matrix into usable form (note numerical values not yet set)
649     */
650     /* set the b->ilen (length of each row) values */
651     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
652     /* set the b->i indices */
653     b->i[0] = 0;
654     for (i=1; i<=A->rmap->N; i++) {
655       b->i[i] = b->i[i-1] + lens[i-1];
656     }
657     ierr = PetscFree(lens);CHKERRQ(ierr);
658     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
659     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
660 
661   } else {
662     B  = **Bin;
663     b = (Mat_SeqAIJ *)B->data;
664   }
665 
666   /*--------------------------------------------------------------------
667        Copy my part of matrix numerical values into the values location
668   */
669   if (flag == MAT_GET_VALUES){
670     sendcount = ad->nz + bd->nz;
671     sendbuf   = b->a + b->i[rstarts[rank]];
672     a_sendbuf = ad->a;
673     b_sendbuf = bd->a;
674     b_sendj   = bd->j;
675     n         = A->rmap->rend - A->rmap->rstart;
676     cnt       = 0;
677     for (i=0; i<n; i++) {
678 
679       /* put in lower diagonal portion */
680       m = bd->i[i+1] - bd->i[i];
681       while (m > 0) {
682         /* is it above diagonal (in bd (compressed) numbering) */
683         if (garray[*b_sendj] > A->rmap->rstart + i) break;
684         sendbuf[cnt++] = *b_sendbuf++;
685         m--;
686         b_sendj++;
687       }
688 
689       /* put in diagonal portion */
690       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
691         sendbuf[cnt++] = *a_sendbuf++;
692       }
693 
694       /* put in upper diagonal portion */
695       while (m-- > 0) {
696         sendbuf[cnt++] = *b_sendbuf++;
697         b_sendj++;
698       }
699     }
700     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
701 
702     /* -----------------------------------------------------------------
703        Gather all numerical values to all processors
704     */
705     if (!recvcounts) {
706       ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
707     }
708     for (i=0; i<size; i++) {
709       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
710     }
711     displs[0]  = 0;
712     for (i=1; i<size; i++) {
713       displs[i] = displs[i-1] + recvcounts[i-1];
714     }
715     recvbuf   = b->a;
716 #if defined(PETSC_HAVE_MPI_IN_PLACE)
717     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
718 #else
719     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
720 #endif
721   }  /* end of (flag == MAT_GET_VALUES) */
722   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
723 
724   if (A->symmetric){
725     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
726   } else if (A->hermitian) {
727     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
728   } else if (A->structurally_symmetric) {
729     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
730   }
731   PetscFunctionReturn(0);
732 }
733 
734 #undef __FUNCT__
735 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
736 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
737 {
738   PetscErrorCode ierr;
739   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
740   PetscTruth     rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
741 
742   PetscFunctionBegin;
743   /*
744        Check for the special case where each processor gets the entire matrix
745   */
746   if (ismax == 1 && C->rmap->N == C->cmap->N) {
747     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
748     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
749     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
751     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
752       wantallmatrix = PETSC_TRUE;
753       ierr = PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
754     }
755   }
756   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
757   if (twantallmatrix) {
758     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
759     PetscFunctionReturn(0);
760   }
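  /*
     Note on the fast path above (descriptive only): it is taken when every
     process requests the identity index sets of a square matrix, in which case
     the whole matrix is simply gathered on each process by
     MatGetSubMatrix_MPIAIJ_All() instead of going through the staged
     request/reply protocol below.  The runtime option -use_fast_submatrix,
     read above, can override the default choice.
  */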
761 
762   /* Allocate memory to hold all the submatrices */
763   if (scall != MAT_REUSE_MATRIX) {
764     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
765   }
766   /* Determine the number of stages through which submatrices are done; this caps the scratch space used per stage at roughly 20 MB */
767   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
768   if (!nmax) nmax = 1;
769   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
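  /* For example (hypothetical numbers): with C->cmap->N = 1000000 and a 4-byte
     PetscInt, nmax = 5, so ismax = 12 local index sets would be processed in
     3 stages handling 5, 5 and 2 index sets respectively. */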
770 
771   /* Make sure every processor loops through the nstages */
772   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
773 
774   for (i=0,pos=0; i<nstages; i++) {
775     if (pos+nmax <= ismax) max_no = nmax;
776     else if (pos == ismax) max_no = 0;
777     else                   max_no = ismax-pos;
778     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
779     pos += max_no;
780   }
781   PetscFunctionReturn(0);
782 }
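/*
   Illustrative usage of the public interface (a sketch with hypothetical
   index data, not taken from this file):

      IS  rowis,colis;
      Mat *submat;
      ierr = ISCreateGeneral(PETSC_COMM_SELF,n,rows,&rowis);CHKERRQ(ierr);
      ierr = ISCreateGeneral(PETSC_COMM_SELF,n,rows,&colis);CHKERRQ(ierr);
      ierr = MatGetSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
         ... work with the sequential matrix submat[0] ...
      ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
      ierr = ISDestroy(rowis);CHKERRQ(ierr);
      ierr = ISDestroy(colis);CHKERRQ(ierr);

   MatGetSubMatrices() dispatches to MatGetSubMatrices_MPIAIJ() above for
   MPIAIJ matrices; each process obtains the sequential submatrices defined by
   its own (possibly overlapping) index sets.
*/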
783 /* -------------------------------------------------------------------------*/
784 #undef __FUNCT__
785 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
786 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
787 {
788   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
789   Mat            A = c->A;
790   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
791   const PetscInt **icol,**irow;
792   PetscInt       *nrow,*ncol,start;
793   PetscErrorCode ierr;
794   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
795   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
796   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
797   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
798   PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
799   const PetscInt *irow_i;
800   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
801   PetscInt       *rmap_i;
802   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
803   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
804   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
805   MPI_Status     *r_status3,*r_status4,*s_status4;
806   MPI_Comm       comm;
807   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
808   PetscTruth     sorted;
809   PetscMPIInt    *onodes1,*olengths1;
810   PetscMPIInt    idex,idex2,end;
811 
812   PetscFunctionBegin;
813   comm   = ((PetscObject)C)->comm;
814   tag0   = ((PetscObject)C)->tag;
815   size   = c->size;
816   rank   = c->rank;
817 
818   /* Get some new tags to keep the communication clean */
819   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
820   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
821   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
822 
823   /* Check whether the row and column index sets are sorted */
824   for (i=0; i<ismax; i++) {
825     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
826     /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
827     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
828     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
829   }
830   ierr = PetscMalloc4(ismax,PetscInt*,&irow,ismax,PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
831 
832   for (i=0; i<ismax; i++) {
833     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
834     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
835     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
836     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
837   }
838 
839   /* Evaluate communication - message to whom, length of message, and buffer space
840      required. Based on this, buffers are allocated, and data copied into them */
841   ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
842   ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
843   ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
844   ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
845   for (i=0; i<ismax; i++) {
846     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
847     jmax   = nrow[i];
848     irow_i = irow[i];
849     for (j=0; j<jmax; j++) {
850       l = 0;
851       row  = irow_i[j];
852       while (row >= C->rmap->range[l+1]) l++;
853       proc = l;
854       w4[proc]++;
855     }
856     for (j=0; j<size; j++) {
857       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
858     }
859   }
860 
861   nrqs     = 0;              /* no of outgoing messages */
862   msz      = 0;              /* total mesg length (for all procs) */
863   w1[rank] = 0;              /* no mesg sent to self */
864   w3[rank] = 0;
865   for (i=0; i<size; i++) {
866     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
867   }
868   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
869   for (i=0,j=0; i<size; i++) {
870     if (w1[i]) { pa[j] = i; j++; }
871   }
872 
873   /* Each message would have a header = 1 + 2*(no of IS) + data */
874   for (i=0; i<nrqs; i++) {
875     j     = pa[i];
876     w1[j] += w2[j] + 2* w3[j];
877     msz   += w1[j];
878   }
879 
880   /* Determine the number of messages to expect, their lengths, and the from-ids */
881   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
882   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
883 
884   /* Now post the Irecvs corresponding to these messages */
885   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
886 
887   ierr = PetscFree(onodes1);CHKERRQ(ierr);
888   ierr = PetscFree(olengths1);CHKERRQ(ierr);
889 
890   /* Allocate Memory for outgoing messages */
891   ierr     = PetscMalloc(2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt),&sbuf1);CHKERRQ(ierr);
892   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
893   ierr     = PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));CHKERRQ(ierr);
894   /* allocate memory for outgoing data + buf to receive the first reply */
895   tmp      = (PetscInt*)(ptr + size);
896   ctr      = tmp + 2*msz;
897 
898   {
899     PetscInt *iptr = tmp,ict = 0;
900     for (i=0; i<nrqs; i++) {
901       j         = pa[i];
902       iptr     += ict;
903       sbuf1[j]  = iptr;
904       ict       = w1[j];
905     }
906   }
907 
908   /* Form the outgoing messages */
909   /* Initialize the header space */
910   for (i=0; i<nrqs; i++) {
911     j           = pa[i];
912     sbuf1[j][0] = 0;
913     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
914     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
915   }
916 
917   /* Parse the isrow and copy data into outbuf */
918   for (i=0; i<ismax; i++) {
919     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
920     irow_i = irow[i];
921     jmax   = nrow[i];
922     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
923       l = 0;
924       row  = irow_i[j];
925       while (row >= C->rmap->range[l+1]) l++;
926       proc = l;
927       if (proc != rank) { /* copy to the outgoing buf*/
928         ctr[proc]++;
929         *ptr[proc] = row;
930         ptr[proc]++;
931       }
932     }
933     /* Update the headers for the current IS */
934     for (j=0; j<size; j++) { /* Can Optimise this loop too */
935       if ((ctr_j = ctr[j])) {
936         sbuf1_j        = sbuf1[j];
937         k              = ++sbuf1_j[0];
938         sbuf1_j[2*k]   = ctr_j;
939         sbuf1_j[2*k-1] = i;
940       }
941     }
942   }
943 
944   /*  Now  post the sends */
945   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
946   for (i=0; i<nrqs; ++i) {
947     j    = pa[i];
948     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
949   }
950 
951   /* Post Receives to capture the buffer size */
952   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
953   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
954   rbuf2[0] = tmp + msz;
955   for (i=1; i<nrqs; ++i) {
956     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
957   }
958   for (i=0; i<nrqs; ++i) {
959     j    = pa[i];
960     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
961   }
962 
963   /* Send to other procs the buf size they should allocate */
964 
965 
966   /* Receive messages*/
967   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
968   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
969   ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
970   {
971     Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
972     PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
973     PetscInt    *sbuf2_i;
974 
975     for (i=0; i<nrqr; ++i) {
976       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
977       req_size[idex] = 0;
978       rbuf1_i         = rbuf1[idex];
979       start           = 2*rbuf1_i[0] + 1;
980       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
981       ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
982       sbuf2_i         = sbuf2[idex];
983       for (j=start; j<end; j++) {
984         id               = rbuf1_i[j] - rstart;
985         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
986         sbuf2_i[j]       = ncols;
987         req_size[idex] += ncols;
988       }
989       req_source[idex] = r_status1[i].MPI_SOURCE;
990       /* form the header */
991       sbuf2_i[0]   = req_size[idex];
992       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
993       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
994     }
995   }
996   ierr = PetscFree(r_status1);CHKERRQ(ierr);
997   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
998 
999   /* Receive the reply buffer sizes, then post receives for the actual
1000      column indices and numerical values */
1001 
1002   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
1003   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1004   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1005   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1006   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1007 
1008   for (i=0; i<nrqs; ++i) {
1009     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1010     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
1011     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1012     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1013     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1014   }
1015   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1016   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1017 
1018   /* Wait on sends1 and sends2 */
1019   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1020   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1021 
1022   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1023   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1024   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1025   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1026   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1027   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1028 
1029   /* Now allocate buffers for a->j, and send them off */
1030   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
1031   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1032   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
1033   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1034 
1035   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1036   {
1037     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1038     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1039     PetscInt cend = C->cmap->rend;
1040     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1041 
1042     for (i=0; i<nrqr; i++) {
1043       rbuf1_i   = rbuf1[i];
1044       sbuf_aj_i = sbuf_aj[i];
1045       ct1       = 2*rbuf1_i[0] + 1;
1046       ct2       = 0;
1047       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1048         kmax = rbuf1[i][2*j];
1049         for (k=0; k<kmax; k++,ct1++) {
1050           row    = rbuf1_i[ct1] - rstart;
1051           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1052           ncols  = nzA + nzB;
1053           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1054 
1055           /* load the column indices for this row into cols*/
1056           cols  = sbuf_aj_i + ct2;
1057 
1058 	  lwrite = 0;
1059           for (l=0; l<nzB; l++) {
1060             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
1061           }
1062           for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
1063           for (l=0; l<nzB; l++) {
1064             if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
1065           }
1066 
1067           ct2 += ncols;
1068         }
1069       }
1070       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1071     }
1072   }
1073   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1074   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1075 
1076   /* Allocate buffers for a->a, and send them off */
1077   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1078   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1079   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1080   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1081 
1082   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1083   {
1084     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1085     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1086     PetscInt    cend = C->cmap->rend;
1087     PetscInt    *b_j = b->j;
1088     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1089 
1090     for (i=0; i<nrqr; i++) {
1091       rbuf1_i   = rbuf1[i];
1092       sbuf_aa_i = sbuf_aa[i];
1093       ct1       = 2*rbuf1_i[0]+1;
1094       ct2       = 0;
1095       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1096         kmax = rbuf1_i[2*j];
1097         for (k=0; k<kmax; k++,ct1++) {
1098           row    = rbuf1_i[ct1] - rstart;
1099           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1100           ncols  = nzA + nzB;
1101           cworkB = b_j + b_i[row];
1102           vworkA = a_a + a_i[row];
1103           vworkB = b_a + b_i[row];
1104 
1105           /* load the column values for this row into vals*/
1106           vals  = sbuf_aa_i+ct2;
1107 
1108 	  lwrite = 0;
1109           for (l=0; l<nzB; l++) {
1110             if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
1111           }
1112           for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
1113           for (l=0; l<nzB; l++) {
1114             if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
1115           }
1116 
1117           ct2 += ncols;
1118         }
1119       }
1120       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1121     }
1122   }
1123   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1124   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1125   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1126 
1127   /* Form the matrix */
1128   /* create col map */
1129   {
1130     const PetscInt *icol_i;
1131 
1132     ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->cmap->N)*sizeof(PetscInt),&cmap);CHKERRQ(ierr);
1133     cmap[0] = (PetscInt*)(cmap + ismax);
1134     ierr    = PetscMemzero(cmap[0],(1+ismax*C->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
1135     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1136     for (i=0; i<ismax; i++) {
1137       jmax   = ncol[i];
1138       icol_i = icol[i];
1139       cmap_i = cmap[i];
1140       for (j=0; j<jmax; j++) {
1141         cmap_i[icol_i[j]] = j+1;
1142       }
1143     }
1144   }
1145 
1146   /* Create lens which is required for MatCreate... */
1147   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1148   ierr    = PetscMalloc( (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr);
1149   lens[0] = (PetscInt*)(lens + ismax);
1150   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1151   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1152 
1153   /* Update lens from local data */
1154   for (i=0; i<ismax; i++) {
1155     jmax   = nrow[i];
1156     cmap_i = cmap[i];
1157     irow_i = irow[i];
1158     lens_i = lens[i];
1159     for (j=0; j<jmax; j++) {
1160       l = 0;
1161       row  = irow_i[j];
1162       while (row >= C->rmap->range[l+1]) l++;
1163       proc = l;
1164       if (proc == rank) {
1165         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1166         for (k=0; k<ncols; k++) {
1167           if (cmap_i[cols[k]]) { lens_i[j]++;}
1168         }
1169         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1170       }
1171     }
1172   }
1173 
1174   /* Create row map*/
1175   ierr    = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*C->rmap->N*sizeof(PetscInt),&rmap);CHKERRQ(ierr);
1176   rmap[0] = (PetscInt*)(rmap + ismax);
1177   ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1178   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1179   for (i=0; i<ismax; i++) {
1180     rmap_i = rmap[i];
1181     irow_i = irow[i];
1182     jmax   = nrow[i];
1183     for (j=0; j<jmax; j++) {
1184       rmap_i[irow_i[j]] = j;
1185     }
1186   }
1187 
1188   /* Update lens from offproc data */
1189   {
1190     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1191 
1192     for (tmp2=0; tmp2<nrqs; tmp2++) {
1193       ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1194       idex   = pa[idex2];
1195       sbuf1_i = sbuf1[idex];
1196       jmax    = sbuf1_i[0];
1197       ct1     = 2*jmax+1;
1198       ct2     = 0;
1199       rbuf2_i = rbuf2[idex2];
1200       rbuf3_i = rbuf3[idex2];
1201       for (j=1; j<=jmax; j++) {
1202         is_no   = sbuf1_i[2*j-1];
1203         max1    = sbuf1_i[2*j];
1204         lens_i  = lens[is_no];
1205         cmap_i  = cmap[is_no];
1206         rmap_i  = rmap[is_no];
1207         for (k=0; k<max1; k++,ct1++) {
1208           row  = rmap_i[sbuf1_i[ct1]]; /* the row number in the new submatrix */
1209           max2 = rbuf2_i[ct1];
1210           for (l=0; l<max2; l++,ct2++) {
1211             if (cmap_i[rbuf3_i[ct2]]) {
1212               lens_i[row]++;
1213             }
1214           }
1215         }
1216       }
1217     }
1218   }
1219   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1220   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1221   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1222   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1223   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1224 
1225   /* Create the submatrices */
1226   if (scall == MAT_REUSE_MATRIX) {
1227     PetscTruth flag;
1228 
1229     /*
1230         Assumes the new rows are the same length as the old rows, hence a bug!
1231     */
1232     for (i=0; i<ismax; i++) {
1233       mat = (Mat_SeqAIJ *)(submats[i]->data);
1234       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
1235         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1236       }
1237       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1238       if (!flag) {
1239         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1240       }
1241       /* Initialize the matrix as if it were empty */
1242       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1243       submats[i]->factor = C->factor;
1244     }
1245   } else {
1246     for (i=0; i<ismax; i++) {
1247       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1248       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1249       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1250       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1251     }
1252   }
1253 
1254   /* Assemble the matrices */
1255   /* First assemble the local rows */
1256   {
1257     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1258     PetscScalar *imat_a;
1259 
1260     for (i=0; i<ismax; i++) {
1261       mat       = (Mat_SeqAIJ*)submats[i]->data;
1262       imat_ilen = mat->ilen;
1263       imat_j    = mat->j;
1264       imat_i    = mat->i;
1265       imat_a    = mat->a;
1266       cmap_i    = cmap[i];
1267       rmap_i    = rmap[i];
1268       irow_i    = irow[i];
1269       jmax      = nrow[i];
1270       for (j=0; j<jmax; j++) {
1271 	l = 0;
1272         row      = irow_i[j];
1273         while (row >= C->rmap->range[l+1]) l++;
1274         proc = l;
1275         if (proc == rank) {
1276           old_row  = row;
1277           row      = rmap_i[row];
1278           ilen_row = imat_ilen[row];
1279           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1280           mat_i    = imat_i[row] ;
1281           mat_a    = imat_a + mat_i;
1282           mat_j    = imat_j + mat_i;
1283           for (k=0; k<ncols; k++) {
1284             if ((tcol = cmap_i[cols[k]])) {
1285               *mat_j++ = tcol - 1;
1286               *mat_a++ = vals[k];
1287               ilen_row++;
1288             }
1289           }
1290           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1291           imat_ilen[row] = ilen_row;
1292         }
1293       }
1294     }
1295   }
1296 
1297   /*   Now assemble the off-process rows */
1298   {
1299     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1300     PetscInt    *imat_j,*imat_i;
1301     PetscScalar *imat_a,*rbuf4_i;
1302 
1303     for (tmp2=0; tmp2<nrqs; tmp2++) {
1304       ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1305       idex   = pa[idex2];
1306       sbuf1_i = sbuf1[idex];
1307       jmax    = sbuf1_i[0];
1308       ct1     = 2*jmax + 1;
1309       ct2     = 0;
1310       rbuf2_i = rbuf2[idex2];
1311       rbuf3_i = rbuf3[idex2];
1312       rbuf4_i = rbuf4[idex2];
1313       for (j=1; j<=jmax; j++) {
1314         is_no     = sbuf1_i[2*j-1];
1315         rmap_i    = rmap[is_no];
1316         cmap_i    = cmap[is_no];
1317         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1318         imat_ilen = mat->ilen;
1319         imat_j    = mat->j;
1320         imat_i    = mat->i;
1321         imat_a    = mat->a;
1322         max1      = sbuf1_i[2*j];
1323         for (k=0; k<max1; k++,ct1++) {
1324           row   = sbuf1_i[ct1];
1325           row   = rmap_i[row];
1326           ilen  = imat_ilen[row];
1327           mat_i = imat_i[row] ;
1328           mat_a = imat_a + mat_i;
1329           mat_j = imat_j + mat_i;
1330           max2 = rbuf2_i[ct1];
1331           for (l=0; l<max2; l++,ct2++) {
1332             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1333               *mat_j++ = tcol - 1;
1334               *mat_a++ = rbuf4_i[ct2];
1335               ilen++;
1336             }
1337           }
1338           imat_ilen[row] = ilen;
1339         }
1340       }
1341     }
1342   }
1343   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1344   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1345   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1346   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1347   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1348 
1349   /* Restore the indices */
1350   for (i=0; i<ismax; i++) {
1351     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1352     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1353   }
1354 
1355   /* Destroy allocated memory */
1356   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1357   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1358   ierr = PetscFree(pa);CHKERRQ(ierr);
1359 
1360   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1361   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1362   for (i=0; i<nrqr; ++i) {
1363     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1364   }
1365   for (i=0; i<nrqs; ++i) {
1366     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1367     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1368   }
1369 
1370   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1371   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1372   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1373   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1374   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1375   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1376   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1377 
1378   ierr = PetscFree(cmap);CHKERRQ(ierr);
1379   ierr = PetscFree(rmap);CHKERRQ(ierr);
1380   ierr = PetscFree(lens);CHKERRQ(ierr);
1381 
1382   for (i=0; i<ismax; i++) {
1383     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1384     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1385   }
1386   PetscFunctionReturn(0);
1387 }
1388 
1389