1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that are shared across processors.
5 */
6 #include <../src/mat/impls/baij/mpi/mpibaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11 extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12 extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 
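/*
   MatIncreaseOverlap_MPIBAIJ - MATMPIBAIJ implementation of MatIncreaseOverlap().
   The index sets in is[] arrive in point (unblocked) numbering; they are compressed
   to block indices, grown ov times by MatIncreaseOverlap_MPIBAIJ_Once(), and then
   expanded back to point numbering.

   A minimal usage sketch of the public entry point (illustrative only; C is assumed
   to be an assembled MATMPIBAIJ matrix and is[] index sets created by the caller):

      Mat C;                                      assembled MATMPIBAIJ matrix
      IS  is[2];                                  index sets in point numbering
      PetscCall(MatIncreaseOverlap(C,2,is,1));    grow each set by one overlap level

   MatIncreaseOverlap() dispatches to this routine through the matrix operations table.
*/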
14 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15 {
16   PetscInt       i,N=C->cmap->N, bs=C->rmap->bs;
17   IS             *is_new;
18 
19   PetscFunctionBegin;
20   PetscCall(PetscMalloc1(imax,&is_new));
21   /* Convert the indices into block format */
22   PetscCall(ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new));
23   PetscCheckFalse(ov < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new));
26   }
27   for (i=0; i<imax; i++) PetscCall(ISDestroy(&is[i]));
28   PetscCall(ISExpandIndicesGeneral(N,N,bs,imax,is_new,is));
29   for (i=0; i<imax; i++) PetscCall(ISDestroy(&is_new[i]));
30   PetscCall(PetscFree(is_new));
31   PetscFunctionReturn(0);
32 }
33 
34 /*
35   Sample message format:
36   If a processor A wants processor B to process some elements corresponding
37   to index sets is[1], is[5]
38   mesg [0] = 2   (no of index sets in the mesg)
39   -----------
40   mesg [1] = 1 => is[1]
41   mesg [2] = size of is[1] (no of block indices)
42   -----------
43   mesg [3] = 5 => is[5]
44   mesg [4] = size of is[5] (no of block indices)
45   -----------
46   mesg [5]
47   mesg [n]  data(is[1])
48   -----------
49   mesg[n+1]
50   mesg[m]  data(is[5])
51   -----------
52 
53   Notes:
54   nrqs - no of requests sent (or to be sent out)
55   nrqr - no of requests received (which have to be, or already have been, processed)
56 */
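/*
  A concrete example of the layout above (illustrative, with made-up sizes): suppose
  is[1] contains the 3 block indices {4,7,9} and is[5] the 2 block indices {2,11},
  all destined for the same processor. The outgoing buffer then holds

      mesg[0]    = 2            two index sets in this message
      mesg[1..2] = 1 3          is[1] has 3 entries
      mesg[3..4] = 5 2          is[5] has 2 entries
      mesg[5..7] = 4 7 9        data(is[1])
      mesg[8..9] = 2 11         data(is[5])

  i.e. a header of 1 + 2*(no of IS) entries followed by the concatenated index data.
*/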
57 PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
58 {
59   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
60   const PetscInt **idx,*idx_i;
61   PetscInt       *n,*w3,*w4,**data,len;
62   PetscMPIInt    size,rank,tag1,tag2,*w2,*w1,nrqr;
63   PetscInt       Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
64   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
65   PetscMPIInt    *onodes1,*olengths1,*onodes2,*olengths2,proc=-1;
66   PetscBT        *table;
67   MPI_Comm       comm,*iscomms;
68   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
69   char           *t_p;
70 
71   PetscFunctionBegin;
72   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
73   size = c->size;
74   rank = c->rank;
75   Mbs  = c->Mbs;
76 
77   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
78   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
79 
80   PetscCall(PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n));
81 
82   for (i=0; i<imax; i++) {
83     PetscCall(ISGetIndices(is[i],&idx[i]));
84     PetscCall(ISGetLocalSize(is[i],&n[i]));
85   }
86 
87   /* evaluate communication - message to whom, length of message, and buffer space
88      required. Based on this, buffers are allocated and data is copied into them */
89   PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
90   for (i=0; i<imax; i++) {
91     PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
92     idx_i = idx[i];
93     len   = n[i];
94     for (j=0; j<len; j++) {
95       row = idx_i[j];
96       PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
97       PetscCall(PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc));
98       w4[proc]++;
99     }
100     for (j=0; j<size; j++) {
101       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
102     }
103   }
104 
105   nrqs     = 0;              /* no of outgoing messages */
106   msz      = 0;              /* total mesg length (for all procs) */
107   w1[rank] = 0;              /* no mesg sent to itself */
108   w3[rank] = 0;
109   for (i=0; i<size; i++) {
110     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
111   }
112   /* pa - is list of processors to communicate with */
113   PetscCall(PetscMalloc1(nrqs,&pa));
114   for (i=0,j=0; i<size; i++) {
115     if (w1[i]) {pa[j] = i; j++;}
116   }
117 
118   /* Each message would have a header = 1 + 2*(no of IS) + data */
119   for (i=0; i<nrqs; i++) {
120     j      = pa[i];
121     w1[j] += w2[j] + 2*w3[j];
122     msz   += w1[j];
123   }
124 
125   /* Determine the number of messages to expect, their lengths, and their from-ids */
126   PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
127   PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
128 
129   /* Now post the Irecvs corresponding to these messages */
130   PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));
131 
132   /* Allocate Memory for outgoing messages */
133   PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
134   PetscCall(PetscArrayzero(outdat,size));
135   PetscCall(PetscArrayzero(ptr,size));
136   {
137     PetscInt *iptr = tmp,ict  = 0;
138     for (i=0; i<nrqs; i++) {
139       j         = pa[i];
140       iptr     +=  ict;
141       outdat[j] = iptr;
142       ict       = w1[j];
143     }
144   }
145 
146   /* Form the outgoing messages */
147   /*plug in the headers*/
148   for (i=0; i<nrqs; i++) {
149     j            = pa[i];
150     outdat[j][0] = 0;
151     PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
152     ptr[j]       = outdat[j] + 2*w3[j] + 1;
153   }
154 
155   /* Memory for doing local proc's work*/
156   {
157     PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p));
158 
159     for (i=0; i<imax; i++) {
160       table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
161       data[i]  = d_p + (Mbs)*i;
162     }
163   }
164 
165   /* Parse the IS and update local tables and the outgoing buf with the data*/
166   {
167     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
168     PetscBT  table_i;
169 
170     for (i=0; i<imax; i++) {
171       PetscCall(PetscArrayzero(ctr,size));
172       n_i     = n[i];
173       table_i = table[i];
174       idx_i   = idx[i];
175       data_i  = data[i];
176       isz_i   = isz[i];
177       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
178         row  = idx_i[j];
179         PetscCall(PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc));
180         if (proc != rank) { /* copy to the outgoing buffer */
181           ctr[proc]++;
182           *ptr[proc] = row;
183           ptr[proc]++;
184         } else { /* Update the local table */
185           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
186         }
187       }
188       /* Update the headers for the current IS */
189       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
190         if ((ctr_j = ctr[j])) {
191           outdat_j        = outdat[j];
192           k               = ++outdat_j[0];
193           outdat_j[2*k]   = ctr_j;
194           outdat_j[2*k-1] = i;
195         }
196       }
197       isz[i] = isz_i;
198     }
199   }
200 
201   /*  Now  post the sends */
202   PetscCall(PetscMalloc1(nrqs,&s_waits1));
203   for (i=0; i<nrqs; ++i) {
204     j    = pa[i];
205     PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
206   }
207 
208   /* No longer need the original indices*/
209   for (i=0; i<imax; ++i) {
210     PetscCall(ISRestoreIndices(is[i],idx+i));
211   }
212   PetscCall(PetscFree2(*(PetscInt***)&idx,n));
213 
214   PetscCall(PetscMalloc1(imax,&iscomms));
215   for (i=0; i<imax; ++i) {
216     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
217     PetscCall(ISDestroy(&is[i]));
218   }
219 
220   /* Do Local work*/
221   PetscCall(MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data));
222 
223   /* Receive messages*/
224   PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
225   PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
226 
227   /* Phase 1 sends are complete - deallocate buffers */
228   PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
229   PetscCall(PetscFree4(w1,w2,w3,w4));
230 
231   PetscCall(PetscMalloc1(nrqr,&xdata));
232   PetscCall(PetscMalloc1(nrqr,&isz1));
233   PetscCall(MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
234   if (rbuf) {
235     PetscCall(PetscFree(rbuf[0]));
236     PetscCall(PetscFree(rbuf));
237   }
238 
239   /* Send the data back*/
240   /* Do a global reduction to know the buffer space req for incoming messages*/
241   {
242     PetscMPIInt *rw1;
243 
244     PetscCall(PetscCalloc1(size,&rw1));
245 
246     for (i=0; i<nrqr; ++i) {
247       proc = onodes1[i];
248       rw1[proc] = isz1[i];
249     }
250 
251     /* Determine the number of messages to expect, their lengths, and their from-ids */
252     PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
253     PetscCall(PetscFree(rw1));
254   }
255   /* Now post the Irecvs corresponding to these messages */
256   PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));
257 
258   /*  Now  post the sends */
259   PetscCall(PetscMalloc1(nrqr,&s_waits2));
260   for (i=0; i<nrqr; ++i) {
261     j    = onodes1[i];
262     PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
263   }
264 
265   PetscCall(PetscFree(onodes1));
266   PetscCall(PetscFree(olengths1));
267 
268   /* receive work done on other processors*/
269   {
270     PetscMPIInt idex;
271     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
272     PetscBT     table_i;
273 
274     for (i=0; i<nrqs; ++i) {
275       PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
276       /* Process the message*/
277       rbuf2_i = rbuf2[idex];
278       ct1     = 2*rbuf2_i[0]+1;
279       jmax    = rbuf2[idex][0];
280       for (j=1; j<=jmax; j++) {
281         max     = rbuf2_i[2*j];
282         is_no   = rbuf2_i[2*j-1];
283         isz_i   = isz[is_no];
284         data_i  = data[is_no];
285         table_i = table[is_no];
286         for (k=0; k<max; k++,ct1++) {
287           row = rbuf2_i[ct1];
288           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
289         }
290         isz[is_no] = isz_i;
291       }
292     }
293     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
294   }
295 
296   for (i=0; i<imax; ++i) {
297     PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
298     PetscCall(PetscCommDestroy(&iscomms[i]));
299   }
300 
301   PetscCall(PetscFree(iscomms));
302   PetscCall(PetscFree(onodes2));
303   PetscCall(PetscFree(olengths2));
304 
305   PetscCall(PetscFree(pa));
306   if (rbuf2) {
307     PetscCall(PetscFree(rbuf2[0]));
308     PetscCall(PetscFree(rbuf2));
309   }
310   PetscCall(PetscFree(s_waits1));
311   PetscCall(PetscFree(r_waits1));
312   PetscCall(PetscFree(s_waits2));
313   PetscCall(PetscFree(r_waits2));
314   PetscCall(PetscFree5(table,data,isz,d_p,t_p));
315   if (xdata) {
316     PetscCall(PetscFree(xdata[0]));
317     PetscCall(PetscFree(xdata));
318   }
319   PetscCall(PetscFree(isz1));
320   PetscFunctionReturn(0);
321 }
322 
323 /*
324    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once, to do
325        the work on the local processor.
326 
327      Inputs:
328       C      - the matrix (of type MATMPIBAIJ);
329       imax   - total no of index sets processed at a time;
330       table  - an array of bit tables (PetscBT), one per index set, each of size Mbs bits.
331 
332      Output:
333       isz    - array containing the count of the solution elements corresponding
334                to each index set;
335       data   - arrays of block-row indices making up the solution for each index set
336 */
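/*
   The routine below relies on the PetscBT bit-table idiom used throughout this file:
   PetscBTLookupSet(table,i) sets bit i and returns its previous value, so an index is
   appended to data[] only the first time it is seen.  A minimal sketch of the idiom
   (illustrative only, not part of this file):

      PetscBT  bt;
      PetscInt seen[16],nseen = 0,idx = 5;
      PetscCall(PetscBTCreate(16,&bt));
      if (!PetscBTLookupSet(bt,idx)) seen[nseen++] = idx;    first time: recorded
      if (!PetscBTLookupSet(bt,idx)) seen[nseen++] = idx;    second time: skipped
      PetscCall(PetscBTDestroy(&bt));
*/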
337 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
338 {
339   Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
340   Mat         A  = c->A,B = c->B;
341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
342   PetscInt    start,end,val,max,rstart,cstart,*ai,*aj;
343   PetscInt    *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
344   PetscBT     table_i;
345 
346   PetscFunctionBegin;
347   rstart = c->rstartbs;
348   cstart = c->cstartbs;
349   ai     = a->i;
350   aj     = a->j;
351   bi     = b->i;
352   bj     = b->j;
353   garray = c->garray;
354 
355   for (i=0; i<imax; i++) {
356     data_i  = data[i];
357     table_i = table[i];
358     isz_i   = isz[i];
359     for (j=0,max=isz[i]; j<max; j++) {
360       row   = data_i[j] - rstart;
361       start = ai[row];
362       end   = ai[row+1];
363       for (k=start; k<end; k++) { /* Amat */
364         val = aj[k] + cstart;
365         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
366       }
367       start = bi[row];
368       end   = bi[row+1];
369       for (k=start; k<end; k++) { /* Bmat */
370         val = garray[bj[k]];
371         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
372       }
373     }
374     isz[i] = isz_i;
375   }
376   PetscFunctionReturn(0);
377 }
378 /*
379       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
380          and return the output
381 
382          Input:
383            C    - the matrix
384            nrqr - no of messages being processed.
385            rbuf - an array of pointers to the received requests
386 
387          Output:
388            xdata - array of messages to be sent back
389            isz1  - size of each message
390 
391   For better efficiency perhaps we should malloc each xdata[i] separately; then,
392 if a remalloc is required, only the data for that one message would need to be
393 copied, rather than the data for all previous messages, as is done now where a
394 single large chunk of memory is used.
395 
396 */
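/*
   The single buffer xdata[0] used below is sized from an initial estimate and grown
   geometrically (by a factor of roughly 1.5) whenever it fills up; after each growth
   the per-message pointers xdata[1..i] are rebuilt from the accumulated sizes in
   isz1[]. For example, an initial estimate of 100 entries grows through capacities of
   about 100 -> 151 -> 227 -> ..., so the number of reallocations stays logarithmic in
   the final size (the totals are reported at the end through PetscInfo()).
*/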
397 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
398 {
399   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
400   Mat            A  = c->A,B = c->B;
401   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
402   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
403   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
404   PetscInt       val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
405   PetscInt       *rbuf_i,kmax,rbuf_0;
406   PetscBT        xtable;
407 
408   PetscFunctionBegin;
409   Mbs    = c->Mbs;
410   rstart = c->rstartbs;
411   cstart = c->cstartbs;
412   ai     = a->i;
413   aj     = a->j;
414   bi     = b->i;
415   bj     = b->j;
416   garray = c->garray;
417 
418   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
419     rbuf_i =  rbuf[i];
420     rbuf_0 =  rbuf_i[0];
421     ct    += rbuf_0;
422     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
423   }
424 
425   if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
426   else        max1 = 1;
427   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
428   if (nrqr) {
429     PetscCall(PetscMalloc1(mem_estimate,&xdata[0]));
430     ++no_malloc;
431   }
432   PetscCall(PetscBTCreate(Mbs,&xtable));
433   PetscCall(PetscArrayzero(isz1,nrqr));
434 
435   ct3 = 0;
436   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
437     rbuf_i =  rbuf[i];
438     rbuf_0 =  rbuf_i[0];
439     ct1    =  2*rbuf_0+1;
440     ct2    =  ct1;
441     ct3   += ct1;
442     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
443       PetscCall(PetscBTMemzero(Mbs,xtable));
444       oct2 = ct2;
445       kmax = rbuf_i[2*j];
446       for (k=0; k<kmax; k++,ct1++) {
447         row = rbuf_i[ct1];
448         if (!PetscBTLookupSet(xtable,row)) {
449           if (!(ct3 < mem_estimate)) {
450             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
451             PetscCall(PetscMalloc1(new_estimate,&tmp));
452             PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
453             PetscCall(PetscFree(xdata[0]));
454             xdata[0]     = tmp;
455             mem_estimate = new_estimate; ++no_malloc;
456             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
457           }
458           xdata[i][ct2++] = row;
459           ct3++;
460         }
461       }
462       for (k=oct2,max2=ct2; k<max2; k++)  {
463         row   = xdata[i][k] - rstart;
464         start = ai[row];
465         end   = ai[row+1];
466         for (l=start; l<end; l++) {
467           val = aj[l] + cstart;
468           if (!PetscBTLookupSet(xtable,val)) {
469             if (!(ct3 < mem_estimate)) {
470               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
471               PetscCall(PetscMalloc1(new_estimate,&tmp));
472               PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
473               PetscCall(PetscFree(xdata[0]));
474               xdata[0]     = tmp;
475               mem_estimate = new_estimate; ++no_malloc;
476               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
477             }
478             xdata[i][ct2++] = val;
479             ct3++;
480           }
481         }
482         start = bi[row];
483         end   = bi[row+1];
484         for (l=start; l<end; l++) {
485           val = garray[bj[l]];
486           if (!PetscBTLookupSet(xtable,val)) {
487             if (!(ct3 < mem_estimate)) {
488               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
489               PetscCall(PetscMalloc1(new_estimate,&tmp));
490               PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
491               PetscCall(PetscFree(xdata[0]));
492               xdata[0]     = tmp;
493               mem_estimate = new_estimate; ++no_malloc;
494               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
495             }
496             xdata[i][ct2++] = val;
497             ct3++;
498           }
499         }
500       }
501       /* Update the header*/
502       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
503       xdata[i][2*j-1] = rbuf_i[2*j-1];
504     }
505     xdata[i][0] = rbuf_0;
506     if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
507     isz1[i]     = ct2; /* size of each message */
508   }
509   PetscCall(PetscBTDestroy(&xtable));
510   PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT ", no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
511   PetscFunctionReturn(0);
512 }
513 
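/*
   MatCreateSubMatrices_MPIBAIJ - MATMPIBAIJ implementation of MatCreateSubMatrices().
   The row/column index sets arrive in point numbering, are compressed to block
   indices, and the extraction is performed in stages to bound the memory used per
   stage.

   A minimal sketch of the public entry point (illustrative only; the matrix and the
   index sets are assumed to have been created by the caller):

      Mat  C,*submat;
      IS   isrow[1],iscol[1];
      PetscCall(MatCreateSubMatrices(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat));
         ... each submat[i] is a sequential (S)BAIJ matrix on its own process ...
      PetscCall(MatDestroySubMatrices(1,&submat));
*/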
514 PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
515 {
516   IS             *isrow_block,*iscol_block;
517   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
518   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
519   Mat_SeqBAIJ    *subc;
520   Mat_SubSppt    *smat;
521 
522   PetscFunctionBegin;
523   /* The compression and expansion should be avoided. It does not point
524      out errors and might change the indices, hence it is buggy */
525   PetscCall(PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block));
526   PetscCall(ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block));
527   PetscCall(ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block));
528 
529   /* Determine the number of stages through which submatrices are done */
530   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
531   else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
532   if (!nmax) nmax = 1;
533 
534   if (scall == MAT_INITIAL_MATRIX) {
535     nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
536 
537     /* Make sure every processor loops through the nstages */
538     PetscCall(MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
539 
540     /* Allocate memory to hold all the submatrices and dummy submatrices */
541     PetscCall(PetscCalloc1(ismax+nstages,submat));
542   } else { /* MAT_REUSE_MATRIX */
543     if (ismax) {
544       subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
545       smat   = subc->submatis1;
546     } else { /* (*submat)[0] is a dummy matrix */
547       smat = (Mat_SubSppt*)(*submat)[0]->data;
548     }
549     PetscCheck(smat,PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
550     nstages = smat->nstages;
551   }
552 
553   for (i=0,pos=0; i<nstages; i++) {
554     if (pos+nmax <= ismax) max_no = nmax;
555     else if (pos >= ismax) max_no = 0;
556     else                   max_no = ismax-pos;
557 
558     PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos));
559     if (!max_no) {
560       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
561         smat = (Mat_SubSppt*)(*submat)[pos]->data;
562         smat->nstages = nstages;
563       }
564       pos++; /* advance to next dummy matrix if any */
565     } else pos += max_no;
566   }
567 
568   if (scall == MAT_INITIAL_MATRIX && ismax) {
569     /* save nstages for reuse */
570     subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
571     smat = subc->submatis1;
572     smat->nstages = nstages;
573   }
574 
575   for (i=0; i<ismax; i++) {
576     PetscCall(ISDestroy(&isrow_block[i]));
577     PetscCall(ISDestroy(&iscol_block[i]));
578   }
579   PetscCall(PetscFree2(isrow_block,iscol_block));
580   PetscFunctionReturn(0);
581 }
582 
583 #if defined(PETSC_USE_CTABLE)
584 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
585 {
586   PetscInt       nGlobalNd = proc_gnode[size];
587   PetscMPIInt    fproc;
588 
589   PetscFunctionBegin;
590   PetscCall(PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc));
591   if (fproc > size) fproc = size;
592   while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
593     if (row < proc_gnode[fproc]) fproc--;
594     else                         fproc++;
595   }
596   *rank = fproc;
597   PetscFunctionReturn(0);
598 }
599 #endif
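/*
   PetscGetProc() above guesses the owner of a (block) row by linear interpolation into
   the ownership-range array proc_gnode[] (proc_gnode[p] is the first global row owned
   by process p and proc_gnode[size] the total number of rows), then walks up or down
   until the correct range is found. A small worked example under these assumptions:
   with size = 4 and proc_gnode = {0,25,50,75,100}, row 60 gives the initial guess
   (PetscInt)(60*4/100.0 + 0.5) = 2, and since 50 <= 60 < 75 the walk terminates
   immediately with *rank = 2.
*/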
600 
601 /* -------------------------------------------------------------------------*/
602 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
603 PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
604 {
605   Mat_MPIBAIJ    *c = (Mat_MPIBAIJ*)C->data;
606   Mat            A  = c->A;
607   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
608   const PetscInt **icol,**irow;
609   PetscInt       *nrow,*ncol,start;
610   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
611   PetscInt       **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
612   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
613   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
614   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
615 #if defined(PETSC_USE_CTABLE)
616   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
617 #else
618   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
619 #endif
620   const PetscInt *irow_i,*icol_i;
621   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
622   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
623   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
624   MPI_Comm       comm;
625   PetscScalar    **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i;
626   PetscMPIInt    *onodes1,*olengths1,end;
627   PetscInt       **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
628   Mat_SubSppt    *smat_i;
629   PetscBool      *issorted,colflag,iscsorted=PETSC_TRUE;
630   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
631   PetscInt       bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
632   PetscBool      ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
633   PetscInt       nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
634   PetscScalar    *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
635   PetscInt       cstart = c->cstartbs,*bmap = c->garray;
636   PetscBool      *allrows,*allcolumns;
637 
638   PetscFunctionBegin;
639   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
640   size = c->size;
641   rank = c->rank;
642 
643   PetscCall(PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows));
644   PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));
645 
646   for (i=0; i<ismax; i++) {
647     PetscCall(ISSorted(iscol[i],&issorted[i]));
648     if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
649     PetscCall(ISSorted(isrow[i],&issorted[i]));
650 
651     /* Check for special case: allcolumns */
652     PetscCall(ISIdentity(iscol[i],&colflag));
653     PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
654 
655     if (colflag && ncol[i] == c->Nbs) {
656       allcolumns[i] = PETSC_TRUE;
657       icol[i]       = NULL;
658     } else {
659       allcolumns[i] = PETSC_FALSE;
660       PetscCall(ISGetIndices(iscol[i],&icol[i]));
661     }
662 
663     /* Check for special case: allrows */
664     PetscCall(ISIdentity(isrow[i],&colflag));
665     PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));
666     if (colflag && nrow[i] == c->Mbs) {
667       allrows[i] = PETSC_TRUE;
668       irow[i]    = NULL;
669     } else {
670       allrows[i] = PETSC_FALSE;
671       PetscCall(ISGetIndices(isrow[i],&irow[i]));
672     }
673   }
674 
675   if (scall == MAT_REUSE_MATRIX) {
676     /* Assumes new rows are same length as the old rows */
677     for (i=0; i<ismax; i++) {
678       subc = (Mat_SeqBAIJ*)(submats[i]->data);
679       PetscCheckFalse(subc->mbs != nrow[i] || subc->nbs != ncol[i],PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
680 
681       /* Initialize the matrix as if empty */
682       PetscCall(PetscArrayzero(subc->ilen,subc->mbs));
683 
684       /* Carry over the factor type from C */
685       submats[i]->factortype = C->factortype;
686 
687       smat_i   = subc->submatis1;
688 
689       nrqs        = smat_i->nrqs;
690       nrqr        = smat_i->nrqr;
691       rbuf1       = smat_i->rbuf1;
692       rbuf2       = smat_i->rbuf2;
693       rbuf3       = smat_i->rbuf3;
694       req_source2 = smat_i->req_source2;
695 
696       sbuf1     = smat_i->sbuf1;
697       sbuf2     = smat_i->sbuf2;
698       ptr       = smat_i->ptr;
699       tmp       = smat_i->tmp;
700       ctr       = smat_i->ctr;
701 
702       pa          = smat_i->pa;
703       req_size    = smat_i->req_size;
704       req_source1 = smat_i->req_source1;
705 
706       allcolumns[i] = smat_i->allcolumns;
707       allrows[i]    = smat_i->allrows;
708       row2proc[i]   = smat_i->row2proc;
709       rmap[i]       = smat_i->rmap;
710       cmap[i]       = smat_i->cmap;
711     }
712 
713     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
714       PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
715       smat_i = (Mat_SubSppt*)submats[0]->data;
716 
717       nrqs        = smat_i->nrqs;
718       nrqr        = smat_i->nrqr;
719       rbuf1       = smat_i->rbuf1;
720       rbuf2       = smat_i->rbuf2;
721       rbuf3       = smat_i->rbuf3;
722       req_source2 = smat_i->req_source2;
723 
724       sbuf1       = smat_i->sbuf1;
725       sbuf2       = smat_i->sbuf2;
726       ptr         = smat_i->ptr;
727       tmp         = smat_i->tmp;
728       ctr         = smat_i->ctr;
729 
730       pa          = smat_i->pa;
731       req_size    = smat_i->req_size;
732       req_source1 = smat_i->req_source1;
733 
734       allcolumns[0] = PETSC_FALSE;
735     }
736   } else { /* scall == MAT_INITIAL_MATRIX */
737     /* Get some new tags to keep the communication clean */
738     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
739     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
740 
741     /* evaluate communication - mesg to who, length of mesg, and buffer space
742      required. Based on this, buffers are allocated, and data copied into them*/
743     PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */
744 
745     for (i=0; i<ismax; i++) {
746       jmax   = nrow[i];
747       irow_i = irow[i];
748 
749       PetscCall(PetscMalloc1(jmax,&row2proc_i));
750       row2proc[i] = row2proc_i;
751 
752       if (issorted[i]) proc = 0;
753       for (j=0; j<jmax; j++) {
754         if (!issorted[i]) proc = 0;
755         if (allrows[i]) row = j;
756         else row = irow_i[j];
757 
758         while (row >= c->rangebs[proc+1]) proc++;
759         w4[proc]++;
760         row2proc_i[j] = proc; /* map row index to proc */
761       }
762       for (j=0; j<size; j++) {
763         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
764       }
765     }
766 
767     nrqs     = 0;              /* no of outgoing messages */
768     msz      = 0;              /* total mesg length (for all procs) */
769     w1[rank] = 0;              /* no mesg sent to self */
770     w3[rank] = 0;
771     for (i=0; i<size; i++) {
772       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
773     }
774     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
775     for (i=0,j=0; i<size; i++) {
776       if (w1[i]) { pa[j] = i; j++; }
777     }
778 
779     /* Each message would have a header = 1 + 2*(no of IS) + data */
780     for (i=0; i<nrqs; i++) {
781       j      = pa[i];
782       w1[j] += w2[j] + 2* w3[j];
783       msz   += w1[j];
784     }
785     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
786 
787     /* Determine the number of messages to expect, their lengths, and their from-ids */
788     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
789     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
790 
791     /* Now post the Irecvs corresponding to these messages */
792     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
793     PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
794 
795     /* Allocate Memory for outgoing messages */
796     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
797     PetscCall(PetscArrayzero(sbuf1,size));
798     PetscCall(PetscArrayzero(ptr,size));
799 
800     {
801       PetscInt *iptr = tmp;
802       k    = 0;
803       for (i=0; i<nrqs; i++) {
804         j        = pa[i];
805         iptr    += k;
806         sbuf1[j] = iptr;
807         k        = w1[j];
808       }
809     }
810 
811     /* Form the outgoing messages. Initialize the header space */
812     for (i=0; i<nrqs; i++) {
813       j           = pa[i];
814       sbuf1[j][0] = 0;
815       PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
816       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
817     }
818 
819     /* Parse the isrow and copy data into outbuf */
820     for (i=0; i<ismax; i++) {
821       row2proc_i = row2proc[i];
822       PetscCall(PetscArrayzero(ctr,size));
823       irow_i = irow[i];
824       jmax   = nrow[i];
825       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
826         proc = row2proc_i[j];
827         if (allrows[i]) row = j;
828         else row = irow_i[j];
829 
830         if (proc != rank) { /* copy to the outgoing buf*/
831           ctr[proc]++;
832           *ptr[proc] = row;
833           ptr[proc]++;
834         }
835       }
836       /* Update the headers for the current IS */
837       for (j=0; j<size; j++) { /* Can Optimise this loop too */
838         if ((ctr_j = ctr[j])) {
839           sbuf1_j        = sbuf1[j];
840           k              = ++sbuf1_j[0];
841           sbuf1_j[2*k]   = ctr_j;
842           sbuf1_j[2*k-1] = i;
843         }
844       }
845     }
846 
847     /*  Now  post the sends */
848     PetscCall(PetscMalloc1(nrqs,&s_waits1));
849     for (i=0; i<nrqs; ++i) {
850       j    = pa[i];
851       PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
852     }
853 
854     /* Post Receives to capture the buffer size */
855     PetscCall(PetscMalloc1(nrqs,&r_waits2));
856     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
857     if (nrqs) rbuf2[0] = tmp + msz;
858     for (i=1; i<nrqs; ++i) {
859       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
860     }
861     for (i=0; i<nrqs; ++i) {
862       j    = pa[i];
863       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
864     }
865 
866     /* Send to other procs the buf size they should allocate */
867     /* Receive messages*/
868     PetscCall(PetscMalloc1(nrqr,&s_waits2));
869     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
870 
871     PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
872     for (i=0; i<nrqr; ++i) {
873       req_size[i] = 0;
874       rbuf1_i        = rbuf1[i];
875       start          = 2*rbuf1_i[0] + 1;
876       end            = olengths1[i];
877       PetscCall(PetscMalloc1(end,&sbuf2[i]));
878       sbuf2_i        = sbuf2[i];
879       for (j=start; j<end; j++) {
880         row             = rbuf1_i[j] - rstart;
881         ncols           = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
882         sbuf2_i[j]      = ncols;
883         req_size[i] += ncols;
884       }
885       req_source1[i] = onodes1[i];
886       /* form the header */
887       sbuf2_i[0] = req_size[i];
888       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
889 
890       PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
891     }
892 
893     PetscCall(PetscFree(onodes1));
894     PetscCall(PetscFree(olengths1));
895 
896     PetscCall(PetscFree(r_waits1));
897     PetscCall(PetscFree4(w1,w2,w3,w4));
898 
899     /* Receive messages*/
900     PetscCall(PetscMalloc1(nrqs,&r_waits3));
901 
902     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
903     for (i=0; i<nrqs; ++i) {
904       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
905       req_source2[i] = pa[i];
906       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
907     }
908     PetscCall(PetscFree(r_waits2));
909 
910     /* Wait on sends1 and sends2 */
911     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
912     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
913     PetscCall(PetscFree(s_waits1));
914     PetscCall(PetscFree(s_waits2));
915 
916     /* Now allocate sending buffers for a->j, and send them off */
917     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
918     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
919     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
920     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
921 
922     PetscCall(PetscMalloc1(nrqr,&s_waits3));
923     {
924 
925       for (i=0; i<nrqr; i++) {
926         rbuf1_i   = rbuf1[i];
927         sbuf_aj_i = sbuf_aj[i];
928         ct1       = 2*rbuf1_i[0] + 1;
929         ct2       = 0;
930         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
931           kmax = rbuf1[i][2*j];
932           for (k=0; k<kmax; k++,ct1++) {
933             row    = rbuf1_i[ct1] - rstart;
934             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
935             ncols  = nzA + nzB;
936             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
937 
938             /* load the column indices for this row into cols */
939             cols = sbuf_aj_i + ct2;
940             for (l=0; l<nzB; l++) {
941               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
942               else break;
943             }
944             imark = l;
945             for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
946             for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
947             ct2 += ncols;
948           }
949         }
950         PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
951       }
952     }
953 
954     /* create col map: global col of C -> local col of submatrices */
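    /* A small illustration of the map built just below (values hypothetical): if the
       block columns requested in iscol[i] are {3,7,9}, the map records local position
       plus one for each of them, i.e. 3 -> 1, 7 -> 2, 9 -> 3. A lookup that yields 0
       (or fails, in the PetscTable case) therefore means "column not selected", and
       the stored value minus 1 is the local column index used when the submatrices
       are assembled further below. */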
955 #if defined(PETSC_USE_CTABLE)
956     for (i=0; i<ismax; i++) {
957       if (!allcolumns[i]) {
958         PetscCall(PetscTableCreate(ncol[i],c->Nbs,&cmap[i]));
959 
960         jmax   = ncol[i];
961         icol_i = icol[i];
962         cmap_i = cmap[i];
963         for (j=0; j<jmax; j++) {
964           PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
965         }
966       } else cmap[i] = NULL;
967     }
968 #else
969     for (i=0; i<ismax; i++) {
970       if (!allcolumns[i]) {
971         PetscCall(PetscCalloc1(c->Nbs,&cmap[i]));
972         jmax   = ncol[i];
973         icol_i = icol[i];
974         cmap_i = cmap[i];
975         for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
976       } else cmap[i] = NULL;
977     }
978 #endif
979 
980     /* Create lens which is required for MatCreate... */
981     for (i=0,j=0; i<ismax; i++) j += nrow[i];
982     PetscCall(PetscMalloc1(ismax,&lens));
983 
984     if (ismax) {
985       PetscCall(PetscCalloc1(j,&lens[0]));
986     }
987     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
988 
989     /* Update lens from local data */
990     for (i=0; i<ismax; i++) {
991       row2proc_i = row2proc[i];
992       jmax = nrow[i];
993       if (!allcolumns[i]) cmap_i = cmap[i];
994       irow_i = irow[i];
995       lens_i = lens[i];
996       for (j=0; j<jmax; j++) {
997         if (allrows[i]) row = j;
998         else row = irow_i[j]; /* global blocked row of C */
999 
1000         proc = row2proc_i[j];
1001         if (proc == rank) {
1002           /* Get indices from matA and then from matB */
1003 #if defined(PETSC_USE_CTABLE)
1004           PetscInt   tt;
1005 #endif
1006           row    = row - rstart;
1007           nzA    = a_i[row+1] - a_i[row];
1008           nzB    = b_i[row+1] - b_i[row];
1009           cworkA =  a_j + a_i[row];
1010           cworkB = b_j + b_i[row];
1011 
1012           if (!allcolumns[i]) {
1013 #if defined(PETSC_USE_CTABLE)
1014             for (k=0; k<nzA; k++) {
1015               PetscCall(PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt));
1016               if (tt) lens_i[j]++;
1017             }
1018             for (k=0; k<nzB; k++) {
1019               PetscCall(PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt));
1020               if (tt) lens_i[j]++;
1021             }
1022 
1023 #else
1024             for (k=0; k<nzA; k++) {
1025               if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1026             }
1027             for (k=0; k<nzB; k++) {
1028               if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1029             }
1030 #endif
1031           } else { /* allcolumns */
1032             lens_i[j] = nzA + nzB;
1033           }
1034         }
1035       }
1036     }
1037 
1038     /* Create row map: global row of C -> local row of submatrices */
1039     for (i=0; i<ismax; i++) {
1040       if (!allrows[i]) {
1041 #if defined(PETSC_USE_CTABLE)
1042         PetscCall(PetscTableCreate(nrow[i],c->Mbs,&rmap[i]));
1043         irow_i = irow[i];
1044         jmax   = nrow[i];
1045         for (j=0; j<jmax; j++) {
1046           if (allrows[i]) {
1047             PetscCall(PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES));
1048           } else {
1049             PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
1050           }
1051         }
1052 #else
1053         PetscCall(PetscCalloc1(c->Mbs,&rmap[i]));
1054         rmap_i = rmap[i];
1055         irow_i = irow[i];
1056         jmax   = nrow[i];
1057         for (j=0; j<jmax; j++) {
1058           if (allrows[i]) rmap_i[j] = j;
1059           else rmap_i[irow_i[j]] = j;
1060         }
1061 #endif
1062       } else rmap[i] = NULL;
1063     }
1064 
1065     /* Update lens from offproc data */
1066     {
1067       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1068 
1069       PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
1070       for (tmp2=0; tmp2<nrqs; tmp2++) {
1071         sbuf1_i = sbuf1[pa[tmp2]];
1072         jmax    = sbuf1_i[0];
1073         ct1     = 2*jmax+1;
1074         ct2     = 0;
1075         rbuf2_i = rbuf2[tmp2];
1076         rbuf3_i = rbuf3[tmp2];
1077         for (j=1; j<=jmax; j++) {
1078           is_no  = sbuf1_i[2*j-1];
1079           max1   = sbuf1_i[2*j];
1080           lens_i = lens[is_no];
1081           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1082           rmap_i = rmap[is_no];
1083           for (k=0; k<max1; k++,ct1++) {
1084             if (allrows[is_no]) {
1085               row = sbuf1_i[ct1];
1086             } else {
1087 #if defined(PETSC_USE_CTABLE)
1088               PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
1089               row--;
1090               PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1091 #else
1092               row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1093 #endif
1094             }
1095             max2 = rbuf2_i[ct1];
1096             for (l=0; l<max2; l++,ct2++) {
1097               if (!allcolumns[is_no]) {
1098 #if defined(PETSC_USE_CTABLE)
1099                 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
1100 #else
1101                 tcol = cmap_i[rbuf3_i[ct2]];
1102 #endif
1103                 if (tcol) lens_i[row]++;
1104               } else { /* allcolumns */
1105                 lens_i[row]++; /* lens_i[row] += max2 ? */
1106               }
1107             }
1108           }
1109         }
1110       }
1111     }
1112     PetscCall(PetscFree(r_waits3));
1113     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
1114     PetscCall(PetscFree(s_waits3));
1115 
1116     /* Create the submatrices */
1117     for (i=0; i<ismax; i++) {
1118       PetscInt bs_tmp;
1119       if (ijonly) bs_tmp = 1;
1120       else        bs_tmp = bs;
1121 
1122       PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
1123       PetscCall(MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE));
1124 
1125       PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
1126       PetscCall(MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]));
1127       PetscCall(MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i])); /* this subroutine is used by SBAIJ routines */
1128 
1129       /* create struct Mat_SubSppt and attach it to submat */
1130       PetscCall(PetscNew(&smat_i));
1131       subc = (Mat_SeqBAIJ*)submats[i]->data;
1132       subc->submatis1 = smat_i;
1133 
1134       smat_i->destroy          = submats[i]->ops->destroy;
1135       submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1136       submats[i]->factortype   = C->factortype;
1137 
1138       smat_i->id          = i;
1139       smat_i->nrqs        = nrqs;
1140       smat_i->nrqr        = nrqr;
1141       smat_i->rbuf1       = rbuf1;
1142       smat_i->rbuf2       = rbuf2;
1143       smat_i->rbuf3       = rbuf3;
1144       smat_i->sbuf2       = sbuf2;
1145       smat_i->req_source2 = req_source2;
1146 
1147       smat_i->sbuf1       = sbuf1;
1148       smat_i->ptr         = ptr;
1149       smat_i->tmp         = tmp;
1150       smat_i->ctr         = ctr;
1151 
1152       smat_i->pa           = pa;
1153       smat_i->req_size     = req_size;
1154       smat_i->req_source1  = req_source1;
1155 
1156       smat_i->allcolumns  = allcolumns[i];
1157       smat_i->allrows     = allrows[i];
1158       smat_i->singleis    = PETSC_FALSE;
1159       smat_i->row2proc    = row2proc[i];
1160       smat_i->rmap        = rmap[i];
1161       smat_i->cmap        = cmap[i];
1162     }
1163 
1164     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
1165       PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
1166       PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
1167       PetscCall(MatSetType(submats[0],MATDUMMY));
1168 
1169       /* create struct Mat_SubSppt and attach it to submat */
1170       PetscCall(PetscNewLog(submats[0],&smat_i));
1171       submats[0]->data = (void*)smat_i;
1172 
1173       smat_i->destroy          = submats[0]->ops->destroy;
1174       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1175       submats[0]->factortype   = C->factortype;
1176 
1177       smat_i->id          = 0;
1178       smat_i->nrqs        = nrqs;
1179       smat_i->nrqr        = nrqr;
1180       smat_i->rbuf1       = rbuf1;
1181       smat_i->rbuf2       = rbuf2;
1182       smat_i->rbuf3       = rbuf3;
1183       smat_i->sbuf2       = sbuf2;
1184       smat_i->req_source2 = req_source2;
1185 
1186       smat_i->sbuf1       = sbuf1;
1187       smat_i->ptr         = ptr;
1188       smat_i->tmp         = tmp;
1189       smat_i->ctr         = ctr;
1190 
1191       smat_i->pa           = pa;
1192       smat_i->req_size     = req_size;
1193       smat_i->req_source1  = req_source1;
1194 
1195       smat_i->allcolumns  = PETSC_FALSE;
1196       smat_i->singleis    = PETSC_FALSE;
1197       smat_i->row2proc    = NULL;
1198       smat_i->rmap        = NULL;
1199       smat_i->cmap        = NULL;
1200     }
1201 
1202     if (ismax) PetscCall(PetscFree(lens[0]));
1203     PetscCall(PetscFree(lens));
1204     if (sbuf_aj) {
1205       PetscCall(PetscFree(sbuf_aj[0]));
1206       PetscCall(PetscFree(sbuf_aj));
1207     }
1208 
1209   } /* end of scall == MAT_INITIAL_MATRIX */
1210 
1211   /* Post recv matrix values */
1212   if (!ijonly) {
1213     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
1214     PetscCall(PetscMalloc1(nrqs,&rbuf4));
1215     PetscCall(PetscMalloc1(nrqs,&r_waits4));
1216     for (i=0; i<nrqs; ++i) {
1217       PetscCall(PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]));
1218       PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
1219     }
1220 
1221     /* Allocate sending buffers for a->a, and send them off */
1222     PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
1223     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1224 
1225     if (nrqr) PetscCall(PetscMalloc1(j*bs2,&sbuf_aa[0]));
1226     for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1227 
1228     PetscCall(PetscMalloc1(nrqr,&s_waits4));
1229 
1230     for (i=0; i<nrqr; i++) {
1231       rbuf1_i   = rbuf1[i];
1232       sbuf_aa_i = sbuf_aa[i];
1233       ct1       = 2*rbuf1_i[0]+1;
1234       ct2       = 0;
1235       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1236         kmax = rbuf1_i[2*j];
1237         for (k=0; k<kmax; k++,ct1++) {
1238           row    = rbuf1_i[ct1] - rstart;
1239           nzA    = a_i[row+1] - a_i[row];
1240           nzB    = b_i[row+1] - b_i[row];
1241           ncols  = nzA + nzB;
1242           cworkB = b_j + b_i[row];
1243           vworkA = a_a + a_i[row]*bs2;
1244           vworkB = b_a + b_i[row]*bs2;
1245 
1246           /* load the column values for this row into vals*/
1247           vals = sbuf_aa_i+ct2*bs2;
1248           for (l=0; l<nzB; l++) {
1249             if ((bmap[cworkB[l]]) < cstart) {
1250               PetscCall(PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2));
1251             } else break;
1252           }
1253           imark = l;
1254           for (l=0; l<nzA; l++) {
1255             PetscCall(PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2));
1256           }
1257           for (l=imark; l<nzB; l++) {
1258             PetscCall(PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2));
1259           }
1260 
1261           ct2 += ncols;
1262         }
1263       }
1264       PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
1265     }
1266   }
1267 
1268   /* Assemble the matrices */
1269   /* First assemble the local rows */
1270   for (i=0; i<ismax; i++) {
1271     row2proc_i = row2proc[i];
1272     subc      = (Mat_SeqBAIJ*)submats[i]->data;
1273     imat_ilen = subc->ilen;
1274     imat_j    = subc->j;
1275     imat_i    = subc->i;
1276     imat_a    = subc->a;
1277 
1278     if (!allcolumns[i]) cmap_i = cmap[i];
1279     rmap_i = rmap[i];
1280     irow_i = irow[i];
1281     jmax   = nrow[i];
1282     for (j=0; j<jmax; j++) {
1283       if (allrows[i]) row = j;
1284       else row  = irow_i[j];
1285       proc = row2proc_i[j];
1286 
1287       if (proc == rank) {
1288 
1289         row    = row - rstart;
1290         nzA    = a_i[row+1] - a_i[row];
1291         nzB    = b_i[row+1] - b_i[row];
1292         cworkA = a_j + a_i[row];
1293         cworkB = b_j + b_i[row];
1294         if (!ijonly) {
1295           vworkA = a_a + a_i[row]*bs2;
1296           vworkB = b_a + b_i[row]*bs2;
1297         }
1298 
1299         if (allrows[i]) {
1300           row = row+rstart;
1301         } else {
1302 #if defined(PETSC_USE_CTABLE)
1303           PetscCall(PetscTableFind(rmap_i,row+rstart+1,&row));
1304           row--;
1305 
1306           PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1307 #else
1308           row = rmap_i[row + rstart];
1309 #endif
1310         }
1311         mat_i = imat_i[row];
1312         if (!ijonly) mat_a = imat_a + mat_i*bs2;
1313         mat_j    = imat_j + mat_i;
1314         ilen = imat_ilen[row];
1315 
1316         /* load the column indices for this row into cols*/
1317         if (!allcolumns[i]) {
1318           for (l=0; l<nzB; l++) {
1319             if ((ctmp = bmap[cworkB[l]]) < cstart) {
1320 #if defined(PETSC_USE_CTABLE)
1321               PetscCall(PetscTableFind(cmap_i,ctmp+1,&tcol));
1322               if (tcol) {
1323 #else
1324               if ((tcol = cmap_i[ctmp])) {
1325 #endif
1326                 *mat_j++ = tcol - 1;
1327                 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2));
1328                 mat_a   += bs2;
1329                 ilen++;
1330               }
1331             } else break;
1332           }
1333           imark = l;
1334           for (l=0; l<nzA; l++) {
1335 #if defined(PETSC_USE_CTABLE)
1336             PetscCall(PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol));
1337             if (tcol) {
1338 #else
1339             if ((tcol = cmap_i[cstart + cworkA[l]])) {
1340 #endif
1341               *mat_j++ = tcol - 1;
1342               if (!ijonly) {
1343                 PetscCall(PetscArraycpy(mat_a,vworkA+l*bs2,bs2));
1344                 mat_a += bs2;
1345               }
1346               ilen++;
1347             }
1348           }
1349           for (l=imark; l<nzB; l++) {
1350 #if defined(PETSC_USE_CTABLE)
1351             PetscCall(PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol));
1352             if (tcol) {
1353 #else
1354             if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1355 #endif
1356               *mat_j++ = tcol - 1;
1357               if (!ijonly) {
1358                 PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2));
1359                 mat_a += bs2;
1360               }
1361               ilen++;
1362             }
1363           }
1364         } else { /* allcolumns */
1365           for (l=0; l<nzB; l++) {
1366             if ((ctmp = bmap[cworkB[l]]) < cstart) {
1367               *mat_j++ = ctmp;
1368               PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2));
1369               mat_a   += bs2;
1370               ilen++;
1371             } else break;
1372           }
1373           imark = l;
1374           for (l=0; l<nzA; l++) {
1375             *mat_j++ = cstart+cworkA[l];
1376             if (!ijonly) {
1377               PetscCall(PetscArraycpy(mat_a,vworkA+l*bs2,bs2));
1378               mat_a += bs2;
1379             }
1380             ilen++;
1381           }
1382           for (l=imark; l<nzB; l++) {
1383             *mat_j++ = bmap[cworkB[l]];
1384             if (!ijonly) {
1385               PetscCall(PetscArraycpy(mat_a,vworkB+l*bs2,bs2));
1386               mat_a += bs2;
1387             }
1388             ilen++;
1389           }
1390         }
1391         imat_ilen[row] = ilen;
1392       }
1393     }
1394   }
1395 
1396   /* Now assemble the off proc rows */
1397   if (!ijonly) {
1398     PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
1399   }
1400   for (tmp2=0; tmp2<nrqs; tmp2++) {
1401     sbuf1_i = sbuf1[pa[tmp2]];
1402     jmax    = sbuf1_i[0];
1403     ct1     = 2*jmax + 1;
1404     ct2     = 0;
1405     rbuf2_i = rbuf2[tmp2];
1406     rbuf3_i = rbuf3[tmp2];
1407     if (!ijonly) rbuf4_i = rbuf4[tmp2];
1408     for (j=1; j<=jmax; j++) {
1409       is_no     = sbuf1_i[2*j-1];
1410       rmap_i    = rmap[is_no];
1411       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1412       subc      = (Mat_SeqBAIJ*)submats[is_no]->data;
1413       imat_ilen = subc->ilen;
1414       imat_j    = subc->j;
1415       imat_i    = subc->i;
1416       if (!ijonly) imat_a    = subc->a;
1417       max1      = sbuf1_i[2*j];
1418       for (k=0; k<max1; k++,ct1++) { /* for each recved block row */
1419         row = sbuf1_i[ct1];
1420 
1421         if (allrows[is_no]) {
1422           row = sbuf1_i[ct1];
1423         } else {
1424 #if defined(PETSC_USE_CTABLE)
1425           PetscCall(PetscTableFind(rmap_i,row+1,&row));
1426           row--;
1427           PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1428 #else
1429           row = rmap_i[row];
1430 #endif
1431         }
1432         ilen  = imat_ilen[row];
1433         mat_i = imat_i[row];
1434         if (!ijonly) mat_a = imat_a + mat_i*bs2;
1435         mat_j = imat_j + mat_i;
1436         max2  = rbuf2_i[ct1];
1437         if (!allcolumns[is_no]) {
1438           for (l=0; l<max2; l++,ct2++) {
1439 #if defined(PETSC_USE_CTABLE)
1440             PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
1441 #else
1442             tcol = cmap_i[rbuf3_i[ct2]];
1443 #endif
1444             if (tcol) {
1445               *mat_j++ = tcol - 1;
1446               if (!ijonly) {
1447                 PetscCall(PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2));
1448                 mat_a += bs2;
1449               }
1450               ilen++;
1451             }
1452           }
1453         } else { /* allcolumns */
1454           for (l=0; l<max2; l++,ct2++) {
1455             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1456             if (!ijonly) {
1457               PetscCall(PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2));
1458               mat_a += bs2;
1459             }
1460             ilen++;
1461           }
1462         }
1463         imat_ilen[row] = ilen;
1464       }
1465     }
1466   }
1467 
1468   if (!iscsorted) { /* sort column indices of the rows */
1469     MatScalar *work;
1470 
1471     PetscCall(PetscMalloc1(bs2,&work));
1472     for (i=0; i<ismax; i++) {
1473       subc      = (Mat_SeqBAIJ*)submats[i]->data;
1474       imat_ilen = subc->ilen;
1475       imat_j    = subc->j;
1476       imat_i    = subc->i;
1477       if (!ijonly) imat_a = subc->a;
1478       if (allcolumns[i]) continue;
1479 
1480       jmax = nrow[i];
1481       for (j=0; j<jmax; j++) {
1482         mat_i = imat_i[j];
1483         mat_j = imat_j + mat_i;
1484         ilen  = imat_ilen[j];
1485         if (ijonly) {
1486           PetscCall(PetscSortInt(ilen,mat_j));
1487         } else {
1488           mat_a = imat_a + mat_i*bs2;
1489           PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
1490         }
1491       }
1492     }
1493     PetscCall(PetscFree(work));
1494   }
1495 
1496   if (!ijonly) {
1497     PetscCall(PetscFree(r_waits4));
1498     PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
1499     PetscCall(PetscFree(s_waits4));
1500   }
1501 
1502   /* Restore the indices */
1503   for (i=0; i<ismax; i++) {
1504     if (!allrows[i]) {
1505       PetscCall(ISRestoreIndices(isrow[i],irow+i));
1506     }
1507     if (!allcolumns[i]) {
1508       PetscCall(ISRestoreIndices(iscol[i],icol+i));
1509     }
1510   }
1511 
1512   for (i=0; i<ismax; i++) {
1513     PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
1514     PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
1515   }
1516 
1517   PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));
1518   PetscCall(PetscFree5(row2proc,cmap,rmap,allcolumns,allrows));
1519 
1520   if (!ijonly) {
1521     if (sbuf_aa) {
1522       PetscCall(PetscFree(sbuf_aa[0]));
1523       PetscCall(PetscFree(sbuf_aa));
1524     }
1525 
1526     for (i=0; i<nrqs; ++i) {
1527       PetscCall(PetscFree(rbuf4[i]));
1528     }
1529     PetscCall(PetscFree(rbuf4));
1530   }
1531   c->ijonly = PETSC_FALSE; /* set back to the default */
1532   PetscFunctionReturn(0);
1533 }
1534