xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 912ebf9aa32cd6ffee33ffd0333e9b4fec2e8450)
1 #ifndef lint
2 static char vcid[] = "$Id: mpibaij.c,v 1.57 1997/03/13 16:30:14 curfman Exp balay $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"
6 #include "src/vec/vecimpl.h"
7 
8 #include "draw.h"
9 #include "pinclude/pviewer.h"
10 
11 extern int MatSetUpMultiply_MPIBAIJ(Mat);
12 extern int DisAssemble_MPIBAIJ(Mat);
13 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
14 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
15 extern int MatLUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,Mat *);
16 extern int MatLUFactorNumeric_MPIBAIJ(Mat,Mat *);
17 extern int MatLUFactor_MPIBAIJ(Mat,IS,IS,double);
18 extern int MatSolve_MPIBAIJ(Mat,Vec,Vec);
19 extern int MatSolveAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
20 extern int MatSolveTrans_MPIBAIJ(Mat,Vec,Vec);
21 extern int MatSolveTransAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
22 extern int MatILUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,int,Mat *);
23 
24 
25 /*
26      Local utility routine that creates a mapping from the global column
27    number to the local number in the off-diagonal part of the local
28    storage of the matrix.  This is done in a non-scalable way since the
29    length of colmap equals the global matrix length.
30 */
31 #undef __FUNC__
32 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
33 static int CreateColmap_MPIBAIJ_Private(Mat mat)
34 {
35   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
36   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
37   int         nbs = B->nbs,i,bs=B->bs;
38 
39   baij->colmap = (int *) PetscMalloc(baij->Nbs*sizeof(int));CHKPTRQ(baij->colmap);
40   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
41   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
42   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
43   return 0;
44 }
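
/*
   Illustration of the colmap encoding above (a sketch, not part of the
   build): with bs = 2 and garray[] = {3,7} (global block columns 3 and 7
   present in the off-diagonal part), the loop sets colmap[3] = 1 and
   colmap[7] = 3, i.e. colmap[g] = bs*(local block index) + 1, with
   colmap[g] = 0 marking an absent block.  The local (bs-scaled) column of
   a global point column c is then recovered as

       lcol = baij->colmap[c/bs] - 1 + c%bs;

   which is the lookup performed in MatSetValues_MPIBAIJ() below.
*/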
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_MPIBAIJ"
48 static int MatGetRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
52   int         ierr;
53   if (aij->size == 1) {
54     ierr = MatGetRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
55   } else SETERRQ(1,0,"not supported in parallel");
56   return 0;
57 }
58 
59 #undef __FUNC__
60 #define __FUNC__ "MatRestoreRowIJ_MPIBAIJ"
61 static int MatRestoreRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
62                                 PetscTruth *done)
63 {
64   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
65   int        ierr;
66   if (aij->size == 1) {
67     ierr = MatRestoreRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
68   } else SETERRQ(1,0,"not supported in parallel");
69   return 0;
70 }
71 #define CHUNKSIZE  10
72 
73 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
74 { \
75  \
76     brow = row/bs;  \
77     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
78     rmax = imax[brow]; nrow = ailen[brow]; \
79       bcol = col/bs; \
80       ridx = row % bs; cidx = col % bs; \
81       low = 0; high = nrow; \
82       while (high-low > 3) { \
83         t = (low+high)/2; \
84         if (rp[t] > bcol) high = t; \
85         else              low  = t; \
86       } \
87       for ( _i=low; _i<high; _i++ ) { \
88         if (rp[_i] > bcol) break; \
89         if (rp[_i] == bcol) { \
90           bap  = ap +  bs2*_i + bs*cidx + ridx; \
91           if (addv == ADD_VALUES) *bap += value;  \
92           else                    *bap  = value;  \
93           goto _noinsert; \
94         } \
95       } \
96       if (a->nonew) goto _noinsert; \
97       if (nrow >= rmax) { \
98         /* there is no extra room in row, therefore enlarge */ \
99         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
100         Scalar *new_a; \
101  \
102         /* malloc new storage space */ \
103         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
104         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
105         new_j   = (int *) (new_a + bs2*new_nz); \
106         new_i   = new_j + new_nz; \
107  \
108         /* copy over old data into new slots */ \
109         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
110         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
111         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
112         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
113         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
114                                                            len*sizeof(int)); \
115         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
116         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
117         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
118                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
119         /* free up old matrix storage */ \
120         PetscFree(a->a);  \
121         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
122         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
123         a->singlemalloc = 1; \
124  \
125         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
126         rmax = imax[brow] = imax[brow] + CHUNKSIZE; \
127         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
128         a->maxnz += bs2*CHUNKSIZE; \
129         a->reallocs++; \
131       } \
132       N = nrow++ - 1;  \
133       /* shift up all the later entries in this row */ \
134       for ( ii=N; ii>=_i; ii-- ) { \
135         rp[ii+1] = rp[ii]; \
136         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
137       } \
138       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
139       rp[_i]                      = bcol;  \
140       ap[bs2*_i + bs*cidx + ridx] = value; a->nz++;  \
141       _noinsert:; \
142     ailen[brow] = nrow; \
143 }
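
/*
   Notes on the macro above (descriptive only): it mirrors the insertion
   logic of MatSetValues_SeqBAIJ() for the diagonal block A.  A short
   binary search narrows the block row to a few candidate block columns,
   a linear scan locates the exact position, and a full row triggers a
   reallocation that grows the storage by CHUNKSIZE block entries.  The
   caller must declare the work variables (rp, ap, bap, nrow, rmax, low,
   high, t, _i, ii, N, brow, bcol, ridx, cidx) and cache the Mat_SeqBAIJ
   pointers, as is done at the top of MatSetValues_MPIBAIJ() below.
*/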
144 
145 extern int MatSetValues_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
146 #undef __FUNC__
147 #define __FUNC__ "MatSetValues_MPIBAIJ"
148 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
149 {
150   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
151   Scalar      value;
152   int         ierr,i,j,row,col;
153   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
154   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
155   int         cend_orig=baij->cend_bs,bs=baij->bs;
156 
157   /* Some variables required by the macro */
158   Mat         A = baij->A;
159   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
160   int         *rp,ii,nrow,_i,rmax,N;
161   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen;
162   int         *aj=a->j,brow,bcol;
163   int         low,high,t,ridx,cidx,bs2=a->bs2;
164   Scalar      *ap,*aa=a->a,*bap;
165 
166   for ( i=0; i<m; i++ ) {
167 #if defined(PETSC_BOPT_g)
168     if (im[i] < 0) SETERRQ(1,0,"Negative row");
169     if (im[i] >= baij->M) SETERRQ(1,0,"Row too large");
170 #endif
171     if (im[i] >= rstart_orig && im[i] < rend_orig) {
172       row = im[i] - rstart_orig;
173       for ( j=0; j<n; j++ ) {
174         if (in[j] >= cstart_orig && in[j] < cend_orig){
175           col = in[j] - cstart_orig;
176           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
177           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
178           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
179         }
180 #if defined(PETSC_BOPT_g)
181         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
182         else if (in[j] >= baij->N) {SETERRQ(1,0,"Col too large");}
183 #endif
184         else {
185           if (mat->was_assembled) {
186             if (!baij->colmap) {
187               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
188             }
189             col = baij->colmap[in[j]/bs] - 1;
190             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
191               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
192               col = in[j];
193             } else col += in[j]%bs; /* colmap[] holds bs*(local block col)+1 */
194           }
195           else col = in[j];
196           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
197           ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
198         }
199       }
200     }
201     else {
202       if (roworiented && !baij->donotstash) {
203         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
204       }
205       else {
206         if (!baij->donotstash) {
207           row = im[i];
208           for ( j=0; j<n; j++ ) {
209             ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
210           }
211         }
212       }
213     }
214   }
215   return 0;
216 }
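
/*
   Typical point-wise usage (a minimal sketch; A is assumed to be a matrix
   created with MatCreateMPIBAIJ() and the indices are global point, not
   block, indices):

       Scalar v[4]    = {1.0, 2.0, 3.0, 4.0};
       int    rows[2] = {0, 1}, cols[2] = {0, 1};
       ierr = MatSetValues(A,2,rows,2,cols,v,INSERT_VALUES); CHKERRQ(ierr);

   Entries destined for rows owned by another processor are stashed and
   communicated during MatAssemblyBegin()/MatAssemblyEnd().
*/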
217 
218 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
219 #undef __FUNC__
220 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
221 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
222 {
223   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
224   Scalar      *value,*tmp;
225   int         ierr,i,j,ii,jj,row,col,k,l;
226   int         roworiented = baij->roworiented,rstart=baij->rstart ;
227   int         rend=baij->rend,cstart=baij->cstart,stepval;
228   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
229 
230   /* Should be stashed somewhere to avoid multiple mallocs */
231   tmp = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(tmp);
232   if (roworiented) {
233     stepval = (n-1)*bs;
234   } else {
235     stepval = (m-1)*bs;
236   }
237   for ( i=0; i<m; i++ ) {
238 #if defined(PETSC_BOPT_g)
239     if (im[i] < 0) SETERRQ(1,0,"Negative row");
240     if (im[i] >= baij->Mbs) SETERRQ(1,0,"Row too large");
241 #endif
242     if (im[i] >= rstart && im[i] < rend) {
243       row = im[i] - rstart;
244       for ( j=0; j<n; j++ ) {
245         if (roworiented) {
246           value = v + i*(stepval+bs)*bs + j*bs;
247         } else {
248           value = v + j*(stepval+bs)*bs + i*bs;
249         }
250         for ( ii=0; ii<bs; ii++,value+=stepval )
251           for (jj=0; jj<bs; jj++ )
252             *tmp++  = *value++;
253         tmp -=bs2;
254 
255         if (in[j] >= cstart && in[j] < cend){
256           col = in[j] - cstart;
257           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,tmp,addv);CHKERRQ(ierr);
258         }
259 #if defined(PETSC_BOPT_g)
260         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
261         else if (in[j] >= baij->Nbs) {SETERRQ(1,0,"Col too large");}
262 #endif
263         else {
264           if (mat->was_assembled) {
265             if (!baij->colmap) {
266               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
267             }
268             col = baij->colmap[in[j]] - 1;
269             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
270               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
271               col = in[j];
272             } else col = col/bs; /* colmap[] holds bs*(local block col)+1 */
273           }
274           else col = in[j];
275           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,tmp,addv);CHKERRQ(ierr);
276         }
277       }
278     }
279     else {
280       if (!baij->donotstash) {
281         if (roworiented ) {
282           row   = im[i]*bs;
283           value = v + i*(stepval+bs)*bs;
284           for ( j=0; j<bs; j++,row++ ) {
285             for ( k=0; k<n; k++ ) {
286               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
287                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
288               }
289             }
290           }
291         }
292         else {
293           for ( j=0; j<n; j++ ) {
294             value = v + j*(stepval+bs)*bs + i*bs;
295             col   = in[j]*bs;
296             for ( k=0; k<bs; k++,col++,value+=stepval) {
297               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
298                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
299               }
300             }
301           }
302 
303         }
304       }
305     }
306   }
307   PetscFree(tmp);
308   return 0;
309 }
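
/*
   Blocked usage sketch (assumes bs = 2; the row and column indices are
   block indices, and v holds one bs-by-bs block, stored by rows when the
   matrix is row oriented, which is the default):

       Scalar v[4] = {1.0, 2.0, 3.0, 4.0};
       int    row = 0, col = 1;
       ierr = MatSetValuesBlocked(A,1,&row,1,&col,v,ADD_VALUES); CHKERRQ(ierr);
*/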
310 
311 #undef __FUNC__
312 #define __FUNC__ "MatGetValues_MPIBAIJ"
313 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
314 {
315   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
316   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
317   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
318 
319   for ( i=0; i<m; i++ ) {
320     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
321     if (idxm[i] >= baij->M) SETERRQ(1,0,"Row too large");
322     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
323       row = idxm[i] - bsrstart;
324       for ( j=0; j<n; j++ ) {
325         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
326         if (idxn[j] >= baij->N) SETERRQ(1,0,"Col too large");
327         if (idxn[j] >= bscstart && idxn[j] < bscend){
328           col = idxn[j] - bscstart;
329           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
330         }
331         else {
332           if (!baij->colmap) {
333             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
334           }
335           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
336              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
337           else {
338             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
339             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
340           }
341         }
342       }
343     }
344     else {
345       SETERRQ(1,0,"Only local values currently supported");
346     }
347   }
348   return 0;
349 }
350 
351 #undef __FUNC__
352 #define __FUNC__ "MatNorm_MPIBAIJ"
353 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
354 {
355   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
356   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
357   int        ierr, i,bs2=baij->bs2;
358   double     sum = 0.0;
359   Scalar     *v;
360 
361   if (baij->size == 1) {
362     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
363   } else {
364     if (type == NORM_FROBENIUS) {
365       v = amat->a;
366       for (i=0; i<amat->nz*bs2; i++ ) {
367 #if defined(PETSC_COMPLEX)
368         sum += real(conj(*v)*(*v)); v++;
369 #else
370         sum += (*v)*(*v); v++;
371 #endif
372       }
373       v = bmat->a;
374       for (i=0; i<bmat->nz*bs2; i++ ) {
375 #if defined(PETSC_COMPLEX)
376         sum += real(conj(*v)*(*v)); v++;
377 #else
378         sum += (*v)*(*v); v++;
379 #endif
380       }
381       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
382       *norm = sqrt(*norm);
383     }
384     else
385       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
386   }
387   return 0;
388 }
389 
390 #undef __FUNC__
391 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
392 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
393 {
394   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
395   MPI_Comm    comm = mat->comm;
396   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
397   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
398   MPI_Request *send_waits,*recv_waits;
399   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
400   InsertMode  addv;
401   Scalar      *rvalues,*svalues;
402 
403   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
404   MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
405   if (addv == (ADD_VALUES|INSERT_VALUES)) {
406     SETERRQ(1,0,"Some processors inserted others added");
407   }
408   mat->insertmode = addv; /* in case this processor had no cache */
409 
410   /*  first count number of contributors to each processor */
411   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
412   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
413   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
414   for ( i=0; i<baij->stash.n; i++ ) {
415     idx = baij->stash.idx[i];
416     for ( j=0; j<size; j++ ) {
417       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
418         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
419       }
420     }
421   }
422   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
423 
424   /* inform other processors of number of messages and max length*/
425   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
426   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
427   nreceives = work[rank];
428   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
429   nmax = work[rank];
430   PetscFree(work);
431 
432   /* post receives:
433        1) each message will consist of ordered triples
434      (global row index,global column index,value); we store the indices
435      as Scalars (doubles) to simplify the message passing.
436        2) since we don't know how long each individual message is we
437      allocate the largest needed buffer for each receive. Potentially
438      this is a lot of wasted space.
439 
440 
441        This could be done better.
442   */
443   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
444   CHKPTRQ(rvalues);
445   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
446   CHKPTRQ(recv_waits);
447   for ( i=0; i<nreceives; i++ ) {
448     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
449               comm,recv_waits+i);
450   }
451 
452   /* do sends:
453       1) starts[i] gives the starting index in svalues for stuff going to
454          the ith processor
455   */
456   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
457   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
458   CHKPTRQ(send_waits);
459   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
460   starts[0] = 0;
461   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
462   for ( i=0; i<baij->stash.n; i++ ) {
463     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
464     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
465     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
466   }
467   PetscFree(owner);
468   starts[0] = 0;
469   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
470   count = 0;
471   for ( i=0; i<size; i++ ) {
472     if (procs[i]) {
473       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
474                 comm,send_waits+count++);
475     }
476   }
477   PetscFree(starts); PetscFree(nprocs);
478 
479   /* Free cache space */
480   PLogInfo(mat,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
481   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
482 
483   baij->svalues    = svalues;    baij->rvalues    = rvalues;
484   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
485   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
486   baij->rmax       = nmax;
487 
488   return 0;
489 }
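
/*
   Outline of the communication above: each processor counts how many
   stashed (row,col,value) triples are destined for every other processor,
   two MPI_Allreduce calls establish how many messages will arrive and the
   maximum message length, receives are posted, and the triples are packed
   into svalues and sent as arrays of Scalars.  MatAssemblyEnd_MPIBAIJ()
   below unpacks them into the local A and B parts.
*/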
490 
491 
492 #undef __FUNC__
493 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
494 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
495 {
496   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
497   MPI_Status  *send_status,recv_status;
498   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
499   int         bs=baij->bs,row,col,other_disassembled;
500   Scalar      *values,val;
501   InsertMode  addv = mat->insertmode;
502 
503   /*  wait on receives */
504   while (count) {
505     MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
506     /* unpack receives into our local space */
507     values = baij->rvalues + 3*imdex*baij->rmax;
508     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
509     n = n/3;
510     for ( i=0; i<n; i++ ) {
511       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
512       col = (int) PetscReal(values[3*i+1]);
513       val = values[3*i+2];
514       if (col >= baij->cstart*bs && col < baij->cend*bs) {
515         col -= baij->cstart*bs;
516         MatSetValues(baij->A,1,&row,1,&col,&val,addv);
517       }
518       else {
519         if (mat->was_assembled) {
520           if (!baij->colmap) {
521             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
522           }
523           col = baij->colmap[col/bs] - 1;
524           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
525             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
526             col = (int) PetscReal(values[3*i+1]);
527           } else col += ((int) PetscReal(values[3*i+1]))%bs;
528         }
529         MatSetValues(baij->B,1,&row,1,&col,&val,addv);
530       }
531     }
532     count--;
533   }
534   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
535 
536   /* wait on sends */
537   if (baij->nsends) {
538     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
539     CHKPTRQ(send_status);
540     MPI_Waitall(baij->nsends,baij->send_waits,send_status);
541     PetscFree(send_status);
542   }
543   PetscFree(baij->send_waits); PetscFree(baij->svalues);
544 
545   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
546   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
547 
548   /* determine if any processor has disassembled, if so we must
549      also disassemble ourselves, in order that we may reassemble. */
550   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
551   if (mat->was_assembled && !other_disassembled) {
552     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
553   }
554 
555   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
556     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
557   }
558   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
559   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
560 
561   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
562   return 0;
563 }
564 
565 #undef __FUNC__
566 #define __FUNC__ "MatView_MPIBAIJ_Binary"
567 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
568 {
569   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
570   int          ierr;
571 
572   if (baij->size == 1) {
573     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
574   }
575   else SETERRQ(1,0,"Only uniprocessor output supported");
576   return 0;
577 }
578 
579 #undef __FUNC__
580 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
581 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
582 {
583   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
584   int          ierr, format,rank,bs = baij->bs;
585   FILE         *fd;
586   ViewerType   vtype;
587 
588   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
589   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
590     ierr = ViewerGetFormat(viewer,&format);
591     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
592       MatInfo info;
593       MPI_Comm_rank(mat->comm,&rank);
594       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
595       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
596       PetscSequentialPhaseBegin(mat->comm,1);
597       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
598               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
599               baij->bs,(int)info.memory);
600       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
601       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
602       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
603       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
604       fflush(fd);
605       PetscSequentialPhaseEnd(mat->comm,1);
606       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
607       return 0;
608     }
609     else if (format == VIEWER_FORMAT_ASCII_INFO) {
610       PetscPrintf(mat->comm,"  block size is %d\n",bs);
611       return 0;
612     }
613   }
614 
615   if (vtype == DRAW_VIEWER) {
616     Draw       draw;
617     PetscTruth isnull;
618     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
619     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
620   }
621 
622   if (vtype == ASCII_FILE_VIEWER) {
623     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
624     PetscSequentialPhaseBegin(mat->comm,1);
625     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
626            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
627             baij->cstart*bs,baij->cend*bs);
628     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
629     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
630     fflush(fd);
631     PetscSequentialPhaseEnd(mat->comm,1);
632   }
633   else {
634     int size = baij->size;
635     rank = baij->rank;
636     if (size == 1) {
637       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
638     }
639     else {
640       /* assemble the entire matrix onto first processor. */
641       Mat         A;
642       Mat_SeqBAIJ *Aloc;
643       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
644       int         mbs=baij->mbs;
645       Scalar      *a;
646 
647       if (!rank) {
648         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
649         CHKERRQ(ierr);
650       }
651       else {
652         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
653         CHKERRQ(ierr);
654       }
655       PLogObjectParent(mat,A);
656 
657       /* copy over the A part */
658       Aloc = (Mat_SeqBAIJ*) baij->A->data;
659       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
660       row = baij->rstart;
661       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
662 
663       for ( i=0; i<mbs; i++ ) {
664         rvals[0] = bs*(baij->rstart + i);
665         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
666         for ( j=ai[i]; j<ai[i+1]; j++ ) {
667           col = (baij->cstart+aj[j])*bs;
668           for (k=0; k<bs; k++ ) {
669             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
670             col++; a += bs;
671           }
672         }
673       }
674       /* copy over the B part */
675       Aloc = (Mat_SeqBAIJ*) baij->B->data;
676       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
677       row = baij->rstart*bs;
678       for ( i=0; i<mbs; i++ ) {
679         rvals[0] = bs*(baij->rstart + i);
680         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
681         for ( j=ai[i]; j<ai[i+1]; j++ ) {
682           col = baij->garray[aj[j]]*bs;
683           for (k=0; k<bs; k++ ) {
684             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
685             col++; a += bs;
686           }
687         }
688       }
689       PetscFree(rvals);
690       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
691       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
692       if (!rank) {
693         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
694       }
695       ierr = MatDestroy(A); CHKERRQ(ierr);
696     }
697   }
698   return 0;
699 }
700 
701 
702 
703 #undef __FUNC__
704 #define __FUNC__ "MatView_MPIBAIJ"
705 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
706 {
707   Mat         mat = (Mat) obj;
708   int         ierr;
709   ViewerType  vtype;
710 
711   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
712   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
713       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
714     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
715   }
716   else if (vtype == BINARY_FILE_VIEWER) {
717     return MatView_MPIBAIJ_Binary(mat,viewer);
718   }
719   return 0;
720 }
721 
722 #undef __FUNC__
723 #define __FUNC__ "MatDestroy_MPIBAIJ"
724 int MatDestroy_MPIBAIJ(PetscObject obj)
725 {
726   Mat         mat = (Mat) obj;
727   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
728   int         ierr;
729 
730 #if defined(PETSC_LOG)
731   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
732 #endif
733 
734   PetscFree(baij->rowners);
735   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
736   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
737   if (baij->colmap) PetscFree(baij->colmap);
738   if (baij->garray) PetscFree(baij->garray);
739   if (baij->lvec)   VecDestroy(baij->lvec);
740   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
741   if (baij->rowvalues) PetscFree(baij->rowvalues);
742   PetscFree(baij);
743   if (mat->mapping) {
744     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
745   }
746   PLogObjectDestroy(mat);
747   PetscHeaderDestroy(mat);
748   return 0;
749 }
750 
751 #undef __FUNC__
752 #define __FUNC__ "MatMult_MPIBAIJ"
753 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
754 {
755   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
756   int         ierr, nt;
757 
758   VecGetLocalSize_Fast(xx,nt);
759   if (nt != a->n) {
760     SETERRQ(1,0,"Incompatible partition of A and xx");
761   }
762   VecGetLocalSize_Fast(yy,nt);
763   if (nt != a->m) {
764     SETERRQ(1,0,"Incompatible partition of A and yy");
765   }
766   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
767   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
768   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
769   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
770   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
771   return 0;
772 }
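
/*
   The scatter/compute pattern above overlaps communication with
   computation: the forward scatter that gathers the needed off-processor
   entries of xx into a->lvec is started, the local diagonal block A is
   applied while the messages are in flight, and the off-diagonal
   contribution B*lvec is added once the scatter completes; the final
   VecScatterPostRecvs() posts the receives for the next multiply.
*/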
773 
774 #undef __FUNC__
775 #define __FUNC__ "MatMultAdd_MPIBAIJ"
776 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
777 {
778   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
779   int        ierr;
780   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
781   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
782   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
783   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
784   return 0;
785 }
786 
787 #undef __FUNC__
788 #define __FUNC__ "MatMultTrans_MPIBAIJ"
789 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
790 {
791   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
792   int        ierr;
793 
794   /* do nondiagonal part */
795   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
796   /* send it on its way */
797   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
798   /* do local part */
799   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
800   /* receive remote parts: note this assumes the values are not actually */
801   /* inserted in yy until the next line, which is true for this implementation */
802   /* but may not always be true. */
803   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
804   return 0;
805 }
806 
807 #undef __FUNC__
808 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
809 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
810 {
811   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
812   int        ierr;
813 
814   /* do nondiagonal part */
815   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
816   /* send it on its way */
817   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
818   /* do local part */
819   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
820   /* receive remote parts: note this assumes the values are not actually */
821   /* inserted in zz until the next line, which is true for this implementation */
822   /* but may not always be true. */
823   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
824   return 0;
825 }
826 
827 /*
828   This only works correctly for square matrices where the subblock A->A is the
829    diagonal block
830 */
831 #undef __FUNC__
832 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
833 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
834 {
835   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
836   if (a->M != a->N)
837     SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
838   return MatGetDiagonal(a->A,v);
839 }
840 
841 #undef __FUNC__
842 #define __FUNC__ "MatScale_MPIBAIJ"
843 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
844 {
845   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
846   int        ierr;
847   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
848   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
849   return 0;
850 }
851 
852 #undef __FUNC__
853 #define __FUNC__ "MatGetSize_MPIBAIJ"
854 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
855 {
856   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
857   *m = mat->M; *n = mat->N;
858   return 0;
859 }
860 
861 #undef __FUNC__
862 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
863 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
864 {
865   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
866   *m = mat->m; *n = mat->n;
867   return 0;
868 }
869 
870 #undef __FUNC__
871 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
872 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
873 {
874   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
875   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
876   return 0;
877 }
878 
879 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
880 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
881 
882 #undef __FUNC__
883 #define __FUNC__ "MatGetRow_MPIBAIJ"
884 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
885 {
886   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
887   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
888   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
889   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
890   int        *cmap, *idx_p,cstart = mat->cstart;
891 
892   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
893   mat->getrowactive = PETSC_TRUE;
894 
895   if (!mat->rowvalues && (idx || v)) {
896     /*
897         allocate enough space to hold information from the longest row.
898     */
899     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
900     int     max = 1,mbs = mat->mbs,tmp;
901     for ( i=0; i<mbs; i++ ) {
902       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
903       if (max < tmp) { max = tmp; }
904     }
905     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
906     CHKPTRQ(mat->rowvalues);
907     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
908   }
909 
910 
911   if (row < brstart || row >= brend) SETERRQ(1,0,"Only local rows");
912   lrow = row - brstart;
913 
914   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
915   if (!v)   {pvA = 0; pvB = 0;}
916   if (!idx) {pcA = 0; if (!v) pcB = 0;}
917   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
918   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
919   nztot = nzA + nzB;
920 
921   cmap  = mat->garray;
922   if (v  || idx) {
923     if (nztot) {
924       /* Sort by increasing column numbers, assuming A and B already sorted */
925       int imark = -1;
926       if (v) {
927         *v = v_p = mat->rowvalues;
928         for ( i=0; i<nzB; i++ ) {
929           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
930           else break;
931         }
932         imark = i;
933         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
934         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
935       }
936       if (idx) {
937         *idx = idx_p = mat->rowindices;
938         if (imark > -1) {
939           for ( i=0; i<imark; i++ ) {
940             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
941           }
942         } else {
943           for ( i=0; i<nzB; i++ ) {
944             if (cmap[cworkB[i]/bs] < cstart)
945               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
946             else break;
947           }
948           imark = i;
949         }
950         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
951         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
952       }
953     }
954     else {
955       if (idx) *idx = 0;
956       if (v)   *v   = 0;
957     }
958   }
959   *nz = nztot;
960   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
961   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
962   return 0;
963 }
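
/*
   The merge above returns the row in increasing global column order:
   first the B (off-diagonal) entries whose global block column lies below
   cstart, then all A (diagonal) entries, then the remaining B entries,
   with cmap (garray) translating B's compressed block columns back to
   global numbering.
*/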
964 
965 #undef __FUNC__
966 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
967 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
968 {
969   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
970   if (baij->getrowactive == PETSC_FALSE) {
971     SETERRQ(1,0,"MatGetRow not called");
972   }
973   baij->getrowactive = PETSC_FALSE;
974   return 0;
975 }
976 
977 #undef __FUNC__
978 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
979 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
980 {
981   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
982   *bs = baij->bs;
983   return 0;
984 }
985 
986 #undef __FUNC__
987 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
988 int MatZeroEntries_MPIBAIJ(Mat A)
989 {
990   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
991   int         ierr;
992   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
993   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
994   return 0;
995 }
996 
997 #undef __FUNC__
998 #define __FUNC__ "MatGetInfo_MPIBAIJ"
999 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1000 {
1001   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1002   Mat         A = a->A, B = a->B;
1003   int         ierr;
1004   double      isend[5], irecv[5];
1005 
1006   info->rows_global    = (double)a->M;
1007   info->columns_global = (double)a->N;
1008   info->rows_local     = (double)a->m;
1009   info->columns_local  = (double)a->n;
1010   info->block_size     = (double)a->bs;
1011   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1012   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded; isend[3] = info->memory; isend[4] = info->mallocs;
1013   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1014   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded; isend[3] += info->memory; isend[4] += info->mallocs;
1015   if (flag == MAT_LOCAL) {
1016     info->nz_used      = isend[0];
1017     info->nz_allocated = isend[1];
1018     info->nz_unneeded  = isend[2];
1019     info->memory       = isend[3];
1020     info->mallocs      = isend[4];
1021   } else if (flag == MAT_GLOBAL_MAX) {
1022     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);
1023     info->nz_used      = irecv[0];
1024     info->nz_allocated = irecv[1];
1025     info->nz_unneeded  = irecv[2];
1026     info->memory       = irecv[3];
1027     info->mallocs      = irecv[4];
1028   } else if (flag == MAT_GLOBAL_SUM) {
1029     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);
1030     info->nz_used      = irecv[0];
1031     info->nz_allocated = irecv[1];
1032     info->nz_unneeded  = irecv[2];
1033     info->memory       = irecv[3];
1034     info->mallocs      = irecv[4];
1035   }
1036   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1037   info->fill_ratio_needed = 0;
1038   info->factor_mallocs    = 0;
1039   return 0;
1040 }
1041 
1042 #undef __FUNC__
1043 #define __FUNC__ "MatSetOption_MPIBAIJ"
1044 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1045 {
1046   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1047 
1048   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1049       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1050       op == MAT_COLUMNS_UNSORTED ||
1051       op == MAT_COLUMNS_SORTED ||
1052       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1053         MatSetOption(a->A,op);
1054         MatSetOption(a->B,op);
1055   } else if (op == MAT_ROW_ORIENTED) {
1056         a->roworiented = 1;
1057         MatSetOption(a->A,op);
1058         MatSetOption(a->B,op);
1059   } else if (op == MAT_ROWS_SORTED ||
1060              op == MAT_ROWS_UNSORTED ||
1061              op == MAT_SYMMETRIC ||
1062              op == MAT_STRUCTURALLY_SYMMETRIC ||
1063              op == MAT_YES_NEW_DIAGONALS)
1064     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1065   else if (op == MAT_COLUMN_ORIENTED) {
1066     a->roworiented = 0;
1067     MatSetOption(a->A,op);
1068     MatSetOption(a->B,op);
1069   } else if (op == MAT_IGNORE_OFF_PROCESSOR_ENTRIES) {
1070     a->donotstash = 1;
1071   } else if (op == MAT_NO_NEW_DIAGONALS)
1072     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
1073   else
1074     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
1075   return 0;
1076 }
1077 
1078 #undef __FUNC__
1079 #define __FUNC__ "MatTranspose_MPIBAIJ"
1080 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1081 {
1082   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1083   Mat_SeqBAIJ *Aloc;
1084   Mat        B;
1085   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1086   int        bs=baij->bs,mbs=baij->mbs;
1087   Scalar     *a;
1088 
1089   if (matout == PETSC_NULL && M != N)
1090     SETERRQ(1,0,"Square matrix only for in-place");
1091   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1092   CHKERRQ(ierr);
1093 
1094   /* copy over the A part */
1095   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1096   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1097   row = baij->rstart;
1098   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1099 
1100   for ( i=0; i<mbs; i++ ) {
1101     rvals[0] = bs*(baij->rstart + i);
1102     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1103     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1104       col = (baij->cstart+aj[j])*bs;
1105       for (k=0; k<bs; k++ ) {
1106         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1107         col++; a += bs;
1108       }
1109     }
1110   }
1111   /* copy over the B part */
1112   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1113   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1114   row = baij->rstart*bs;
1115   for ( i=0; i<mbs; i++ ) {
1116     rvals[0] = bs*(baij->rstart + i);
1117     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1118     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1119       col = baij->garray[aj[j]]*bs;
1120       for (k=0; k<bs; k++ ) {
1121         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1122         col++; a += bs;
1123       }
1124     }
1125   }
1126   PetscFree(rvals);
1127   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1128   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1129 
1130   if (matout != PETSC_NULL) {
1131     *matout = B;
1132   } else {
1133     /* This isn't really an in-place transpose, but we free the data structures from baij */
1134     PetscFree(baij->rowners);
1135     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1136     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1137     if (baij->colmap) PetscFree(baij->colmap);
1138     if (baij->garray) PetscFree(baij->garray);
1139     if (baij->lvec) VecDestroy(baij->lvec);
1140     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1141     PetscFree(baij);
1142     PetscMemcpy(A,B,sizeof(struct _Mat));
1143     PetscHeaderDestroy(B);
1144   }
1145   return 0;
1146 }
1147 
1148 #undef __FUNC__
1149 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1150 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1151 {
1152   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1153   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1154   int ierr,s1,s2,s3;
1155 
1156   if (ll)  {
1157     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1158     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1159     if (s1!=s2) SETERRQ(1,0,"non-conforming local sizes");
1160     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1161     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1162   }
1163   if (rr) SETERRQ(1,0,"not supported for right vector");
1164   return 0;
1165 }
1166 
1167 /* the code does not do the diagonal entries correctly unless the
1168    matrix is square and the column and row ownerships are identical.
1169    This is a BUG. The only way to fix it seems to be to access
1170    baij->A and baij->B directly and not through the MatZeroRows()
1171    routine.
1172 */
1173 #undef __FUNC__
1174 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1175 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1176 {
1177   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1178   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1179   int            *procs,*nprocs,j,found,idx,nsends,*work;
1180   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1181   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1182   int            *lens,imdex,*lrows,*values,bs=l->bs;
1183   MPI_Comm       comm = A->comm;
1184   MPI_Request    *send_waits,*recv_waits;
1185   MPI_Status     recv_status,*send_status;
1186   IS             istmp;
1187 
1188   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1189   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1190 
1191   /*  first count number of contributors to each processor */
1192   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1193   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1194   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1195   for ( i=0; i<N; i++ ) {
1196     idx = rows[i];
1197     found = 0;
1198     for ( j=0; j<size; j++ ) {
1199       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1200         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1201       }
1202     }
1203     if (!found) SETERRQ(1,0,"Index out of range");
1204   }
1205   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1206 
1207   /* inform other processors of number of messages and max length*/
1208   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1209   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
1210   nrecvs = work[rank];
1211   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
1212   nmax = work[rank];
1213   PetscFree(work);
1214 
1215   /* post receives:   */
1216   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
1217   CHKPTRQ(rvalues);
1218   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
1219   CHKPTRQ(recv_waits);
1220   for ( i=0; i<nrecvs; i++ ) {
1221     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1222   }
1223 
1224   /* do sends:
1225       1) starts[i] gives the starting index in svalues for stuff going to
1226          the ith processor
1227   */
1228   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1229   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
1230   CHKPTRQ(send_waits);
1231   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1232   starts[0] = 0;
1233   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1234   for ( i=0; i<N; i++ ) {
1235     svalues[starts[owner[i]]++] = rows[i];
1236   }
1237   ISRestoreIndices(is,&rows);
1238 
1239   starts[0] = 0;
1240   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1241   count = 0;
1242   for ( i=0; i<size; i++ ) {
1243     if (procs[i]) {
1244       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1245     }
1246   }
1247   PetscFree(starts);
1248 
1249   base = owners[rank]*bs;
1250 
1251   /*  wait on receives */
1252   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1253   source = lens + nrecvs;
1254   count  = nrecvs; slen = 0;
1255   while (count) {
1256     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1257     /* unpack receives into our local space */
1258     MPI_Get_count(&recv_status,MPI_INT,&n);
1259     source[imdex]  = recv_status.MPI_SOURCE;
1260     lens[imdex]  = n;
1261     slen += n;
1262     count--;
1263   }
1264   PetscFree(recv_waits);
1265 
1266   /* move the data into the send scatter */
1267   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1268   count = 0;
1269   for ( i=0; i<nrecvs; i++ ) {
1270     values = rvalues + i*nmax;
1271     for ( j=0; j<lens[i]; j++ ) {
1272       lrows[count++] = values[j] - base;
1273     }
1274   }
1275   PetscFree(rvalues); PetscFree(lens);
1276   PetscFree(owner); PetscFree(nprocs);
1277 
1278   /* actually zap the local rows */
1279   ierr = ISCreateGeneral(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1280   PLogObjectParent(A,istmp);
1281   PetscFree(lrows);
1282   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
1283   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1284   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1285 
1286   /* wait on sends */
1287   if (nsends) {
1288     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
1289     CHKPTRQ(send_status);
1290     MPI_Waitall(nsends,send_waits,send_status);
1291     PetscFree(send_status);
1292   }
1293   PetscFree(send_waits); PetscFree(svalues);
1294 
1295   return 0;
1296 }
1297 extern int MatPrintHelp_SeqBAIJ(Mat);
1298 #undef __FUNC__
1299 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1300 int MatPrintHelp_MPIBAIJ(Mat A)
1301 {
1302   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1303 
1304   if (!a->rank) return MatPrintHelp_SeqBAIJ(a->A);
1305   else return 0;
1306 }
1307 
1308 #undef __FUNC__
1309 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1310 int MatSetUnfactored_MPIBAIJ(Mat A)
1311 {
1312   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1313   int         ierr;
1314   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1315   return 0;
1316 }
1317 
1318 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1319 
1320 /* -------------------------------------------------------------------*/
1321 static struct _MatOps MatOps = {
1322   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1323   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1324   0,0,0,0,
1325   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1326   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1327   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1328   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1329   0,0,0,MatGetSize_MPIBAIJ,
1330   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1331   0,0,MatConvertSameType_MPIBAIJ,0,0,
1332   0,0,0,MatGetSubMatrices_MPIBAIJ,
1333   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1334   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1335   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1336 
1337 
1338 #undef __FUNC__
1339 #define __FUNC__ "MatCreateMPIBAIJ"
1340 /*@C
1341    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1342    (block compressed row).  For good matrix assembly performance
1343    the user should preallocate the matrix storage by setting the parameters
1344    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1345    performance can be increased by more than a factor of 50.
1346 
1347    Input Parameters:
1348 .  comm - MPI communicator
1349 .  bs   - size of block
1350 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1351            This value should be the same as the local size used in creating the
1352            y vector for the matrix-vector product y = Ax.
1353 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1354            This value should be the same as the local size used in creating the
1355            x vector for the matrix-vector product y = Ax.
1356 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1357 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1358 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1359            submatrix  (same for all local rows)
1360 .  d_nnz - array containing the number of block nonzeros in the various block rows
1361            of the diagonal portion of the local submatrix (possibly different for each
1362            block row) or PETSC_NULL.  You must leave room for the diagonal entry even
1363            if it is zero.
1364 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1365            submatrix (same for all local rows).
1366 .  o_nnz - array containing the number of block nonzeros in the various block rows of the
1367            off-diagonal portion of the local submatrix (possibly different for
1368            each block row) or PETSC_NULL.
1369 
1370    Output Parameter:
1371 .  A - the matrix
1372 
1373    Notes:
1374    The user MUST specify either the local or global matrix dimensions
1375    (possibly both).
1376 
1377    Storage Information:
1378    For a square global matrix we define each processor's diagonal portion
1379    to be its local rows and the corresponding columns (a square submatrix);
1380    each processor's off-diagonal portion encompasses the remainder of the
1381    local matrix (a rectangular submatrix).
1382 
1383    The user can specify preallocated storage for the diagonal part of
1384    the local submatrix with either d_nz or d_nnz (not both).  Set
1385    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1386    memory allocation.  Likewise, specify preallocated storage for the
1387    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1388 
1389    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1390    the figure below we depict these three local rows and all columns (0-11).
1391 
1392 $          0 1 2 3 4 5 6 7 8 9 10 11
1393 $         -------------------
1394 $  row 3  |  o o o d d d o o o o o o
1395 $  row 4  |  o o o d d d o o o o o o
1396 $  row 5  |  o o o d d d o o o o o o
1397 $         -------------------
1398 $
1399 
1400    Thus, any entries in the d locations are stored in the d (diagonal)
1401    submatrix, and any entries in the o locations are stored in the
1402    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1403    stored simply in the MATSEQBAIJ format for compressed row storage.
1404 
1405    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1406    and o_nz should indicate the number of nonzeros per row in the o matrix.
1407    In general, for PDE problems in which most nonzeros are near the diagonal,
1408    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1409    or you will get TERRIBLE performance; see the users' manual chapter on
1410    matrices.
1411 
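   Example usage (a minimal sketch, with error checking abbreviated): to
   create a matrix with 2x2 blocks and 100 global rows and columns,
   distributed across all processors with PETSc choosing the local sizes,
   one might call
$      Mat A;
$      ierr = MatCreateMPIBAIJ(MPI_COMM_WORLD,2,PETSC_DECIDE,PETSC_DECIDE,
$                              100,100,5,PETSC_NULL,2,PETSC_NULL,&A);
   followed by MatSetValues() or MatSetValuesBlocked() and the usual
   MatAssemblyBegin()/MatAssemblyEnd() pair.
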
1412 .keywords: matrix, block, aij, compressed row, sparse, parallel
1413 
1414 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1415 @*/
1416 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1417                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1418 {
1419   Mat          B;
1420   Mat_MPIBAIJ  *b;
1421   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1422 
1423   if (bs < 1) SETERRQ(1,0,"invalid block size specified");
1424   *A = 0;
1425   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
1426   PLogObjectCreate(B);
1427   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1428   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1429   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1430   MPI_Comm_size(comm,&size);
1431   if (size == 1) {
1432     B->ops.getrowij          = MatGetRowIJ_MPIBAIJ;
1433     B->ops.restorerowij      = MatRestoreRowIJ_MPIBAIJ;
1434     B->ops.lufactorsymbolic  = MatLUFactorSymbolic_MPIBAIJ;
1435     B->ops.lufactornumeric   = MatLUFactorNumeric_MPIBAIJ;
1436     B->ops.lufactor          = MatLUFactor_MPIBAIJ;
1437     B->ops.solve             = MatSolve_MPIBAIJ;
1438     B->ops.solveadd          = MatSolveAdd_MPIBAIJ;
1439     B->ops.solvetrans        = MatSolveTrans_MPIBAIJ;
1440     B->ops.solvetransadd     = MatSolveTransAdd_MPIBAIJ;
1441     B->ops.ilufactorsymbolic = MatILUFactorSymbolic_MPIBAIJ;
1442   }
1443 
1444   B->destroy    = MatDestroy_MPIBAIJ;
1445   B->view       = MatView_MPIBAIJ;
1446   B->mapping    = 0;
1447   B->factor     = 0;
1448   B->assembled  = PETSC_FALSE;
1449 
1450   B->insertmode = NOT_SET_VALUES;
1451   MPI_Comm_rank(comm,&b->rank);
1452   MPI_Comm_size(comm,&b->size);
1453 
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
    SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) SETERRQ(1,0,"either M or m should be specified");
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) SETERRQ(1,0,"either N or n should be specified");
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1460 
1461   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1462     work[0] = m; work[1] = n;
1463     mbs = m/bs; nbs = n/bs;
1464     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1465     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1466     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1467   }
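  /*
     Split the Mbs block rows as evenly as possible: each processor
     gets Mbs/size block rows and the first (Mbs % size) processors
     get one extra.  For example (illustrative only), Mbs=10 on 4
     processors yields local block-row counts 3,3,2,2.
  */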
1468   if (m == PETSC_DECIDE) {
1469     Mbs = M/bs;
1470     if (Mbs*bs != M) SETERRQ(1,0,"No of global rows must be divisible by blocksize");
1471     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1472     m   = mbs*bs;
1473   }
1474   if (n == PETSC_DECIDE) {
1475     Nbs = N/bs;
1476     if (Nbs*bs != N) SETERRQ(1,0,"No of global cols must be divisible by blocksize");
1477     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1478     n   = nbs*bs;
1479   }
1480   if (mbs*bs != m || nbs*bs != n) SETERRQ(1,0,"No of local rows, cols must be divisible by blocksize");
1481 
1482   b->m = m; B->m = m;
1483   b->n = n; B->n = n;
1484   b->N = N; B->N = N;
1485   b->M = M; B->M = M;
1486   b->bs  = bs;
1487   b->bs2 = bs*bs;
1488   b->mbs = mbs;
1489   b->nbs = nbs;
1490   b->Mbs = Mbs;
1491   b->Nbs = Nbs;
1492 
1493   /* build local table of row and column ownerships */
1494   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1495   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1496   b->cowners = b->rowners + b->size + 2;
1497   MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1498   b->rowners[0] = 0;
1499   for ( i=2; i<=b->size; i++ ) {
1500     b->rowners[i] += b->rowners[i-1];
1501   }
1502   b->rstart    = b->rowners[b->rank];
1503   b->rend      = b->rowners[b->rank+1];
1504   b->rstart_bs = b->rstart * bs;
1505   b->rend_bs   = b->rend * bs;
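
  /*
     rowners[] now holds the block-row ownership ranges: processor i
     owns block rows rowners[i] through rowners[i+1]-1; rstart_bs and
     rend_bs express the same range in point (unblocked) rows.
  */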
1506 
1507   MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1508   b->cowners[0] = 0;
1509   for ( i=2; i<=b->size; i++ ) {
1510     b->cowners[i] += b->cowners[i-1];
1511   }
1512   b->cstart    = b->cowners[b->rank];
1513   b->cend      = b->cowners[b->rank+1];
1514   b->cstart_bs = b->cstart * bs;
1515   b->cend_bs   = b->cend * bs;
1516 
1517   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1518   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1519   PLogObjectParent(B,b->A);
1520   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1521   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1522   PLogObjectParent(B,b->B);
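
  /*
     b->A holds the diagonal (m x n) portion; b->B holds the
     off-diagonal portion and is created with the full global column
     width N -- MatSetUpMultiply_MPIBAIJ() later compresses its column
     space to just the nonzero block columns (recorded in garray).
  */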
1523 
1524   /* build cache for off array entries formed */
1525   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1526   b->donotstash  = 0;
1527   b->colmap      = 0;
1528   b->garray      = 0;
1529   b->roworiented = 1;
1530 
1531   /* stuff used for matrix vector multiply */
1532   b->lvec      = 0;
1533   b->Mvctx     = 0;
1534 
1535   /* stuff for MatGetRow() */
1536   b->rowindices   = 0;
1537   b->rowvalues    = 0;
1538   b->getrowactive = PETSC_FALSE;
1539 
1540   *A = B;
1541   return 0;
1542 }
1543 
1544 #undef __FUNC__
1545 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1546 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1547 {
1548   Mat         mat;
1549   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1550   int         ierr, len=0, flg;
1551 
1552   *newmat       = 0;
1553   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm);
1554   PLogObjectCreate(mat);
  mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
  PetscMemzero(a,sizeof(Mat_MPIBAIJ));
1556   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1557   mat->destroy    = MatDestroy_MPIBAIJ;
1558   mat->view       = MatView_MPIBAIJ;
1559   mat->factor     = matin->factor;
1560   mat->assembled  = PETSC_TRUE;
1561 
1562   a->m = mat->m   = oldmat->m;
1563   a->n = mat->n   = oldmat->n;
1564   a->M = mat->M   = oldmat->M;
1565   a->N = mat->N   = oldmat->N;
1566 
1567   a->bs  = oldmat->bs;
1568   a->bs2 = oldmat->bs2;
1569   a->mbs = oldmat->mbs;
1570   a->nbs = oldmat->nbs;
1571   a->Mbs = oldmat->Mbs;
1572   a->Nbs = oldmat->Nbs;
1573 
1574   a->rstart       = oldmat->rstart;
1575   a->rend         = oldmat->rend;
1576   a->cstart       = oldmat->cstart;
1577   a->cend         = oldmat->cend;
1578   a->size         = oldmat->size;
1579   a->rank         = oldmat->rank;
1580   mat->insertmode = NOT_SET_VALUES;
1581   a->rowvalues    = 0;
1582   a->getrowactive = PETSC_FALSE;
1583 
1584   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1585   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1586   a->cowners = a->rowners + a->size + 2;
1587   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1588   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
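  /* colmap, when present, maps global block columns to locations in
     the off-diagonal part (see CreateColmap_MPIBAIJ_Private()) */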
1589   if (oldmat->colmap) {
1590     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1591     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1592     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1593   } else a->colmap = 0;
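  /* garray[] maps the (compressed) local block columns of the
     off-diagonal part B back to global block column numbers; len is
     the number of block columns actually used by B */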
1594   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1595     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1596     PLogObjectMemory(mat,len*sizeof(int));
1597     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1598   } else a->garray = 0;
1599 
1600   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1601   PLogObjectParent(mat,a->lvec);
1602   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1603   PLogObjectParent(mat,a->Mvctx);
1604   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1605   PLogObjectParent(mat,a->A);
1606   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1607   PLogObjectParent(mat,a->B);
1608   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1609   if (flg) {
1610     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1611   }
1612   *newmat = mat;
1613   return 0;
1614 }
1615 
1616 #include "sys.h"
1617 
1618 #undef __FUNC__
1619 #define __FUNC__ "MatLoad_MPIBAIJ"
1620 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1621 {
1622   Mat          A;
1623   int          i, nz, ierr, j,rstart, rend, fd;
1624   Scalar       *vals,*buf;
1625   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1626   MPI_Status   status;
1627   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1628   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1629   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1630   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1631   int          dcount,kmax,k,nzcount,tmp;
1632 
1633 
1634   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1635   bs2  = bs*bs;
1636 
1637   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1638   if (!rank) {
1639     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1640     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1641     if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not matrix object");
1642   }
1643 
1644   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1645   M = header[1]; N = header[2];
1646 
1647   if (M != N) SETERRQ(1,0,"Can only do square matrices");
1648 
1649   /*
1650      This code adds extra rows to make sure the number of rows is
1651      divisible by the blocksize
1652   */
1653   Mbs        = M/bs;
1654   extra_rows = bs - M + bs*(Mbs);
1655   if (extra_rows == bs) extra_rows = 0;
1656   else                  Mbs++;
  if (extra_rows && !rank) {
1658     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
1659   }
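
  /*
     Example (illustrative): M=10, bs=3 gives Mbs=3 and
     extra_rows = 3 - 10 + 9 = 2, so Mbs becomes 4 and two padded
     rows (given unit diagonal entries below) bring the total to 12.
  */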
1660 
1661   /* determine ownership of all rows */
1662   mbs = Mbs/size + ((Mbs % size) > rank);
1663   m   = mbs * bs;
1664   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
1665   browners = rowners + size + 1;
1666   MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1667   rowners[0] = 0;
1668   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
1669   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
1670   rstart = rowners[rank];
1671   rend   = rowners[rank+1];
1672 
1673   /* distribute row lengths to all processors */
1674   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
1675   if (!rank) {
1676     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
1677     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1678     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1679     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1680     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
1681     MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
1682     PetscFree(sndcounts);
1683   }
1684   else {
1685     MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);
1686   }
1687 
1688   if (!rank) {
1689     /* calculate the number of nonzeros on each processor */
1690     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1691     PetscMemzero(procsnz,size*sizeof(int));
1692     for ( i=0; i<size; i++ ) {
1693       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
1694         procsnz[i] += rowlengths[j];
1695       }
1696     }
1697     PetscFree(rowlengths);
1698 
1699     /* determine max buffer needed and allocate it */
1700     maxnz = 0;
1701     for ( i=0; i<size; i++ ) {
1702       maxnz = PetscMax(maxnz,procsnz[i]);
1703     }
1704     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1705 
1706     /* read in my part of the matrix column indices  */
1707     nz = procsnz[0];
1708     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1709     mycols = ibuf;
1710     if (size == 1)  nz -= extra_rows;
1711     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1712     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
1713 
    /* read in the parts for all other procs (except the last) and ship off */
1715     for ( i=1; i<size-1; i++ ) {
1716       nz = procsnz[i];
1717       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1718       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1719     }
1720     /* read in the stuff for the last proc */
1721     if ( size != 1 ) {
1722       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
1723       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1724       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
1725       MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);
1726     }
1727     PetscFree(cols);
1728   }
1729   else {
1730     /* determine buffer space needed for message */
1731     nz = 0;
1732     for ( i=0; i<m; i++ ) {
1733       nz += locrowlens[i];
1734     }
1735     ibuf = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1736     mycols = ibuf;
    /* receive message of column indices */
1738     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1739     MPI_Get_count(&status,MPI_INT,&maxnz);
1740     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1741   }
1742 
1743   /* loop over local rows, determining number of off diagonal entries */
1744   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
1745   odlens = dlens + (rend-rstart);
1746   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
1747   PetscMemzero(mask,3*Mbs*sizeof(int));
1748   masked1 = mask    + Mbs;
1749   masked2 = masked1 + Mbs;
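  /*
     mask[] marks which block columns have been seen in the current
     block row; masked1[] and masked2[] record the marked diagonal and
     off-diagonal block columns so that only those mask[] entries need
     to be cleared afterwards, rather than all Mbs of them.
  */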
1750   rowcount = 0; nzcount = 0;
1751   for ( i=0; i<mbs; i++ ) {
1752     dcount  = 0;
1753     odcount = 0;
1754     for ( j=0; j<bs; j++ ) {
1755       kmax = locrowlens[rowcount];
1756       for ( k=0; k<kmax; k++ ) {
1757         tmp = mycols[nzcount++]/bs;
1758         if (!mask[tmp]) {
1759           mask[tmp] = 1;
1760           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
1761           else masked1[dcount++] = tmp;
1762         }
1763       }
1764       rowcount++;
1765     }
1766 
1767     dlens[i]  = dcount;
1768     odlens[i] = odcount;
1769 
1770     /* zero out the mask elements we set */
1771     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
1772     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
1773   }
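
  /* dlens[]/odlens[] now hold exact per-block-row counts; passing
     them as d_nnz/o_nnz below means no mallocs occur during
     MatSetValues() */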
1774 
1775   /* create our matrix */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
  CHKERRQ(ierr);
1778   A = *newmat;
1779   MatSetOption(A,MAT_COLUMNS_SORTED);
1780 
1781   if (!rank) {
1782     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
1783     /* read in my part of the matrix numerical values  */
1784     nz = procsnz[0];
1785     vals = buf;
1786     mycols = ibuf;
1787     if (size == 1)  nz -= extra_rows;
1788     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1789     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
1790 
1791     /* insert into matrix */
1792     jj      = rstart*bs;
1793     for ( i=0; i<m; i++ ) {
1794       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1795       mycols += locrowlens[i];
1796       vals   += locrowlens[i];
1797       jj++;
1798     }
1799     /* read in other processors (except the last one) and ship out */
1800     for ( i=1; i<size-1; i++ ) {
1801       nz = procsnz[i];
1802       vals = buf;
1803       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1804       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1805     }
1806     /* the last proc */
    if ( size != 1 ) {
      nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
1809       vals = buf;
1810       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1811       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
1812       MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
1813     }
1814     PetscFree(procsnz);
1815   }
1816   else {
1817     /* receive numeric values */
1818     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
1819 
    /* receive message of values */
1821     vals = buf;
1822     mycols = ibuf;
1823     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1824     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1825     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1826 
1827     /* insert into matrix */
1828     jj      = rstart*bs;
1829     for ( i=0; i<m; i++ ) {
1830       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1831       mycols += locrowlens[i];
1832       vals   += locrowlens[i];
1833       jj++;
1834     }
1835   }
1836   PetscFree(locrowlens);
1837   PetscFree(buf);
1838   PetscFree(ibuf);
1839   PetscFree(rowners);
1840   PetscFree(dlens);
1841   PetscFree(mask);
1842   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1843   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1844   return 0;
1845 }
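
/*
   A minimal usage sketch (assuming the PETSc 2.0 binary viewer
   interface; ViewerFileOpenBinary() and BINARY_RDONLY are the
   expected names, and "matrix.dat" is a placeholder).  Users normally
   call MatLoad(), which dispatches here for type MATMPIBAIJ:

      Viewer viewer;
      Mat    A;
      int    ierr;

      ierr = ViewerFileOpenBinary(MPI_COMM_WORLD,"matrix.dat",
                                  BINARY_RDONLY,&viewer); CHKERRQ(ierr);
      ierr = MatLoad(viewer,MATMPIBAIJ,&A); CHKERRQ(ierr);
      ierr = ViewerDestroy(viewer); CHKERRQ(ierr);

   The block size used for the loaded matrix can be selected with the
   run-time option -matload_block_size <bs>, as read at the top of
   MatLoad_MPIBAIJ() above.
*/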
1846 
1847 
1848