xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 30793edc096a79a776e222bc7117c70553e9bed7)
1 #ifndef lint
2 static char vcid[] = "$Id: mpibaij.c,v 1.59 1997/03/26 01:36:18 bsmith Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 extern int MatLUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,Mat *);
15 extern int MatLUFactorNumeric_MPIBAIJ(Mat,Mat *);
16 extern int MatLUFactor_MPIBAIJ(Mat,IS,IS,double);
17 extern int MatSolve_MPIBAIJ(Mat,Vec,Vec);
18 extern int MatSolveAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
19 extern int MatSolveTrans_MPIBAIJ(Mat,Vec,Vec);
20 extern int MatSolveTransAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
21 extern int MatILUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,int,Mat *);
22 
23 
24 /*
25      Local utility routine that creates a mapping from the global column
26    number to the local number in the off-diagonal part of the local
27    storage of the matrix.  This is done in a non scable way since the
28    length of colmap equals the global matrix length.
29 */
30 #undef __FUNC__
31 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
32 static int CreateColmap_MPIBAIJ_Private(Mat mat)
33 {
34   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
35   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
36   int         nbs = B->nbs,i,bs=B->bs;;
37 
38   baij->colmap = (int *) PetscMalloc(baij->Nbs*sizeof(int));CHKPTRQ(baij->colmap);
39   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
40   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
41   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
42   return 0;
43 }
44 
45 #undef __FUNC__
46 #define __FUNC__ "MatGetRowIJ_MPIBAIJ("
47 static int MatGetRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
48                             PetscTruth *done)
49 {
50   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
51   int         ierr;
52   if (aij->size == 1) {
53     ierr = MatGetRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
54   } else SETERRQ(1,0,"not supported in parallel");
55   return 0;
56 }
57 
58 #undef __FUNC__
59 #define __FUNC__ "MatRestoreRowIJ_MPIBAIJ"
60 static int MatRestoreRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
61                                 PetscTruth *done)
62 {
63   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
64   int        ierr;
65   if (aij->size == 1) {
66     ierr = MatRestoreRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
67   } else SETERRQ(1,0,"not supported in parallel");
68   return 0;
69 }
70 #define CHUNKSIZE  10
71 
72 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
73 { \
74  \
75     brow = row/bs;  \
76     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
77     rmax = imax[brow]; nrow = ailen[brow]; \
78       bcol = col/bs; \
79       ridx = row % bs; cidx = col % bs; \
80       low = 0; high = nrow; \
81       while (high-low > 3) { \
82         t = (low+high)/2; \
83         if (rp[t] > bcol) high = t; \
84         else              low  = t; \
85       } \
86       for ( _i=low; _i<high; _i++ ) { \
87         if (rp[_i] > bcol) break; \
88         if (rp[_i] == bcol) { \
89           bap  = ap +  bs2*_i + bs*cidx + ridx; \
90           if (addv == ADD_VALUES) *bap += value;  \
91           else                    *bap  = value;  \
92           goto _noinsert; \
93         } \
94       } \
95       if (a->nonew) goto _noinsert; \
96       if (nrow >= rmax) { \
97         /* there is no extra room in row, therefore enlarge */ \
98         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
99         Scalar *new_a; \
100  \
101         /* malloc new storage space */ \
102         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
103         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
104         new_j   = (int *) (new_a + bs2*new_nz); \
105         new_i   = new_j + new_nz; \
106  \
107         /* copy over old data into new slots */ \
108         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
109         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
110         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
111         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
112         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
113                                                            len*sizeof(int)); \
114         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
115         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
116         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
117                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
118         /* free up old matrix storage */ \
119         PetscFree(a->a);  \
120         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
121         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
122         a->singlemalloc = 1; \
123  \
124         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
125         rmax = imax[brow] = imax[brow] + CHUNKSIZE; \
126         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
127         a->maxnz += bs2*CHUNKSIZE; \
128         a->reallocs++; \
129         a->nz++; \
130       } \
131       N = nrow++ - 1;  \
132       /* shift up all the later entries in this row */ \
133       for ( ii=N; ii>=_i; ii-- ) { \
134         rp[ii+1] = rp[ii]; \
135         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
136       } \
137       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
138       rp[_i]                      = bcol;  \
139       ap[bs2*_i + bs*cidx + ridx] = value;  \
140       _noinsert:; \
141     ailen[brow] = nrow; \
142 }
143 
144 extern int MatSetValues_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
145 #undef __FUNC__
146 #define __FUNC__ "MatSetValues_MPIBAIJ"
147 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
148 {
149   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
150   Scalar      value;
151   int         ierr,i,j,row,col;
152   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
153   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
154   int         cend_orig=baij->cend_bs,bs=baij->bs;
155 
156   /* Some Variables required in the macro */
157   Mat         A = baij->A;
158   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
159   int         *rp,ii,nrow,_i,rmax,N;
160   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen;
161   int         *aj=a->j,brow,bcol;
162   int         low,high,t,ridx,cidx,bs2=a->bs2;
163   Scalar      *ap,*aa=a->a,*bap;
164 
165   for ( i=0; i<m; i++ ) {
166 #if defined(PETSC_BOPT_g)
167     if (im[i] < 0) SETERRQ(1,0,"Negative row");
168     if (im[i] >= baij->M) SETERRQ(1,0,"Row too large");
169 #endif
170     if (im[i] >= rstart_orig && im[i] < rend_orig) {
171       row = im[i] - rstart_orig;
172       for ( j=0; j<n; j++ ) {
173         if (in[j] >= cstart_orig && in[j] < cend_orig){
174           col = in[j] - cstart_orig;
175           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
176           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
177           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
178         }
179 #if defined(PETSC_BOPT_g)
180         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
181         else if (in[j] >= baij->N) {SETERRQ(1,0,"Col too large");}
182 #endif
183         else {
184           if (mat->was_assembled) {
185             if (!baij->colmap) {
186               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
187             }
188             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
189             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
190               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
191               col =  in[j];
192             }
193           }
194           else col = in[j];
195           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
196           ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
197         }
198       }
199     }
200     else {
201       if (roworiented && !baij->donotstash) {
202         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
203       }
204       else {
205         if (!baij->donotstash) {
206           row = im[i];
207 	  for ( j=0; j<n; j++ ) {
208 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
209           }
210         }
211       }
212     }
213   }
214   return 0;
215 }
216 
217 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
218 #undef __FUNC__
219 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
220 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
221 {
222   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
223   Scalar      *value,*barray=baij->barray;
224   int         ierr,i,j,ii,jj,row,col,k,l;
225   int         roworiented = baij->roworiented,rstart=baij->rstart ;
226   int         rend=baij->rend,cstart=baij->cstart,stepval;
227   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
228 
229 
230   if(!barray) {
231     barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
232     baij->barray = barray;
233   }
234 
235   if (roworiented) {
236     stepval = (n-1)*bs;
237   } else {
238     stepval = (m-1)*bs;
239   }
240   for ( i=0; i<m; i++ ) {
241 #if defined(PETSC_BOPT_g)
242     if (im[i] < 0) SETERRQ(1,0,"Negative row");
243     if (im[i] >= baij->Mbs) SETERRQ(1,0,"Row too large");
244 #endif
245     if (im[i] >= rstart && im[i] < rend) {
246       row = im[i] - rstart;
247       for ( j=0; j<n; j++ ) {
248         if (roworiented) {
249           value = v + i*(stepval+bs)*bs + j*bs;
250         } else {
251           value = v + j*(stepval+bs)*bs + i*bs;
252         }
253         for ( ii=0; ii<bs; ii++,value+=stepval )
254           for (jj=0; jj<bs; jj++ )
255             *barray++  = *value++;
256         barray -=bs2;
257 
258         if (in[j] >= cstart && in[j] < cend){
259           col = in[j] - cstart;
260           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
261         }
262 #if defined(PETSC_BOPT_g)
263         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
264         else if (in[j] >= baij->Nbs) {SETERRQ(1,0,"Col too large");}
265 #endif
266         else {
267           if (mat->was_assembled) {
268             if (!baij->colmap) {
269               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
270             }
271             col = baij->colmap[in[j]] - 1;
272             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
273               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
274               col =  in[j];
275             }
276           }
277           else col = in[j];
278           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
279         }
280       }
281     }
282     else {
283       if (!baij->donotstash) {
284         if (roworiented ) {
285           row   = im[i]*bs;
286           value = v + i*(stepval+bs)*bs;
287           for ( j=0; j<bs; j++,row++ ) {
288             for ( k=0; k<n; k++ ) {
289               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
290                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
291               }
292             }
293           }
294         }
295         else {
296           for ( j=0; j<n; j++ ) {
297             value = v + j*(stepval+bs)*bs + i*bs;
298             col   = in[j]*bs;
299             for ( k=0; k<bs; k++,col++,value+=stepval) {
300               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
301                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
302               }
303             }
304           }
305 
306         }
307       }
308     }
309   }
310   return 0;
311 }
312 
313 #undef __FUNC__
314 #define __FUNC__ "MatGetValues_MPIBAIJ"
315 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
316 {
317   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
318   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
319   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
320 
321   for ( i=0; i<m; i++ ) {
322     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
323     if (idxm[i] >= baij->M) SETERRQ(1,0,"Row too large");
324     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
325       row = idxm[i] - bsrstart;
326       for ( j=0; j<n; j++ ) {
327         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
328         if (idxn[j] >= baij->N) SETERRQ(1,0,"Col too large");
329         if (idxn[j] >= bscstart && idxn[j] < bscend){
330           col = idxn[j] - bscstart;
331           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
332         }
333         else {
334           if (!baij->colmap) {
335             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
336           }
337           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
338              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
339           else {
340             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
341             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
342           }
343         }
344       }
345     }
346     else {
347       SETERRQ(1,0,"Only local values currently supported");
348     }
349   }
350   return 0;
351 }
352 
353 #undef __FUNC__
354 #define __FUNC__ "MatNorm_MPIBAIJ"
355 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
356 {
357   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
358   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
359   int        ierr, i,bs2=baij->bs2;
360   double     sum = 0.0;
361   Scalar     *v;
362 
363   if (baij->size == 1) {
364     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
365   } else {
366     if (type == NORM_FROBENIUS) {
367       v = amat->a;
368       for (i=0; i<amat->nz*bs2; i++ ) {
369 #if defined(PETSC_COMPLEX)
370         sum += real(conj(*v)*(*v)); v++;
371 #else
372         sum += (*v)*(*v); v++;
373 #endif
374       }
375       v = bmat->a;
376       for (i=0; i<bmat->nz*bs2; i++ ) {
377 #if defined(PETSC_COMPLEX)
378         sum += real(conj(*v)*(*v)); v++;
379 #else
380         sum += (*v)*(*v); v++;
381 #endif
382       }
383       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
384       *norm = sqrt(*norm);
385     }
386     else
387       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
388   }
389   return 0;
390 }
391 
392 #undef __FUNC__
393 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
394 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
395 {
396   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
397   MPI_Comm    comm = mat->comm;
398   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
399   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
400   MPI_Request *send_waits,*recv_waits;
401   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
402   InsertMode  addv;
403   Scalar      *rvalues,*svalues;
404 
405   /* make sure all processors are either in INSERTMODE or ADDMODE */
406   MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
407   if (addv == (ADD_VALUES|INSERT_VALUES)) {
408     SETERRQ(1,0,"Some processors inserted others added");
409   }
410   mat->insertmode = addv; /* in case this processor had no cache */
411 
412   /*  first count number of contributors to each processor */
413   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
414   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
415   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
416   for ( i=0; i<baij->stash.n; i++ ) {
417     idx = baij->stash.idx[i];
418     for ( j=0; j<size; j++ ) {
419       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
420         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
421       }
422     }
423   }
424   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
425 
426   /* inform other processors of number of messages and max length*/
427   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
428   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
429   nreceives = work[rank];
430   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
431   nmax = work[rank];
432   PetscFree(work);
433 
434   /* post receives:
435        1) each message will consist of ordered pairs
436      (global index,value) we store the global index as a double
437      to simplify the message passing.
438        2) since we don't know how long each individual message is we
439      allocate the largest needed buffer for each receive. Potentially
440      this is a lot of wasted space.
441 
442 
443        This could be done better.
444   */
445   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
446   CHKPTRQ(rvalues);
447   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
448   CHKPTRQ(recv_waits);
449   for ( i=0; i<nreceives; i++ ) {
450     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
451               comm,recv_waits+i);
452   }
453 
454   /* do sends:
455       1) starts[i] gives the starting index in svalues for stuff going to
456          the ith processor
457   */
458   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
459   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
460   CHKPTRQ(send_waits);
461   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
462   starts[0] = 0;
463   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
464   for ( i=0; i<baij->stash.n; i++ ) {
465     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
466     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
467     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
468   }
469   PetscFree(owner);
470   starts[0] = 0;
471   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
472   count = 0;
473   for ( i=0; i<size; i++ ) {
474     if (procs[i]) {
475       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
476                 comm,send_waits+count++);
477     }
478   }
479   PetscFree(starts); PetscFree(nprocs);
480 
481   /* Free cache space */
482   PLogInfo(mat,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
483   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
484 
485   baij->svalues    = svalues;    baij->rvalues    = rvalues;
486   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
487   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
488   baij->rmax       = nmax;
489 
490   return 0;
491 }
492 
493 
494 #undef __FUNC__
495 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
496 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
497 {
498   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
499   MPI_Status  *send_status,recv_status;
500   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
501   int         bs=baij->bs,row,col,other_disassembled;
502   Scalar      *values,val;
503   InsertMode  addv = mat->insertmode;
504 
505   /*  wait on receives */
506   while (count) {
507     MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
508     /* unpack receives into our local space */
509     values = baij->rvalues + 3*imdex*baij->rmax;
510     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
511     n = n/3;
512     for ( i=0; i<n; i++ ) {
513       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
514       col = (int) PetscReal(values[3*i+1]);
515       val = values[3*i+2];
516       if (col >= baij->cstart*bs && col < baij->cend*bs) {
517         col -= baij->cstart*bs;
518         MatSetValues(baij->A,1,&row,1,&col,&val,addv);
519       }
520       else {
521         if (mat->was_assembled) {
522           if (!baij->colmap) {
523             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
524           }
525           col = (baij->colmap[col/bs]-1)*bs + col%bs;
526           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
527             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
528             col = (int) PetscReal(values[3*i+1]);
529           }
530         }
531         MatSetValues(baij->B,1,&row,1,&col,&val,addv);
532       }
533     }
534     count--;
535   }
536   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
537 
538   /* wait on sends */
539   if (baij->nsends) {
540     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
541     CHKPTRQ(send_status);
542     MPI_Waitall(baij->nsends,baij->send_waits,send_status);
543     PetscFree(send_status);
544   }
545   PetscFree(baij->send_waits); PetscFree(baij->svalues);
546 
547   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
548   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
549 
550   /* determine if any processor has disassembled, if so we must
551      also disassemble ourselfs, in order that we may reassemble. */
552   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
553   if (mat->was_assembled && !other_disassembled) {
554     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
555   }
556 
557   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
558     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
559   }
560   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
561   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
562 
563   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
564   return 0;
565 }
566 
567 #undef __FUNC__
568 #define __FUNC__ "MatView_MPIBAIJ_Binary"
569 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
570 {
571   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
572   int          ierr;
573 
574   if (baij->size == 1) {
575     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
576   }
577   else SETERRQ(1,0,"Only uniprocessor output supported");
578   return 0;
579 }
580 
581 #undef __FUNC__
582 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
583 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
584 {
585   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
586   int          ierr, format,rank,bs = baij->bs;
587   FILE         *fd;
588   ViewerType   vtype;
589 
590   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
591   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
592     ierr = ViewerGetFormat(viewer,&format);
593     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
594       MatInfo info;
595       MPI_Comm_rank(mat->comm,&rank);
596       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
597       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
598       PetscSequentialPhaseBegin(mat->comm,1);
599       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
600               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
601               baij->bs,(int)info.memory);
602       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
603       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
604       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
605       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
606       fflush(fd);
607       PetscSequentialPhaseEnd(mat->comm,1);
608       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
609       return 0;
610     }
611     else if (format == VIEWER_FORMAT_ASCII_INFO) {
612       PetscPrintf(mat->comm,"  block size is %d\n",bs);
613       return 0;
614     }
615   }
616 
617   if (vtype == DRAW_VIEWER) {
618     Draw       draw;
619     PetscTruth isnull;
620     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
621     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
622   }
623 
624   if (vtype == ASCII_FILE_VIEWER) {
625     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
626     PetscSequentialPhaseBegin(mat->comm,1);
627     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
628            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
629             baij->cstart*bs,baij->cend*bs);
630     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
631     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
632     fflush(fd);
633     PetscSequentialPhaseEnd(mat->comm,1);
634   }
635   else {
636     int size = baij->size;
637     rank = baij->rank;
638     if (size == 1) {
639       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
640     }
641     else {
642       /* assemble the entire matrix onto first processor. */
643       Mat         A;
644       Mat_SeqBAIJ *Aloc;
645       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
646       int         mbs=baij->mbs;
647       Scalar      *a;
648 
649       if (!rank) {
650         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
651         CHKERRQ(ierr);
652       }
653       else {
654         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
655         CHKERRQ(ierr);
656       }
657       PLogObjectParent(mat,A);
658 
659       /* copy over the A part */
660       Aloc = (Mat_SeqBAIJ*) baij->A->data;
661       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
662       row = baij->rstart;
663       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
664 
665       for ( i=0; i<mbs; i++ ) {
666         rvals[0] = bs*(baij->rstart + i);
667         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
668         for ( j=ai[i]; j<ai[i+1]; j++ ) {
669           col = (baij->cstart+aj[j])*bs;
670           for (k=0; k<bs; k++ ) {
671             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
672             col++; a += bs;
673           }
674         }
675       }
676       /* copy over the B part */
677       Aloc = (Mat_SeqBAIJ*) baij->B->data;
678       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
679       row = baij->rstart*bs;
680       for ( i=0; i<mbs; i++ ) {
681         rvals[0] = bs*(baij->rstart + i);
682         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
683         for ( j=ai[i]; j<ai[i+1]; j++ ) {
684           col = baij->garray[aj[j]]*bs;
685           for (k=0; k<bs; k++ ) {
686             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
687             col++; a += bs;
688           }
689         }
690       }
691       PetscFree(rvals);
692       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
693       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
694       if (!rank) {
695         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
696       }
697       ierr = MatDestroy(A); CHKERRQ(ierr);
698     }
699   }
700   return 0;
701 }
702 
703 
704 
705 #undef __FUNC__
706 #define __FUNC__ "MatView_MPIBAIJ"
707 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
708 {
709   Mat         mat = (Mat) obj;
710   int         ierr;
711   ViewerType  vtype;
712 
713   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
714   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
715       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
716     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
717   }
718   else if (vtype == BINARY_FILE_VIEWER) {
719     return MatView_MPIBAIJ_Binary(mat,viewer);
720   }
721   return 0;
722 }
723 
724 #undef __FUNC__
725 #define __FUNC__ "MatDestroy_MPIBAIJ"
726 int MatDestroy_MPIBAIJ(PetscObject obj)
727 {
728   Mat         mat = (Mat) obj;
729   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
730   int         ierr;
731 
732 #if defined(PETSC_LOG)
733   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
734 #endif
735 
736   PetscFree(baij->rowners);
737   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
738   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
739   if (baij->colmap) PetscFree(baij->colmap);
740   if (baij->garray) PetscFree(baij->garray);
741   if (baij->lvec)   VecDestroy(baij->lvec);
742   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
743   if (baij->rowvalues) PetscFree(baij->rowvalues);
744   if (baij->barray) PetscFree(baij->barray);
745   PetscFree(baij);
746   if (mat->mapping) {
747     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
748   }
749   PLogObjectDestroy(mat);
750   PetscHeaderDestroy(mat);
751   return 0;
752 }
753 
754 #undef __FUNC__
755 #define __FUNC__ "MatMult_MPIBAIJ"
756 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
757 {
758   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
759   int         ierr, nt;
760 
761   VecGetLocalSize_Fast(xx,nt);
762   if (nt != a->n) {
763     SETERRQ(1,0,"Incompatible partition of A and xx");
764   }
765   VecGetLocalSize_Fast(yy,nt);
766   if (nt != a->m) {
767     SETERRQ(1,0,"Incompatible parition of A and yy");
768   }
769   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
770   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
771   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
772   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
773   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
774   return 0;
775 }
776 
777 #undef __FUNC__
778 #define __FUNC__ "MatMultAdd_MPIBAIJ"
779 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
780 {
781   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
782   int        ierr;
783   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
784   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
785   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
786   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
787   return 0;
788 }
789 
790 #undef __FUNC__
791 #define __FUNC__ "MatMultTrans_MPIBAIJ"
792 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
793 {
794   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
795   int        ierr;
796 
797   /* do nondiagonal part */
798   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
799   /* send it on its way */
800   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
801   /* do local part */
802   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
803   /* receive remote parts: note this assumes the values are not actually */
804   /* inserted in yy until the next line, which is true for my implementation*/
805   /* but is not perhaps always true. */
806   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
807   return 0;
808 }
809 
810 #undef __FUNC__
811 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
812 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
813 {
814   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
815   int        ierr;
816 
817   /* do nondiagonal part */
818   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
819   /* send it on its way */
820   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
821   /* do local part */
822   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
823   /* receive remote parts: note this assumes the values are not actually */
824   /* inserted in yy until the next line, which is true for my implementation*/
825   /* but is not perhaps always true. */
826   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
827   return 0;
828 }
829 
830 /*
831   This only works correctly for square matrices where the subblock A->A is the
832    diagonal block
833 */
834 #undef __FUNC__
835 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
836 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
837 {
838   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
839   if (a->M != a->N)
840     SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
841   return MatGetDiagonal(a->A,v);
842 }
843 
844 #undef __FUNC__
845 #define __FUNC__ "MatScale_MPIBAIJ"
846 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
847 {
848   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
849   int        ierr;
850   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
851   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
852   return 0;
853 }
854 
855 #undef __FUNC__
856 #define __FUNC__ "MatGetSize_MPIBAIJ"
857 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
858 {
859   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
860   *m = mat->M; *n = mat->N;
861   return 0;
862 }
863 
864 #undef __FUNC__
865 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
866 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
867 {
868   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
869   *m = mat->m; *n = mat->N;
870   return 0;
871 }
872 
873 #undef __FUNC__
874 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
875 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
876 {
877   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
878   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
879   return 0;
880 }
881 
882 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
883 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
884 
885 #undef __FUNC__
886 #define __FUNC__ "MatGetRow_MPIBAIJ"
887 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
888 {
889   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
890   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
891   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
892   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
893   int        *cmap, *idx_p,cstart = mat->cstart;
894 
895   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
896   mat->getrowactive = PETSC_TRUE;
897 
898   if (!mat->rowvalues && (idx || v)) {
899     /*
900         allocate enough space to hold information from the longest row.
901     */
902     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
903     int     max = 1,mbs = mat->mbs,tmp;
904     for ( i=0; i<mbs; i++ ) {
905       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
906       if (max < tmp) { max = tmp; }
907     }
908     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
909     CHKPTRQ(mat->rowvalues);
910     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
911   }
912 
913 
914   if (row < brstart || row >= brend) SETERRQ(1,0,"Only local rows")
915   lrow = row - brstart;
916 
917   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
918   if (!v)   {pvA = 0; pvB = 0;}
919   if (!idx) {pcA = 0; if (!v) pcB = 0;}
920   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
921   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
922   nztot = nzA + nzB;
923 
924   cmap  = mat->garray;
925   if (v  || idx) {
926     if (nztot) {
927       /* Sort by increasing column numbers, assuming A and B already sorted */
928       int imark = -1;
929       if (v) {
930         *v = v_p = mat->rowvalues;
931         for ( i=0; i<nzB; i++ ) {
932           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
933           else break;
934         }
935         imark = i;
936         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
937         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
938       }
939       if (idx) {
940         *idx = idx_p = mat->rowindices;
941         if (imark > -1) {
942           for ( i=0; i<imark; i++ ) {
943             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
944           }
945         } else {
946           for ( i=0; i<nzB; i++ ) {
947             if (cmap[cworkB[i]/bs] < cstart)
948               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
949             else break;
950           }
951           imark = i;
952         }
953         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
954         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
955       }
956     }
957     else {
958       if (idx) *idx = 0;
959       if (v)   *v   = 0;
960     }
961   }
962   *nz = nztot;
963   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
964   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
965   return 0;
966 }
967 
968 #undef __FUNC__
969 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
970 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
971 {
972   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
973   if (baij->getrowactive == PETSC_FALSE) {
974     SETERRQ(1,0,"MatGetRow not called");
975   }
976   baij->getrowactive = PETSC_FALSE;
977   return 0;
978 }
979 
980 #undef __FUNC__
981 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
982 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
983 {
984   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
985   *bs = baij->bs;
986   return 0;
987 }
988 
989 #undef __FUNC__
990 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
991 int MatZeroEntries_MPIBAIJ(Mat A)
992 {
993   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
994   int         ierr;
995   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
996   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
997   return 0;
998 }
999 
1000 #undef __FUNC__
1001 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1002 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1003 {
1004   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1005   Mat         A = a->A, B = a->B;
1006   int         ierr;
1007   double      isend[5], irecv[5];
1008 
1009   info->rows_global    = (double)a->M;
1010   info->columns_global = (double)a->N;
1011   info->rows_local     = (double)a->m;
1012   info->columns_local  = (double)a->N;
1013   info->block_size     = (double)a->bs;
1014   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1015   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1016   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1017   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1018   if (flag == MAT_LOCAL) {
1019     info->nz_used      = isend[0];
1020     info->nz_allocated = isend[1];
1021     info->nz_unneeded  = isend[2];
1022     info->memory       = isend[3];
1023     info->mallocs      = isend[4];
1024   } else if (flag == MAT_GLOBAL_MAX) {
1025     MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);
1026     info->nz_used      = irecv[0];
1027     info->nz_allocated = irecv[1];
1028     info->nz_unneeded  = irecv[2];
1029     info->memory       = irecv[3];
1030     info->mallocs      = irecv[4];
1031   } else if (flag == MAT_GLOBAL_SUM) {
1032     MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);
1033     info->nz_used      = irecv[0];
1034     info->nz_allocated = irecv[1];
1035     info->nz_unneeded  = irecv[2];
1036     info->memory       = irecv[3];
1037     info->mallocs      = irecv[4];
1038   }
1039   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1040   info->fill_ratio_needed = 0;
1041   info->factor_mallocs    = 0;
1042   return 0;
1043 }
1044 
1045 #undef __FUNC__
1046 #define __FUNC__ "MatSetOption_MPIBAIJ"
1047 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1048 {
1049   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1050 
1051   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1052       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1053       op == MAT_COLUMNS_UNSORTED ||
1054       op == MAT_COLUMNS_SORTED ||
1055       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1056         MatSetOption(a->A,op);
1057         MatSetOption(a->B,op);
1058   } else if (op == MAT_ROW_ORIENTED) {
1059         a->roworiented = 1;
1060         MatSetOption(a->A,op);
1061         MatSetOption(a->B,op);
1062   } else if (op == MAT_ROWS_SORTED ||
1063              op == MAT_ROWS_UNSORTED ||
1064              op == MAT_SYMMETRIC ||
1065              op == MAT_STRUCTURALLY_SYMMETRIC ||
1066              op == MAT_YES_NEW_DIAGONALS)
1067     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1068   else if (op == MAT_COLUMN_ORIENTED) {
1069     a->roworiented = 0;
1070     MatSetOption(a->A,op);
1071     MatSetOption(a->B,op);
1072   } else if (op == MAT_IGNORE_OFF_PROCESSOR_ENTRIES) {
1073     a->donotstash = 1;
1074   } else if (op == MAT_NO_NEW_DIAGONALS)
1075     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
1076   else
1077     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
1078   return 0;
1079 }
1080 
1081 #undef __FUNC__
1082 #define __FUNC__ "MatTranspose_MPIBAIJ("
1083 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1084 {
1085   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1086   Mat_SeqBAIJ *Aloc;
1087   Mat        B;
1088   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1089   int        bs=baij->bs,mbs=baij->mbs;
1090   Scalar     *a;
1091 
1092   if (matout == PETSC_NULL && M != N)
1093     SETERRQ(1,0,"Square matrix only for in-place");
1094   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1095   CHKERRQ(ierr);
1096 
1097   /* copy over the A part */
1098   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1099   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1100   row = baij->rstart;
1101   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1102 
1103   for ( i=0; i<mbs; i++ ) {
1104     rvals[0] = bs*(baij->rstart + i);
1105     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1106     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1107       col = (baij->cstart+aj[j])*bs;
1108       for (k=0; k<bs; k++ ) {
1109         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1110         col++; a += bs;
1111       }
1112     }
1113   }
1114   /* copy over the B part */
1115   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1116   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1117   row = baij->rstart*bs;
1118   for ( i=0; i<mbs; i++ ) {
1119     rvals[0] = bs*(baij->rstart + i);
1120     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1121     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1122       col = baij->garray[aj[j]]*bs;
1123       for (k=0; k<bs; k++ ) {
1124         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1125         col++; a += bs;
1126       }
1127     }
1128   }
1129   PetscFree(rvals);
1130   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1131   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1132 
1133   if (matout != PETSC_NULL) {
1134     *matout = B;
1135   } else {
1136     /* This isn't really an in-place transpose .... but free data structures from baij */
1137     PetscFree(baij->rowners);
1138     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1139     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1140     if (baij->colmap) PetscFree(baij->colmap);
1141     if (baij->garray) PetscFree(baij->garray);
1142     if (baij->lvec) VecDestroy(baij->lvec);
1143     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1144     PetscFree(baij);
1145     PetscMemcpy(A,B,sizeof(struct _Mat));
1146     PetscHeaderDestroy(B);
1147   }
1148   return 0;
1149 }
1150 
1151 #undef __FUNC__
1152 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1153 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1154 {
1155   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1156   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1157   int ierr,s1,s2,s3;
1158 
1159   if (ll)  {
1160     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1161     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1162     if (s1!=s2) SETERRQ(1,0,"non-conforming local sizes");
1163     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1164     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1165   }
1166   if (rr) SETERRQ(1,0,"not supported for right vector");
1167   return 0;
1168 }
1169 
1170 /* the code does not do the diagonal entries correctly unless the
1171    matrix is square and the column and row owerships are identical.
1172    This is a BUG. The only way to fix it seems to be to access
1173    baij->A and baij->B directly and not through the MatZeroRows()
1174    routine.
1175 */
1176 #undef __FUNC__
1177 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1178 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1179 {
1180   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1181   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1182   int            *procs,*nprocs,j,found,idx,nsends,*work;
1183   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1184   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1185   int            *lens,imdex,*lrows,*values,bs=l->bs;
1186   MPI_Comm       comm = A->comm;
1187   MPI_Request    *send_waits,*recv_waits;
1188   MPI_Status     recv_status,*send_status;
1189   IS             istmp;
1190 
1191   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1192   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1193 
1194   /*  first count number of contributors to each processor */
1195   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1196   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1197   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1198   for ( i=0; i<N; i++ ) {
1199     idx = rows[i];
1200     found = 0;
1201     for ( j=0; j<size; j++ ) {
1202       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1203         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1204       }
1205     }
1206     if (!found) SETERRQ(1,0,"Index out of range");
1207   }
1208   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1209 
1210   /* inform other processors of number of messages and max length*/
1211   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1212   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
1213   nrecvs = work[rank];
1214   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
1215   nmax = work[rank];
1216   PetscFree(work);
1217 
1218   /* post receives:   */
1219   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
1220   CHKPTRQ(rvalues);
1221   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
1222   CHKPTRQ(recv_waits);
1223   for ( i=0; i<nrecvs; i++ ) {
1224     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1225   }
1226 
1227   /* do sends:
1228       1) starts[i] gives the starting index in svalues for stuff going to
1229          the ith processor
1230   */
1231   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1232   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
1233   CHKPTRQ(send_waits);
1234   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1235   starts[0] = 0;
1236   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1237   for ( i=0; i<N; i++ ) {
1238     svalues[starts[owner[i]]++] = rows[i];
1239   }
1240   ISRestoreIndices(is,&rows);
1241 
1242   starts[0] = 0;
1243   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1244   count = 0;
1245   for ( i=0; i<size; i++ ) {
1246     if (procs[i]) {
1247       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1248     }
1249   }
1250   PetscFree(starts);
1251 
1252   base = owners[rank]*bs;
1253 
1254   /*  wait on receives */
1255   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1256   source = lens + nrecvs;
1257   count  = nrecvs; slen = 0;
1258   while (count) {
1259     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1260     /* unpack receives into our local space */
1261     MPI_Get_count(&recv_status,MPI_INT,&n);
1262     source[imdex]  = recv_status.MPI_SOURCE;
1263     lens[imdex]  = n;
1264     slen += n;
1265     count--;
1266   }
1267   PetscFree(recv_waits);
1268 
1269   /* move the data into the send scatter */
1270   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1271   count = 0;
1272   for ( i=0; i<nrecvs; i++ ) {
1273     values = rvalues + i*nmax;
1274     for ( j=0; j<lens[i]; j++ ) {
1275       lrows[count++] = values[j] - base;
1276     }
1277   }
1278   PetscFree(rvalues); PetscFree(lens);
1279   PetscFree(owner); PetscFree(nprocs);
1280 
1281   /* actually zap the local rows */
1282   ierr = ISCreateGeneral(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1283   PLogObjectParent(A,istmp);
1284   PetscFree(lrows);
1285   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
1286   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1287   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1288 
1289   /* wait on sends */
1290   if (nsends) {
1291     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
1292     CHKPTRQ(send_status);
1293     MPI_Waitall(nsends,send_waits,send_status);
1294     PetscFree(send_status);
1295   }
1296   PetscFree(send_waits); PetscFree(svalues);
1297 
1298   return 0;
1299 }
1300 extern int MatPrintHelp_SeqBAIJ(Mat);
1301 #undef __FUNC__
1302 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1303 int MatPrintHelp_MPIBAIJ(Mat A)
1304 {
1305   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1306 
1307   if (!a->rank) return MatPrintHelp_SeqBAIJ(a->A);
1308   else return 0;
1309 }
1310 
1311 #undef __FUNC__
1312 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1313 int MatSetUnfactored_MPIBAIJ(Mat A)
1314 {
1315   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1316   int         ierr;
1317   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1318   return 0;
1319 }
1320 
1321 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1322 
1323 /* -------------------------------------------------------------------*/
1324 static struct _MatOps MatOps = {
1325   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1326   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1327   0,0,0,0,
1328   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1329   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1330   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1331   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1332   0,0,0,MatGetSize_MPIBAIJ,
1333   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1334   0,0,MatConvertSameType_MPIBAIJ,0,0,
1335   0,0,0,MatGetSubMatrices_MPIBAIJ,
1336   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1337   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1338   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1339 
1340 
1341 #undef __FUNC__
1342 #define __FUNC__ "MatCreateMPIBAIJ"
1343 /*@C
1344    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1345    (block compressed row).  For good matrix assembly performance
1346    the user should preallocate the matrix storage by setting the parameters
1347    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1348    performance can be increased by more than a factor of 50.
1349 
1350    Input Parameters:
1351 .  comm - MPI communicator
1352 .  bs   - size of blockk
1353 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1354            This value should be the same as the local size used in creating the
1355            y vector for the matrix-vector product y = Ax.
1356 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1357            This value should be the same as the local size used in creating the
1358            x vector for the matrix-vector product y = Ax.
1359 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1360 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1361 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1362            submatrix  (same for all local rows)
1363 .  d_nzz - array containing the number of block nonzeros in the various block rows
1364            of the in diagonal portion of the local (possibly different for each block
1365            row) or PETSC_NULL.  You must leave room for the diagonal entry even if
1366            it is zero.
1367 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1368            submatrix (same for all local rows).
1369 .  o_nzz - array containing the number of nonzeros in the various block rows of the
1370            off-diagonal portion of the local submatrix (possibly different for
1371            each block row) or PETSC_NULL.
1372 
1373    Output Parameter:
1374 .  A - the matrix
1375 
1376    Notes:
1377    The user MUST specify either the local or global matrix dimensions
1378    (possibly both).
1379 
1380    Storage Information:
1381    For a square global matrix we define each processor's diagonal portion
1382    to be its local rows and the corresponding columns (a square submatrix);
1383    each processor's off-diagonal portion encompasses the remainder of the
1384    local matrix (a rectangular submatrix).
1385 
1386    The user can specify preallocated storage for the diagonal part of
1387    the local submatrix with either d_nz or d_nnz (not both).  Set
1388    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1389    memory allocation.  Likewise, specify preallocated storage for the
1390    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1391 
1392    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1393    the figure below we depict these three local rows and all columns (0-11).
1394 
1395 $          0 1 2 3 4 5 6 7 8 9 10 11
1396 $         -------------------
1397 $  row 3  |  o o o d d d o o o o o o
1398 $  row 4  |  o o o d d d o o o o o o
1399 $  row 5  |  o o o d d d o o o o o o
1400 $         -------------------
1401 $
1402 
1403    Thus, any entries in the d locations are stored in the d (diagonal)
1404    submatrix, and any entries in the o locations are stored in the
1405    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1406    stored simply in the MATSEQBAIJ format for compressed row storage.
1407 
1408    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1409    and o_nz should indicate the number of nonzeros per row in the o matrix.
1410    In general, for PDE problems in which most nonzeros are near the diagonal,
1411    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1412    or you will get TERRIBLE performance; see the users' manual chapter on
1413    matrices.
1414 
1415 .keywords: matrix, block, aij, compressed row, sparse, parallel
1416 
1417 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1418 @*/
1419 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1420                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1421 {
1422   Mat          B;
1423   Mat_MPIBAIJ  *b;
1424   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1425 
1426   if (bs < 1) SETERRQ(1,0,"invalid block size specified");
1427   *A = 0;
1428   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
1429   PLogObjectCreate(B);
1430   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1431   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1432   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1433   MPI_Comm_size(comm,&size);
1434   if (size == 1) {
1435     B->ops.getrowij          = MatGetRowIJ_MPIBAIJ;
1436     B->ops.restorerowij      = MatRestoreRowIJ_MPIBAIJ;
1437     B->ops.lufactorsymbolic  = MatLUFactorSymbolic_MPIBAIJ;
1438     B->ops.lufactornumeric   = MatLUFactorNumeric_MPIBAIJ;
1439     B->ops.lufactor          = MatLUFactor_MPIBAIJ;
1440     B->ops.solve             = MatSolve_MPIBAIJ;
1441     B->ops.solveadd          = MatSolveAdd_MPIBAIJ;
1442     B->ops.solvetrans        = MatSolveTrans_MPIBAIJ;
1443     B->ops.solvetransadd     = MatSolveTransAdd_MPIBAIJ;
1444     B->ops.ilufactorsymbolic = MatILUFactorSymbolic_MPIBAIJ;
1445   }
1446 
1447   B->destroy    = MatDestroy_MPIBAIJ;
1448   B->view       = MatView_MPIBAIJ;
1449   B->mapping    = 0;
1450   B->factor     = 0;
1451   B->assembled  = PETSC_FALSE;
1452 
1453   B->insertmode = NOT_SET_VALUES;
1454   MPI_Comm_rank(comm,&b->rank);
1455   MPI_Comm_size(comm,&b->size);
1456 
1457   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1458     SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1459   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) SETERRQ(1,0,"either M or m should be specified");
1460   if ( M == PETSC_DECIDE && n == PETSC_DECIDE)SETERRQ(1,0,"either N or n should be specified");
1461   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
1462   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1463 
1464   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1465     work[0] = m; work[1] = n;
1466     mbs = m/bs; nbs = n/bs;
1467     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1468     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1469     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1470   }
1471   if (m == PETSC_DECIDE) {
1472     Mbs = M/bs;
1473     if (Mbs*bs != M) SETERRQ(1,0,"No of global rows must be divisible by blocksize");
1474     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1475     m   = mbs*bs;
1476   }
1477   if (n == PETSC_DECIDE) {
1478     Nbs = N/bs;
1479     if (Nbs*bs != N) SETERRQ(1,0,"No of global cols must be divisible by blocksize");
1480     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1481     n   = nbs*bs;
1482   }
1483   if (mbs*bs != m || nbs*bs != n) SETERRQ(1,0,"No of local rows, cols must be divisible by blocksize");
1484 
1485   b->m = m; B->m = m;
1486   b->n = n; B->n = n;
1487   b->N = N; B->N = N;
1488   b->M = M; B->M = M;
1489   b->bs  = bs;
1490   b->bs2 = bs*bs;
1491   b->mbs = mbs;
1492   b->nbs = nbs;
1493   b->Mbs = Mbs;
1494   b->Nbs = Nbs;
1495 
1496   /* build local table of row and column ownerships */
1497   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1498   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1499   b->cowners = b->rowners + b->size + 2;
1500   MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1501   b->rowners[0] = 0;
1502   for ( i=2; i<=b->size; i++ ) {
1503     b->rowners[i] += b->rowners[i-1];
1504   }
1505   b->rstart    = b->rowners[b->rank];
1506   b->rend      = b->rowners[b->rank+1];
1507   b->rstart_bs = b->rstart * bs;
1508   b->rend_bs   = b->rend * bs;
1509 
1510   MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1511   b->cowners[0] = 0;
1512   for ( i=2; i<=b->size; i++ ) {
1513     b->cowners[i] += b->cowners[i-1];
1514   }
1515   b->cstart    = b->cowners[b->rank];
1516   b->cend      = b->cowners[b->rank+1];
1517   b->cstart_bs = b->cstart * bs;
1518   b->cend_bs   = b->cend * bs;
1519 
1520   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1521   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1522   PLogObjectParent(B,b->A);
1523   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1524   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1525   PLogObjectParent(B,b->B);
1526 
1527   /* build cache for off array entries formed */
1528   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1529   b->donotstash  = 0;
1530   b->colmap      = 0;
1531   b->garray      = 0;
1532   b->roworiented = 1;
1533 
1534   /* stuff used in block assembly */
1535   b->barray      = 0;
1536 
1537   /* stuff used for matrix vector multiply */
1538   b->lvec        = 0;
1539   b->Mvctx       = 0;
1540 
1541   /* stuff for MatGetRow() */
1542   b->rowindices   = 0;
1543   b->rowvalues    = 0;
1544   b->getrowactive = PETSC_FALSE;
1545 
1546   *A = B;
1547   return 0;
1548 }
1549 
1550 #undef __FUNC__
1551 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1552 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1553 {
1554   Mat         mat;
1555   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1556   int         ierr, len=0, flg;
1557 
1558   *newmat       = 0;
1559   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm);
1560   PLogObjectCreate(mat);
1561   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1562   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1563   mat->destroy    = MatDestroy_MPIBAIJ;
1564   mat->view       = MatView_MPIBAIJ;
1565   mat->factor     = matin->factor;
1566   mat->assembled  = PETSC_TRUE;
1567 
1568   a->m = mat->m   = oldmat->m;
1569   a->n = mat->n   = oldmat->n;
1570   a->M = mat->M   = oldmat->M;
1571   a->N = mat->N   = oldmat->N;
1572 
1573   a->bs  = oldmat->bs;
1574   a->bs2 = oldmat->bs2;
1575   a->mbs = oldmat->mbs;
1576   a->nbs = oldmat->nbs;
1577   a->Mbs = oldmat->Mbs;
1578   a->Nbs = oldmat->Nbs;
1579 
1580   a->rstart       = oldmat->rstart;
1581   a->rend         = oldmat->rend;
1582   a->cstart       = oldmat->cstart;
1583   a->cend         = oldmat->cend;
1584   a->size         = oldmat->size;
1585   a->rank         = oldmat->rank;
1586   mat->insertmode = NOT_SET_VALUES;
1587   a->rowvalues    = 0;
1588   a->getrowactive = PETSC_FALSE;
1589   a->barray       = 0;
1590 
1591   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1592   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1593   a->cowners = a->rowners + a->size + 2;
1594   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1595   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1596   if (oldmat->colmap) {
1597     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1598     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1599     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1600   } else a->colmap = 0;
1601   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1602     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1603     PLogObjectMemory(mat,len*sizeof(int));
1604     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1605   } else a->garray = 0;
1606 
1607   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1608   PLogObjectParent(mat,a->lvec);
1609   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1610   PLogObjectParent(mat,a->Mvctx);
1611   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1612   PLogObjectParent(mat,a->A);
1613   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1614   PLogObjectParent(mat,a->B);
1615   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1616   if (flg) {
1617     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1618   }
1619   *newmat = mat;
1620   return 0;
1621 }
1622 
1623 #include "sys.h"
1624 
1625 #undef __FUNC__
1626 #define __FUNC__ "MatLoad_MPIBAIJ"
1627 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1628 {
1629   Mat          A;
1630   int          i, nz, ierr, j,rstart, rend, fd;
1631   Scalar       *vals,*buf;
1632   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1633   MPI_Status   status;
1634   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1635   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1636   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1637   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1638   int          dcount,kmax,k,nzcount,tmp;
1639 
1640 
1641   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1642   bs2  = bs*bs;
1643 
1644   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1645   if (!rank) {
1646     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1647     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1648     if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not matrix object");
1649   }
1650 
1651   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1652   M = header[1]; N = header[2];
1653 
1654   if (M != N) SETERRQ(1,0,"Can only do square matrices");
1655 
1656   /*
1657      This code adds extra rows to make sure the number of rows is
1658      divisible by the blocksize
1659   */
1660   Mbs        = M/bs;
1661   extra_rows = bs - M + bs*(Mbs);
1662   if (extra_rows == bs) extra_rows = 0;
1663   else                  Mbs++;
1664   if (extra_rows &&!rank) {
1665     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
1666   }
1667 
1668   /* determine ownership of all rows */
1669   mbs = Mbs/size + ((Mbs % size) > rank);
1670   m   = mbs * bs;
1671   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
1672   browners = rowners + size + 1;
1673   MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1674   rowners[0] = 0;
1675   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
1676   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
1677   rstart = rowners[rank];
1678   rend   = rowners[rank+1];
1679 
1680   /* distribute row lengths to all processors */
1681   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
1682   if (!rank) {
1683     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
1684     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1685     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1686     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1687     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
1688     MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
1689     PetscFree(sndcounts);
1690   }
1691   else {
1692     MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);
1693   }
1694 
1695   if (!rank) {
1696     /* calculate the number of nonzeros on each processor */
1697     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1698     PetscMemzero(procsnz,size*sizeof(int));
1699     for ( i=0; i<size; i++ ) {
1700       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
1701         procsnz[i] += rowlengths[j];
1702       }
1703     }
1704     PetscFree(rowlengths);
1705 
1706     /* determine max buffer needed and allocate it */
1707     maxnz = 0;
1708     for ( i=0; i<size; i++ ) {
1709       maxnz = PetscMax(maxnz,procsnz[i]);
1710     }
1711     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1712 
1713     /* read in my part of the matrix column indices  */
1714     nz = procsnz[0];
1715     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1716     mycols = ibuf;
1717     if (size == 1)  nz -= extra_rows;
1718     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1719     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
1720 
1721     /* read in every ones (except the last) and ship off */
1722     for ( i=1; i<size-1; i++ ) {
1723       nz = procsnz[i];
1724       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1725       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1726     }
1727     /* read in the stuff for the last proc */
1728     if ( size != 1 ) {
1729       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
1730       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1731       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
1732       MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);
1733     }
1734     PetscFree(cols);
1735   }
1736   else {
1737     /* determine buffer space needed for message */
1738     nz = 0;
1739     for ( i=0; i<m; i++ ) {
1740       nz += locrowlens[i];
1741     }
1742     ibuf = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1743     mycols = ibuf;
1744     /* receive message of column indices*/
1745     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1746     MPI_Get_count(&status,MPI_INT,&maxnz);
1747     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1748   }
1749 
1750   /* loop over local rows, determining number of off diagonal entries */
1751   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
1752   odlens = dlens + (rend-rstart);
1753   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
1754   PetscMemzero(mask,3*Mbs*sizeof(int));
1755   masked1 = mask    + Mbs;
1756   masked2 = masked1 + Mbs;
1757   rowcount = 0; nzcount = 0;
1758   for ( i=0; i<mbs; i++ ) {
1759     dcount  = 0;
1760     odcount = 0;
1761     for ( j=0; j<bs; j++ ) {
1762       kmax = locrowlens[rowcount];
1763       for ( k=0; k<kmax; k++ ) {
1764         tmp = mycols[nzcount++]/bs;
1765         if (!mask[tmp]) {
1766           mask[tmp] = 1;
1767           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
1768           else masked1[dcount++] = tmp;
1769         }
1770       }
1771       rowcount++;
1772     }
1773 
1774     dlens[i]  = dcount;
1775     odlens[i] = odcount;
1776 
1777     /* zero out the mask elements we set */
1778     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
1779     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
1780   }
1781 
1782   /* create our matrix */
1783   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
1784          CHKERRQ(ierr);
1785   A = *newmat;
1786   MatSetOption(A,MAT_COLUMNS_SORTED);
1787 
1788   if (!rank) {
1789     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
1790     /* read in my part of the matrix numerical values  */
1791     nz = procsnz[0];
1792     vals = buf;
1793     mycols = ibuf;
1794     if (size == 1)  nz -= extra_rows;
1795     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1796     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
1797 
1798     /* insert into matrix */
1799     jj      = rstart*bs;
1800     for ( i=0; i<m; i++ ) {
1801       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1802       mycols += locrowlens[i];
1803       vals   += locrowlens[i];
1804       jj++;
1805     }
1806     /* read in other processors (except the last one) and ship out */
1807     for ( i=1; i<size-1; i++ ) {
1808       nz = procsnz[i];
1809       vals = buf;
1810       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1811       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1812     }
1813     /* the last proc */
1814     if ( size != 1 ){
1815       nz = procsnz[i] - extra_rows;
1816       vals = buf;
1817       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1818       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
1819       MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
1820     }
1821     PetscFree(procsnz);
1822   }
1823   else {
1824     /* receive numeric values */
1825     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
1826 
1827     /* receive message of values*/
1828     vals = buf;
1829     mycols = ibuf;
1830     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1831     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1832     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1833 
1834     /* insert into matrix */
1835     jj      = rstart*bs;
1836     for ( i=0; i<m; i++ ) {
1837       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1838       mycols += locrowlens[i];
1839       vals   += locrowlens[i];
1840       jj++;
1841     }
1842   }
1843   PetscFree(locrowlens);
1844   PetscFree(buf);
1845   PetscFree(ibuf);
1846   PetscFree(rowners);
1847   PetscFree(dlens);
1848   PetscFree(mask);
1849   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1850   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1851   return 0;
1852 }
1853 
1854 
1855