xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 6a6a5d1dfdd5fcb6e31127fc1e7bbfb8f4c48c0a)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.100 1998/01/27 01:23:50 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
/*
     Local utility routine that creates a mapping from the global column
   number to the local number in the off-diagonal part of the local
   storage of the matrix.  This is done in a non-scalable way since the
   length of colmap equals the global matrix length.
*/
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
/* Number of extra block entries allocated whenever a row runs out of space */
#define CHUNKSIZE  10

/*
   MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) - inserts or adds one
   point entry (row,col) into the diagonal portion (baij->A) of the matrix.
   Inlined as a macro for speed; relies on the caller having in scope:
   a, A, aa, ai, aj, ailen, aimax, bs, bs2 and the scratch variables
   rp, ap, bap, brow, bcol, ridx, cidx, low, high, t, _i, ii, N, nrow, rmax, len.
   Point indices are converted to a block (brow,bcol) plus an intra-block
   offset (ridx,cidx); blocks are stored column-major within bs2 entries.
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      /* binary search narrows [low,high), then a short linear scan finds bcol */ \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          /* block already present: update the single point entry */ \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      /* nonew==1: silently ignore new nonzeros; nonew==-1: error on them */ \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space: one buffer holding values, then j, then i */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots; rows after brow shift by CHUNKSIZE */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        /* NOTE(review): nz is incremented only on the reallocation path, yet a
           new block is also inserted below when no realloc occurs -- confirm
           whether nz is meant to count inserted blocks here */ \
        a->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
113 
/*
   MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) - identical in structure
   to MatSetValues_SeqBAIJ_A_Private but operates on the off-diagonal portion
   (baij->B): it uses b, B, ba, bi, bj, bilen, bimax and the b_noinsert label.
   See the A variant above for a description of the algorithm.
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
188 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   MatSetValues_MPIBAIJ - Inserts or adds an m x n logically dense block of
   point-indexed values into the parallel BAIJ matrix.

   Input Parameters:
.  mat  - the matrix
.  m,im - number of rows and their global (point) indices
.  n,in - number of columns and their global (point) indices
.  v    - the values (row oriented or column oriented per baij->roworiented)
.  addv - INSERT_VALUES or ADD_VALUES

   Locally owned entries go into baij->A (diagonal block) or baij->B
   (off-diagonal block) via the inlined macros above; entries owned by other
   processes are stashed for communication during assembly.
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      /* row is owned by this process */
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column falls in the diagonal block A */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* column falls in the off-diagonal block B */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
            /* colmap holds local block offset + 1; 0 means column not present.
               NOTE(review): when colmap[in[j]/bs]==0 and in[j]%bs>0 this yields
               a non-negative col and skips DisAssemble -- confirm intended */
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              /* new off-process column: fall back to unassembled form */
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* row owned elsewhere: stash values for communication at assembly time */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
          for ( j=0; j<n; j++ ) {
            ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
#include <math.h>
/* Multiplicative hashing constant: fractional part of the golden ratio */
#define HASH_KEY 0.6180339887
/* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
/* HASH(size,key,tmp) maps key into [0,size) via multiplicative hashing;
   tmp is a caller-supplied double scratch variable.  Note: key and size
   are evaluated more than once -- pass only side-effect-free expressions. */
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
379 #undef __FUNC__
380 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
381 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
382 {
383   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
384   int         ierr,i,j,row,col;
385   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
386   int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
387   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
388   int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
389   double      tmp;
390   Scalar      ** HD = baij->hd,value;
391 
392 
393   PetscFunctionBegin;
394 
395   for ( i=0; i<m; i++ ) {
396 #if defined(USE_PETSC_BOPT_g)
397     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
398     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
399 #endif
400       row = im[i];
401     if (row >= rstart_orig && row < rend_orig) {
402       for ( j=0; j<n; j++ ) {
403         col = in[j];
404         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
405         /* Look up into the Hash Table */
406         key = (row/bs)*Nbs+(col/bs)+1;
407         h1  = HASH(size,key,tmp);
408 
409 
410         idx = h1;
411 #if defined(USE_PETSC_BOPT_g)
412         insert_ct++;
413         total_ct++;
414         if (HT[idx] != key) {
415           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
416           if (idx == size) {
417             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
418             if (idx == h1) {
419               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
420             }
421           }
422         }
423 #else
424         if (HT[idx] != key) {
425           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
426           if (idx == size) {
427             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
428             if (idx == h1) {
429               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
430             }
431           }
432         }
433 #endif
434         /* A HASH table entry is found, so insert the values at the correct address */
435         if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
436         else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
437         break;
438       }
439     } else {
440       if (roworiented && !baij->donotstash) {
441         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
442       } else {
443         if (!baij->donotstash) {
444           row = im[i];
445 	  for ( j=0; j<n; j++ ) {
446 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
447           }
448         }
449       }
450     }
451   }
452 #if defined(USE_PETSC_BOPT_g)
453   baij->ht_total_ct = total_ct;
454   baij->ht_insert_ct = insert_ct;
455 #endif
456   PetscFunctionReturn(0);
457 }
458 
#undef __FUNC__
#define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
/*
   MatSetValuesBlocked_MPIBAIJ_HT - Hash-table variant of
   MatSetValuesBlocked for MPIBAIJ; block indices, whole bs x bs blocks.
   Requires an existing hash table; a lookup miss is an error.

   Input Parameters:
.  mat  - the matrix
.  m,im - number of block rows and their global block indices
.  n,in - number of block columns and their global block indices
.  v    - m*n blocks of bs*bs values (layout per baij->roworiented)
.  addv - INSERT_VALUES or ADD_VALUES
*/
int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,ii,jj,row,col,k,l;
  int         roworiented = baij->roworiented,rstart=baij->rstart ;
  int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;

  int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
  double      tmp;
  Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;

  PetscFunctionBegin;

  /* distance between consecutive rows (or columns) of one block inside v */
  if (roworiented) {
    stepval = (n-1)*bs;
  } else {
    stepval = (m-1)*bs;
  }
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    row   = im[i];
    v_t   = v + i*bs2;
    if (row >= rstart && row < rend) {
      for ( j=0; j<n; j++ ) {
        col = in[j];

        /* Look up the (block row, block col) key in the hash table;
           linear probing wraps around once at the end of the table */
        key = row*Nbs+col+1;
        h1  = HASH(size,key,tmp);

        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        total_ct++;
        insert_ct++;
        if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        /* HD[idx] points at the stored bs x bs block (column-major) */
        baij_a = HD[idx];
        if (roworiented) {
          /*value = v + i*(stepval+bs)*bs + j*bs;*/
          /* value = v + (i*(stepval+bs)+j)*bs; */
          /* NOTE(review): v_t advances by bs per column, transposing row-major
             input rows into the column-major block via the jj+=bs stride */
          value = v_t;
          v_t  += bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        } else {
          /* column-oriented input: copy bs contiguous entries per column */
          value = v + j*(stepval+bs)*bs + i*bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        }
      }
    } else {
      /* off-process block row: stash the individual point values */
      if (!baij->donotstash) {
        if (roworiented ) {
          row   = im[i]*bs;
          value = v + i*(stepval+bs)*bs;
          for ( j=0; j<bs; j++,row++ ) {
            for ( k=0; k<n; k++ ) {
              for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        } else {
          for ( j=0; j<n; j++ ) {
            value = v + j*(stepval+bs)*bs + i*bs;
            col   = in[j]*bs;
            for ( k=0; k<bs; k++,col++,value+=stepval) {
              for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
587 #undef __FUNC__
588 #define __FUNC__ "MatGetValues_MPIBAIJ"
589 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
590 {
591   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
592   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
593   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
594 
595   for ( i=0; i<m; i++ ) {
596     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
597     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
598     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
599       row = idxm[i] - bsrstart;
600       for ( j=0; j<n; j++ ) {
601         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
602         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
603         if (idxn[j] >= bscstart && idxn[j] < bscend){
604           col = idxn[j] - bscstart;
605           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
606         } else {
607           if (!baij->colmap) {
608             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
609           }
610           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
611              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
612           else {
613             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
614             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
615           }
616         }
617       }
618     } else {
619       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
620     }
621   }
622  PetscFunctionReturn(0);
623 }
624 
625 #undef __FUNC__
626 #define __FUNC__ "MatNorm_MPIBAIJ"
627 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
628 {
629   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
630   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
631   int        ierr, i,bs2=baij->bs2;
632   double     sum = 0.0;
633   Scalar     *v;
634 
635   PetscFunctionBegin;
636   if (baij->size == 1) {
637     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
638   } else {
639     if (type == NORM_FROBENIUS) {
640       v = amat->a;
641       for (i=0; i<amat->nz*bs2; i++ ) {
642 #if defined(USE_PETSC_COMPLEX)
643         sum += real(conj(*v)*(*v)); v++;
644 #else
645         sum += (*v)*(*v); v++;
646 #endif
647       }
648       v = bmat->a;
649       for (i=0; i<bmat->nz*bs2; i++ ) {
650 #if defined(USE_PETSC_COMPLEX)
651         sum += real(conj(*v)*(*v)); v++;
652 #else
653         sum += (*v)*(*v); v++;
654 #endif
655       }
656       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
657       *norm = sqrt(*norm);
658     } else {
659       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
660     }
661   }
662   PetscFunctionReturn(0);
663 }
664 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   MatAssemblyBegin_MPIBAIJ - Starts communicating stashed off-process
   values: counts messages per destination, posts nonblocking receives,
   packs and sends (index, index, value) triples, then records the
   communication state in baij for MatAssemblyEnd to complete.
   The mode argument is unused here; it matters only at assembly end.
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  /* nprocs[j] = #stashed entries for proc j; procs[j] = 1 if any; owner[i] = dest of entry i */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    /* rowners holds block-row ownership; scale by bs for point indices */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  /* pack (row index, col index, value) triples grouped by destination */
  svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* starts[] was consumed by the packing loop above; rebuild it for the sends */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(0,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* stash the communication state for MatAssemblyEnd_MPIBAIJ */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
761 
762 /*
763   Creates the hash table, and sets the table
764   This table is created only once.
765   If new entried need to be added to the matrix
766   then the hash table has to be destroyed and
767   recreated.
768 */
769 #undef __FUNC__
770 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
771 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
772 {
773   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
774   Mat         A = baij->A, B=baij->B;
775   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
776   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
777   int         size,ct=0,max=0,bs2=baij->bs2,rstart=baij->rstart;
778   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
779   int         *HT,key;
780   Scalar      **HD;
781   double      tmp;
782 
783   PetscFunctionBegin;
784   baij->ht_size=(int)(factor*nz);
785   size = baij->ht_size;
786 
787   if (baij->ht) {
788     PetscFunctionReturn(0);
789   }
790 
791   /* Allocate Memory for Hash Table */
792   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
793   baij->ht = (int*)(baij->hd + size);
794   HD = baij->hd;
795   HT = baij->ht;
796 
797 
798   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
799 
800 
801   /* Loop Over A */
802   for ( i=0; i<a->mbs; i++ ) {
803     for ( j=ai[i]; j<ai[i+1]; j++ ) {
804       row = i+rstart;
805       col = aj[j]+cstart;
806 
807       key = row*Nbs + col + 1;
808       h1  = HASH(size,key,tmp);
809       for ( k=0; k<size; k++ ){
810         if (HT[(h1+k)%size] == 0.0) {
811           HT[(h1+k)%size] = key;
812           HD[(h1+k)%size] = a->a + j*bs2;
813           break;
814 #if defined(USE_PETSC_BOPT_g)
815         } else {
816           ct++;
817 #endif
818         }
819       }
820 #if defined(USE_PETSC_BOPT_g)
821       if (k> max) max = k;
822 #endif
823     }
824   }
825   /* Loop Over B */
826   for ( i=0; i<b->mbs; i++ ) {
827     for ( j=bi[i]; j<bi[i+1]; j++ ) {
828       row = i+rstart;
829       col = garray[bj[j]];
830       key = row*Nbs + col + 1;
831       h1  = HASH(size,key,tmp);
832       for ( k=0; k<size; k++ ){
833         if (HT[(h1+k)%size] == 0.0) {
834           HT[(h1+k)%size] = key;
835           HD[(h1+k)%size] = b->a + j*bs2;
836           break;
837 #if defined(USE_PETSC_BOPT_g)
838         } else {
839           ct++;
840 #endif
841         }
842       }
843 #if defined(USE_PETSC_BOPT_g)
844       if (k> max) max = k;
845 #endif
846     }
847   }
848 
849   /* Print Summary */
850 #if defined(USE_PETSC_BOPT_g)
851   for ( i=0,j=0; i<size; i++)
852     if (HT[i]) {j++;}
853   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
854            (j== 0)? 0.0:((double)(ct+j))/j,max);
855 #endif
856   PetscFunctionReturn(0);
857 }
858 
#undef __FUNC__
#define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   Completes the assembly started in MatAssemblyBegin_MPIBAIJ: drains the
   posted receives of off-processor (row,col,value) triples, inserts them
   into the local A (diagonal) or B (off-diagonal) block, completes the
   sends, assembles both blocks, and optionally installs the hash-table
   MatSetValues variants when -use_hash is given.
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled,flg;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space; each entry is a triple
       (global row, global col, value), hence the factors of 3 */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      /* row/col arrive encoded as Scalars; convert back to int */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* column lies in our diagonal block */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        if (mat->was_assembled) {
          /* colmap translates global to local off-diagonal columns;
             built lazily on first need */
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
          col = (baij->colmap[col/bs]) - 1 + col%bs;
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            /* column not in the existing pattern: disassemble and fall
               back to the original global column index */
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselfs, in order that we may reassemble. */
  ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: build the scatter for MatMult etc. */
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

#if defined(USE_PETSC_BOPT_g)
  /* debug builds report and reset the hash-table search statistics */
  if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
             ((double)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  /* -use_hash [factor]: build the lookup table once, then route future
     MatSetValues calls through the hash-table fast paths */
  ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
  if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    double fact = 1.39;
    ierr = OptionsGetDouble(PETSC_NULL,"-use_hash",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;  /* guard against a degenerate load factor */
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Hash table Factor used %5.2f\n",fact);
    ierr = MatCreateHashTable_MPIBAIJ_Private(mat,fact); CHKERRQ(ierr);
    mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }

  /* release the MatGetRow work space, if it was ever allocated */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
949 
950 #undef __FUNC__
951 #define __FUNC__ "MatView_MPIBAIJ_Binary"
952 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
953 {
954   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
955   int          ierr;
956 
957   PetscFunctionBegin;
958   if (baij->size == 1) {
959     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
960   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
961   PetscFunctionReturn(0);
962 }
963 
#undef __FUNC__
#define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
/*
   Common viewer path for ASCII, Draw, and Matlab viewers.  Info formats
   print per-process summaries; otherwise the whole matrix is either
   printed process-by-process (ASCII_FILE) or gathered onto process 0
   into a temporary matrix and viewed there.
*/
static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  int          ierr, format,rank,bs = baij->bs;
  FILE         *fd;
  ViewerType   vtype;

  PetscFunctionBegin;
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerGetFormat(viewer,&format);
    if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
      /* one summary line per process, serialized via sequential phases */
      MatInfo info;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);
      PetscSequentialPhaseBegin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
              baij->bs,(int)info.memory);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
      fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
      fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      fflush(fd);
      PetscSequentialPhaseEnd(mat->comm,1);
      ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == VIEWER_FORMAT_ASCII_INFO) {
      PetscPrintf(mat->comm,"  block size is %d\n",bs);
      PetscFunctionReturn(0);
    }
  }

  if (vtype == DRAW_VIEWER) {
    /* nothing to do for a null draw context */
    Draw       draw;
    PetscTruth isnull;
    ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
    ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (vtype == ASCII_FILE_VIEWER) {
    /* each process prints its own A and B blocks in rank order */
    ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
            baij->cstart*bs,baij->cend*bs);
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  } else {
    int size = baij->size;
    rank = baij->rank;
    if (size == 1) {
      ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    } else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqBAIJ *Aloc;
      int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
      int         mbs=baij->mbs;
      Scalar      *a;

      /* process 0 owns all M x N entries of the gathered copy; the
         other processes contribute zero local rows */
      if (!rank) {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      } else {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_SeqBAIJ*) baij->A->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart;
      /* rvals holds the bs consecutive global point-row indices of block row i */
      rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);

      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = (baij->cstart+aj[j])*bs;
          /* insert the bs*bs block one column at a time (a advances by bs) */
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      /* copy over the B part; columns come from the garray map */
      Aloc = (Mat_SeqBAIJ*) baij->B->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart*bs;
      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = baij->garray[aj[j]]*bs;
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      PetscFree(rvals);
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      /*
         Everyone has to call to draw the matrix since the graphics waits are
         synchronized across all processors that share the Draw object
      */
      if (!rank || vtype == DRAW_VIEWER) {
        ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1084 
1085 
1086 
1087 #undef __FUNC__
1088 #define __FUNC__ "MatView_MPIBAIJ"
1089 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
1090 {
1091   Mat         mat = (Mat) obj;
1092   int         ierr;
1093   ViewerType  vtype;
1094 
1095   PetscFunctionBegin;
1096   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1097   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1098       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1099     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1100   } else if (vtype == BINARY_FILE_VIEWER) {
1101     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1102   }
1103   PetscFunctionReturn(0);
1104 }
1105 
#undef __FUNC__
#define __FUNC__ "MatDestroy_MPIBAIJ"
/*
   Releases all storage owned by a parallel block AIJ matrix: the
   communication stash, the two sequential blocks, the lazily created
   work objects, and finally the matrix header itself.
*/
int MatDestroy_MPIBAIJ(PetscObject obj)
{
  Mat         mat = (Mat) obj;
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  PetscFunctionBegin;
#if defined(USE_PETSC_LOG)
  PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
  PetscFree(baij->rowners);
  /* destroy the diagonal (A) and off-diagonal (B) sequential blocks */
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
  /* the remaining objects are created lazily, so each may be absent */
  if (baij->colmap) PetscFree(baij->colmap);
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  if (baij->barray) PetscFree(baij->barray);
  /* freeing hd also frees ht: one allocation holds both arrays */
  if (baij->hd) PetscFree(baij->hd);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}
1135 
1136 #undef __FUNC__
1137 #define __FUNC__ "MatMult_MPIBAIJ"
1138 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1139 {
1140   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1141   int         ierr, nt;
1142 
1143   PetscFunctionBegin;
1144   VecGetLocalSize_Fast(xx,nt);
1145   if (nt != a->n) {
1146     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1147   }
1148   VecGetLocalSize_Fast(yy,nt);
1149   if (nt != a->m) {
1150     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1151   }
1152   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1153   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1154   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1155   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1156   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 #undef __FUNC__
1161 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1162 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1163 {
1164   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1165   int        ierr;
1166 
1167   PetscFunctionBegin;
1168   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1169   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1170   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1171   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 #undef __FUNC__
1176 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1177 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1178 {
1179   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1180   int         ierr;
1181 
1182   PetscFunctionBegin;
1183   /* do nondiagonal part */
1184   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1185   /* send it on its way */
1186   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1187   /* do local part */
1188   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1189   /* receive remote parts: note this assumes the values are not actually */
1190   /* inserted in yy until the next line, which is true for my implementation*/
1191   /* but is not perhaps always true. */
1192   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 #undef __FUNC__
1197 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1198 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1199 {
1200   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1201   int         ierr;
1202 
1203   PetscFunctionBegin;
1204   /* do nondiagonal part */
1205   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1206   /* send it on its way */
1207   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1208   /* do local part */
1209   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1210   /* receive remote parts: note this assumes the values are not actually */
1211   /* inserted in yy until the next line, which is true for my implementation*/
1212   /* but is not perhaps always true. */
1213   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 /*
1218   This only works correctly for square matrices where the subblock A->A is the
1219    diagonal block
1220 */
1221 #undef __FUNC__
1222 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1223 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1224 {
1225   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1226   int         ierr;
1227 
1228   PetscFunctionBegin;
1229   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1230   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 #undef __FUNC__
1235 #define __FUNC__ "MatScale_MPIBAIJ"
1236 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1237 {
1238   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1239   int         ierr;
1240 
1241   PetscFunctionBegin;
1242   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1243   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1244   PetscFunctionReturn(0);
1245 }
1246 
1247 #undef __FUNC__
1248 #define __FUNC__ "MatGetSize_MPIBAIJ"
1249 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1250 {
1251   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1252 
1253   PetscFunctionBegin;
1254   if (m) *m = mat->M;
1255   if (n) *n = mat->N;
1256   PetscFunctionReturn(0);
1257 }
1258 
#undef __FUNC__
#define __FUNC__ "MatGetLocalSize_MPIBAIJ"
/*
   Reports the local row count and a column count for this process.
   NOTE(review): *n is set to the GLOBAL column count (mat->N), not the
   local count mat->n used elsewhere in this file (e.g. the MatMult size
   check); confirm this asymmetry is intentional before relying on it.
*/
int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;

  PetscFunctionBegin;
  *m = mat->m; *n = mat->N;
  PetscFunctionReturn(0);
}
1269 
1270 #undef __FUNC__
1271 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1272 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1273 {
1274   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1275 
1276   PetscFunctionBegin;
1277   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1282 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1283 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   Returns one locally owned (point) row of the parallel matrix with
   global column indices, by fetching the corresponding rows of the A
   (diagonal) and B (off-diagonal) blocks and merging them in increasing
   global column order.  Only rows in [rstart*bs, rend*bs) are supported.
   Must be paired with MatRestoreRow.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be "checked out" at a time */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      /* combined block count of block row i across A and B */
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation: values first, then the index array right behind it */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* only request from A/B what the caller actually asked for; B's column
     work array is also needed when values are requested, for the merge */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;  /* local block column of B -> global block column */
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      /* imark = number of leading B entries whose global column precedes
         the diagonal block; layout is B[0..imark) | A | B[imark..nzB) */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark was already determined in the values pass above */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          /* values were not requested: find imark here instead */
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  /* the merged copies live in mat->rowvalues/rowindices, so the block
     rows can be returned immediately */
  ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1365 
1366 #undef __FUNC__
1367 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1368 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1369 {
1370   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1371 
1372   PetscFunctionBegin;
1373   if (baij->getrowactive == PETSC_FALSE) {
1374     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1375   }
1376   baij->getrowactive = PETSC_FALSE;
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNC__
1381 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1382 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1383 {
1384   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1385 
1386   PetscFunctionBegin;
1387   *bs = baij->bs;
1388   PetscFunctionReturn(0);
1389 }
1390 
1391 #undef __FUNC__
1392 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1393 int MatZeroEntries_MPIBAIJ(Mat A)
1394 {
1395   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1396   int         ierr;
1397 
1398   PetscFunctionBegin;
1399   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1400   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 #undef __FUNC__
1405 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1406 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1407 {
1408   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1409   Mat         A = a->A, B = a->B;
1410   int         ierr;
1411   double      isend[5], irecv[5];
1412 
1413   PetscFunctionBegin;
1414   info->block_size     = (double)a->bs;
1415   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1416   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1417   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1418   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1419   if (flag == MAT_LOCAL) {
1420     info->nz_used      = isend[0];
1421     info->nz_allocated = isend[1];
1422     info->nz_unneeded  = isend[2];
1423     info->memory       = isend[3];
1424     info->mallocs      = isend[4];
1425   } else if (flag == MAT_GLOBAL_MAX) {
1426     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1427     info->nz_used      = irecv[0];
1428     info->nz_allocated = irecv[1];
1429     info->nz_unneeded  = irecv[2];
1430     info->memory       = irecv[3];
1431     info->mallocs      = irecv[4];
1432   } else if (flag == MAT_GLOBAL_SUM) {
1433     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1434     info->nz_used      = irecv[0];
1435     info->nz_allocated = irecv[1];
1436     info->nz_unneeded  = irecv[2];
1437     info->memory       = irecv[3];
1438     info->mallocs      = irecv[4];
1439   }
1440   info->rows_global       = (double)a->M;
1441   info->columns_global    = (double)a->N;
1442   info->rows_local        = (double)a->m;
1443   info->columns_local     = (double)a->N;
1444   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1445   info->fill_ratio_needed = 0;
1446   info->factor_mallocs    = 0;
1447   PetscFunctionReturn(0);
1448 }
1449 
1450 #undef __FUNC__
1451 #define __FUNC__ "MatSetOption_MPIBAIJ"
1452 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1453 {
1454   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1455 
1456   PetscFunctionBegin;
1457   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1458       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1459       op == MAT_COLUMNS_UNSORTED ||
1460       op == MAT_COLUMNS_SORTED ||
1461       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1462       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1463         MatSetOption(a->A,op);
1464         MatSetOption(a->B,op);
1465   } else if (op == MAT_ROW_ORIENTED) {
1466         a->roworiented = 1;
1467         MatSetOption(a->A,op);
1468         MatSetOption(a->B,op);
1469   } else if (op == MAT_ROWS_SORTED ||
1470              op == MAT_ROWS_UNSORTED ||
1471              op == MAT_SYMMETRIC ||
1472              op == MAT_STRUCTURALLY_SYMMETRIC ||
1473              op == MAT_YES_NEW_DIAGONALS)
1474     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1475   else if (op == MAT_COLUMN_ORIENTED) {
1476     a->roworiented = 0;
1477     MatSetOption(a->A,op);
1478     MatSetOption(a->B,op);
1479   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1480     a->donotstash = 1;
1481   } else if (op == MAT_NO_NEW_DIAGONALS) {
1482     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1483   } else {
1484     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1485   }
1486   PetscFunctionReturn(0);
1487 }
1488 
1489 #undef __FUNC__
1490 #define __FUNC__ "MatTranspose_MPIBAIJ("
1491 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1492 {
1493   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1494   Mat_SeqBAIJ *Aloc;
1495   Mat        B;
1496   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1497   int        bs=baij->bs,mbs=baij->mbs;
1498   Scalar     *a;
1499 
1500   PetscFunctionBegin;
1501   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1502   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1503   CHKERRQ(ierr);
1504 
1505   /* copy over the A part */
1506   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1507   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1508   row = baij->rstart;
1509   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1510 
1511   for ( i=0; i<mbs; i++ ) {
1512     rvals[0] = bs*(baij->rstart + i);
1513     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1514     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1515       col = (baij->cstart+aj[j])*bs;
1516       for (k=0; k<bs; k++ ) {
1517         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1518         col++; a += bs;
1519       }
1520     }
1521   }
1522   /* copy over the B part */
1523   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1524   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1525   row = baij->rstart*bs;
1526   for ( i=0; i<mbs; i++ ) {
1527     rvals[0] = bs*(baij->rstart + i);
1528     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1529     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1530       col = baij->garray[aj[j]]*bs;
1531       for (k=0; k<bs; k++ ) {
1532         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1533         col++; a += bs;
1534       }
1535     }
1536   }
1537   PetscFree(rvals);
1538   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1539   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1540 
1541   if (matout != PETSC_NULL) {
1542     *matout = B;
1543   } else {
1544     /* This isn't really an in-place transpose .... but free data structures from baij */
1545     PetscFree(baij->rowners);
1546     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1547     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1548     if (baij->colmap) PetscFree(baij->colmap);
1549     if (baij->garray) PetscFree(baij->garray);
1550     if (baij->lvec) VecDestroy(baij->lvec);
1551     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1552     PetscFree(baij);
1553     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1554     PetscHeaderDestroy(B);
1555   }
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 #undef __FUNC__
1560 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1561 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1562 {
1563   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1564   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1565   int ierr,s1,s2,s3;
1566 
1567   PetscFunctionBegin;
1568   if (ll)  {
1569     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1570     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1571     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1572     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1573     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1574   }
1575   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 #undef __FUNC__
1580 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1581 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1582 {
1583   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1584   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1585   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1586   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1587   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1588   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1589   MPI_Comm       comm = A->comm;
1590   MPI_Request    *send_waits,*recv_waits;
1591   MPI_Status     recv_status,*send_status;
1592   IS             istmp;
1593 
1594   PetscFunctionBegin;
1595   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1596   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1597 
1598   /*  first count number of contributors to each processor */
1599   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1600   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1601   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1602   for ( i=0; i<N; i++ ) {
1603     idx = rows[i];
1604     found = 0;
1605     for ( j=0; j<size; j++ ) {
1606       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1607         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1608       }
1609     }
1610     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1611   }
1612   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1613 
1614   /* inform other processors of number of messages and max length*/
1615   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1616   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1617   nrecvs = work[rank];
1618   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1619   nmax   = work[rank];
1620   PetscFree(work);
1621 
1622   /* post receives:   */
1623   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1624   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1625   for ( i=0; i<nrecvs; i++ ) {
1626     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1627   }
1628 
1629   /* do sends:
1630      1) starts[i] gives the starting index in svalues for stuff going to
1631      the ith processor
1632   */
1633   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1634   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1635   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1636   starts[0] = 0;
1637   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1638   for ( i=0; i<N; i++ ) {
1639     svalues[starts[owner[i]]++] = rows[i];
1640   }
1641   ISRestoreIndices(is,&rows);
1642 
1643   starts[0] = 0;
1644   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1645   count = 0;
1646   for ( i=0; i<size; i++ ) {
1647     if (procs[i]) {
1648       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1649     }
1650   }
1651   PetscFree(starts);
1652 
1653   base = owners[rank]*bs;
1654 
1655   /*  wait on receives */
1656   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1657   source = lens + nrecvs;
1658   count  = nrecvs; slen = 0;
1659   while (count) {
1660     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1661     /* unpack receives into our local space */
1662     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1663     source[imdex]  = recv_status.MPI_SOURCE;
1664     lens[imdex]  = n;
1665     slen += n;
1666     count--;
1667   }
1668   PetscFree(recv_waits);
1669 
1670   /* move the data into the send scatter */
1671   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1672   count = 0;
1673   for ( i=0; i<nrecvs; i++ ) {
1674     values = rvalues + i*nmax;
1675     for ( j=0; j<lens[i]; j++ ) {
1676       lrows[count++] = values[j] - base;
1677     }
1678   }
1679   PetscFree(rvalues); PetscFree(lens);
1680   PetscFree(owner); PetscFree(nprocs);
1681 
1682   /* actually zap the local rows */
1683   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1684   PLogObjectParent(A,istmp);
1685 
1686   ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1687   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1688   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1689 
1690   if (diag) {
1691     for ( i = 0; i < slen; i++ ) {
1692       row = lrows[i] + rstart_bs;
1693       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1694     }
1695     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1696     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1697   }
1698   PetscFree(lrows);
1699 
1700   /* wait on sends */
1701   if (nsends) {
1702     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1703     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1704     PetscFree(send_status);
1705   }
1706   PetscFree(send_waits); PetscFree(svalues);
1707 
1708   PetscFunctionReturn(0);
1709 }
1710 extern int MatPrintHelp_SeqBAIJ(Mat);
1711 #undef __FUNC__
1712 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1713 int MatPrintHelp_MPIBAIJ(Mat A)
1714 {
1715   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1716   int         ierr;
1717 
1718   PetscFunctionBegin;
1719   if (!a->rank) {
1720     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1721   }
1722   PetscFunctionReturn(0);
1723 }
1724 
1725 #undef __FUNC__
1726 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1727 int MatSetUnfactored_MPIBAIJ(Mat A)
1728 {
1729   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1730   int         ierr;
1731 
1732   PetscFunctionBegin;
1733   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1738 
1739 /* -------------------------------------------------------------------*/
/*
   Operations table ("virtual function table") for the MATMPIBAIJ format.
   Entries are positional; a 0 means the corresponding Mat operation is
   not provided by this matrix type.
*/
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
  0,0,0,0,
  0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
  0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,MatConvertSameType_MPIBAIJ,0,0,
  0,0,0,MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
  0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1755 
1756 
1757 #undef __FUNC__
1758 #define __FUNC__ "MatCreateMPIBAIJ"
1759 /*@C
1760    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1761    (block compressed row).  For good matrix assembly performance
1762    the user should preallocate the matrix storage by setting the parameters
1763    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1764    performance can be increased by more than a factor of 50.
1765 
1766    Input Parameters:
1767 .  comm - MPI communicator
.  bs   - size of block
1769 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1770            This value should be the same as the local size used in creating the
1771            y vector for the matrix-vector product y = Ax.
1772 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1773            This value should be the same as the local size used in creating the
1774            x vector for the matrix-vector product y = Ax.
1775 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1776 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1777 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1778            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           of the diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.  You must leave room for the diagonal
           entry even if it is zero.
1783 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1784            submatrix (same for all local rows).
.  o_nnz - array containing the number of block nonzeros in the various block rows
           of the off-diagonal portion of the local submatrix (possibly different
           for each block row) or PETSC_NULL.
1788 
1789    Output Parameter:
1790 .  A - the matrix
1791 
1792    Notes:
1793    The user MUST specify either the local or global matrix dimensions
1794    (possibly both).
1795 
1796    Storage Information:
1797    For a square global matrix we define each processor's diagonal portion
1798    to be its local rows and the corresponding columns (a square submatrix);
1799    each processor's off-diagonal portion encompasses the remainder of the
1800    local matrix (a rectangular submatrix).
1801 
1802    The user can specify preallocated storage for the diagonal part of
1803    the local submatrix with either d_nz or d_nnz (not both).  Set
1804    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1805    memory allocation.  Likewise, specify preallocated storage for the
1806    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1807 
1808    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1809    the figure below we depict these three local rows and all columns (0-11).
1810 
1811 $          0 1 2 3 4 5 6 7 8 9 10 11
1812 $         -------------------
1813 $  row 3  |  o o o d d d o o o o o o
1814 $  row 4  |  o o o d d d o o o o o o
1815 $  row 5  |  o o o d d d o o o o o o
1816 $         -------------------
1817 $
1818 
1819    Thus, any entries in the d locations are stored in the d (diagonal)
1820    submatrix, and any entries in the o locations are stored in the
1821    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1822    stored simply in the MATSEQBAIJ format for compressed row storage.
1823 
1824    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1825    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1826    In general, for PDE problems in which most nonzeros are near the diagonal,
1827    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1828    or you will get TERRIBLE performance; see the users' manual chapter on
1829    matrices.
1830 
1831 .keywords: matrix, block, aij, compressed row, sparse, parallel
1832 
1833 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1834 @*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  MPI_Comm_size(comm,&size);
  /* uniprocessor case: build a sequential BAIJ matrix instead */
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));

  B->destroy    = MatDestroy_MPIBAIJ;
  B->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* consistency checks on the allowed combinations of local/global sizes */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* when both a local and a global size are given the local one wins;
     the global size is recomputed from the local ones below */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* global sizes: sum the local sizes over all processes */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* local sizes: split the block rows/cols as evenly as possible, the
     first (Mbs % size) processes receiving one extra block */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  /* cache the sizes, in both point (m,n,M,N) and block (mbs,...) units */
  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  /* turn the gathered per-process counts into prefix-sum offsets */
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  b->rstart_bs = b->rstart * bs;     /* ownership range in point (non-block) units */
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* create the two local sequential blocks; note the off-diagonal part
     b->B is created with the full global column width N */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  *A = B;
  PetscFunctionReturn(0);
}
1977 
#undef __FUNC__
#define __FUNC__ "MatConvertSameType_MPIBAIJ"
/*
   MatConvertSameType_MPIBAIJ - Builds a new MPIBAIJ matrix that duplicates
   matin: sizes and ownership tables are copied, the colmap/garray lookup
   arrays are deep-copied when present, the multiply scatter context is
   copied, and the two sequential blocks are duplicated via MatConvert.

   NOTE(review): the cpvalues argument is never consulted in this routine;
   the sequential blocks are always duplicated with MatConvert(...,MATSAME,...)
   -- confirm whether a values-not-copied mode was intended.
*/
static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
{
  Mat         mat;
  Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
  int         ierr, len=0, flg;

  PetscFunctionBegin;
  *newmat       = 0;
  PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
  PLogObjectCreate(mat);
  mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
  PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
  mat->destroy    = MatDestroy_MPIBAIJ;
  mat->view       = MatView_MPIBAIJ;
  mat->factor     = matin->factor;
  /* the copy is marked assembled immediately */
  mat->assembled  = PETSC_TRUE;

  /* copy sizes, in both point and block units */
  a->m = mat->m   = oldmat->m;
  a->n = mat->n   = oldmat->n;
  a->M = mat->M   = oldmat->M;
  a->N = mat->N   = oldmat->N;

  a->bs  = oldmat->bs;
  a->bs2 = oldmat->bs2;
  a->mbs = oldmat->mbs;
  a->nbs = oldmat->nbs;
  a->Mbs = oldmat->Mbs;
  a->Nbs = oldmat->Nbs;

  a->rstart       = oldmat->rstart;
  a->rend         = oldmat->rend;
  a->cstart       = oldmat->cstart;
  a->cend         = oldmat->cend;
  a->size         = oldmat->size;
  a->rank         = oldmat->rank;
  mat->insertmode = NOT_SET_VALUES;
  a->rowvalues    = 0;
  a->getrowactive = PETSC_FALSE;
  a->barray       = 0;

  /* copy the row- and column-ownership tables (stored in one allocation) */
  a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
  PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  a->cowners = a->rowners + a->size + 2;
  PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
  ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
  /* deep-copy the global-to-local column map, if it has been built */
  if (oldmat->colmap) {
    a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
    PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
    PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
  } else a->colmap = 0;
  /* deep-copy the off-diagonal global column list, if present and nonempty */
  if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
    a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
    PLogObjectMemory(mat,len*sizeof(int));
    PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
  } else a->garray = 0;

  /* duplicate the structures used for matrix-vector multiply */
  ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
  PLogObjectParent(mat,a->lvec);
  ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,a->Mvctx);
  /* duplicate the diagonal and off-diagonal sequential blocks */
  ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
  PLogObjectParent(mat,a->A);
  ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
  PLogObjectParent(mat,a->B);
  ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
  if (flg) {
    ierr = MatPrintHelp(mat); CHKERRQ(ierr);
  }
  *newmat = mat;
  PetscFunctionReturn(0);
}
2051 
2052 #include "sys.h"
2053 
#undef __FUNC__
#define __FUNC__ "MatLoad_MPIBAIJ"
/*
   MatLoad_MPIBAIJ - Loads a matrix stored in PETSc binary format from a
   viewer and distributes it as an MPIBAIJ matrix.

   Protocol: process 0 reads the header and all row lengths, scatters the
   row lengths with MPI_Scatterv, then reads the column indices and the
   numerical values one process's portion at a time, shipping each portion
   with MPI_Send (keeping only its own).  If the global size is not
   divisible by the block size (option -matload_block_size), extra rows
   and columns are appended, each padded row receiving a single diagonal
   entry with value 1.0.  Only square matrices are accepted.
*/
int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;

  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  /* only process 0 touches the file */
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);
  if (extra_rows == bs) extra_rows = 0;  /* M was already divisible by bs */
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows (rowners in block units, browners in
     point units) */
  mbs = Mbs/size + ((Mbs % size) > rank);
  m   = mbs * bs;
  rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
  browners = rowners + size + 1;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
  for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    /* each padded row holds exactly one (diagonal) entry */
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices  */
    nz = procsnz[0];
    ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* on one process the padded entries are not on disk; read fewer and
       append the padded diagonal column indices by hand */
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries;
     mask[] records which block columns were already seen in the current
     block row, masked1/masked2 remember them so the mask can be reset */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,3*Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        tmp = mycols[nzcount++]/bs;   /* block column of this entry */
        if (!mask[tmp]) {
          mask[tmp] = 1;
          /* classify the block column as off-diagonal or diagonal */
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix, preallocated exactly from the counts above */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
         CHKERRQ(ierr);
  A = *newmat;
  MatSetOption(A,MAT_COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
    /* read in my part of the matrix numerical values  */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    /* padded rows get a unit diagonal */
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
    /* read in other processors (except the last one) and ship out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc (i == size-1 after the loop above) */
    if ( size != 1 ){
      nz   = procsnz[i] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(procsnz);
  } else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2282 
2283 
2284