xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision e30ad881237f693a94b293d84250cdaa32f1c7f8)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.101 1998/01/27 23:43:07 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
18    storage of the matrix.  This is done in a non scable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
37 #define CHUNKSIZE  10
38 
39 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
40 { \
41  \
42     brow = row/bs;  \
43     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
44     rmax = aimax[brow]; nrow = ailen[brow]; \
45       bcol = col/bs; \
46       ridx = row % bs; cidx = col % bs; \
47       low = 0; high = nrow; \
48       while (high-low > 3) { \
49         t = (low+high)/2; \
50         if (rp[t] > bcol) high = t; \
51         else              low  = t; \
52       } \
53       for ( _i=low; _i<high; _i++ ) { \
54         if (rp[_i] > bcol) break; \
55         if (rp[_i] == bcol) { \
56           bap  = ap +  bs2*_i + bs*cidx + ridx; \
57           if (addv == ADD_VALUES) *bap += value;  \
58           else                    *bap  = value;  \
59           goto a_noinsert; \
60         } \
61       } \
62       if (a->nonew == 1) goto a_noinsert; \
63       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
64       if (nrow >= rmax) { \
65         /* there is no extra room in row, therefore enlarge */ \
66         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
67         Scalar *new_a; \
68  \
69         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
70  \
71         /* malloc new storage space */ \
72         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
73         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
74         new_j   = (int *) (new_a + bs2*new_nz); \
75         new_i   = new_j + new_nz; \
76  \
77         /* copy over old data into new slots */ \
78         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
79         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
80         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
81         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
82         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
83                                                            len*sizeof(int)); \
84         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
85         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
86         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
87                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
88         /* free up old matrix storage */ \
89         PetscFree(a->a);  \
90         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
91         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
92         a->singlemalloc = 1; \
93  \
94         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
95         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
96         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
97         a->maxnz += bs2*CHUNKSIZE; \
98         a->reallocs++; \
99         a->nz++; \
100       } \
101       N = nrow++ - 1;  \
102       /* shift up all the later entries in this row */ \
103       for ( ii=N; ii>=_i; ii-- ) { \
104         rp[ii+1] = rp[ii]; \
105         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
106       } \
107       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
108       rp[_i]                      = bcol;  \
109       ap[bs2*_i + bs*cidx + ridx] = value;  \
110       a_noinsert:; \
111     ailen[brow] = nrow; \
112 }
113 
114 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
115 { \
116  \
117     brow = row/bs;  \
118     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
119     rmax = bimax[brow]; nrow = bilen[brow]; \
120       bcol = col/bs; \
121       ridx = row % bs; cidx = col % bs; \
122       low = 0; high = nrow; \
123       while (high-low > 3) { \
124         t = (low+high)/2; \
125         if (rp[t] > bcol) high = t; \
126         else              low  = t; \
127       } \
128       for ( _i=low; _i<high; _i++ ) { \
129         if (rp[_i] > bcol) break; \
130         if (rp[_i] == bcol) { \
131           bap  = ap +  bs2*_i + bs*cidx + ridx; \
132           if (addv == ADD_VALUES) *bap += value;  \
133           else                    *bap  = value;  \
134           goto b_noinsert; \
135         } \
136       } \
137       if (b->nonew == 1) goto b_noinsert; \
138       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
139       if (nrow >= rmax) { \
140         /* there is no extra room in row, therefore enlarge */ \
141         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
142         Scalar *new_a; \
143  \
144         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
145  \
146         /* malloc new storage space */ \
147         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
148         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
149         new_j   = (int *) (new_a + bs2*new_nz); \
150         new_i   = new_j + new_nz; \
151  \
152         /* copy over old data into new slots */ \
153         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
154         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
155         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
156         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
157         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
158                                                            len*sizeof(int)); \
159         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
160         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
161         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
162                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
163         /* free up old matrix storage */ \
164         PetscFree(b->a);  \
165         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
166         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
167         b->singlemalloc = 1; \
168  \
169         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
170         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
171         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
172         b->maxnz += bs2*CHUNKSIZE; \
173         b->reallocs++; \
174         b->nz++; \
175       } \
176       N = nrow++ - 1;  \
177       /* shift up all the later entries in this row */ \
178       for ( ii=N; ii>=_i; ii-- ) { \
179         rp[ii+1] = rp[ii]; \
180         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
181       } \
182       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
183       rp[_i]                      = bcol;  \
184       ap[bs2*_i + bs*cidx + ridx] = value;  \
185       b_noinsert:; \
186     bilen[brow] = nrow; \
187 }
188 
189 #undef __FUNC__
190 #define __FUNC__ "MatSetValues_MPIBAIJ"
191 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
192 {
193   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
194   Scalar      value;
195   int         ierr,i,j,row,col;
196   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
197   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
198   int         cend_orig=baij->cend_bs,bs=baij->bs;
199 
200   /* Some Variables required in the macro */
201   Mat         A = baij->A;
202   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
203   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
204   Scalar      *aa=a->a;
205 
206   Mat         B = baij->B;
207   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
208   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
209   Scalar      *ba=b->a;
210 
211   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
212   int         low,high,t,ridx,cidx,bs2=a->bs2;
213   Scalar      *ap,*bap;
214 
215   PetscFunctionBegin;
216   for ( i=0; i<m; i++ ) {
217 #if defined(USE_PETSC_BOPT_g)
218     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
219     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
220 #endif
221     if (im[i] >= rstart_orig && im[i] < rend_orig) {
222       row = im[i] - rstart_orig;
223       for ( j=0; j<n; j++ ) {
224         if (in[j] >= cstart_orig && in[j] < cend_orig){
225           col = in[j] - cstart_orig;
226           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
227           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
228           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
229         }
230 #if defined(USE_PETSC_BOPT_g)
231         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
232         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
233 #endif
234         else {
235           if (mat->was_assembled) {
236             if (!baij->colmap) {
237               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
238             }
239             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
240             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
241               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
242               col =  in[j];
243               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
244               B = baij->B;
245               b = (Mat_SeqBAIJ *) (B)->data;
246               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
247               ba=b->a;
248             }
249           } else col = in[j];
250           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
251           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
252           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
253         }
254       }
255     } else {
256       if (roworiented && !baij->donotstash) {
257         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
258       } else {
259         if (!baij->donotstash) {
260           row = im[i];
261 	  for ( j=0; j<n; j++ ) {
262 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
263           }
264         }
265       }
266     }
267   }
268   PetscFunctionReturn(0);
269 }
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 /* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
377 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
378 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
379 #undef __FUNC__
380 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
381 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
382 {
383   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
384   int         ierr,i,j,row,col;
385   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
386   int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
387   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
388   int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
389   double      tmp;
390   Scalar      ** HD = baij->hd,value;
391 
392 
393   PetscFunctionBegin;
394 
395   for ( i=0; i<m; i++ ) {
396 #if defined(USE_PETSC_BOPT_g)
397     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
398     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
399 #endif
400       row = im[i];
401     if (row >= rstart_orig && row < rend_orig) {
402       for ( j=0; j<n; j++ ) {
403         col = in[j];
404         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
405         /* Look up into the Hash Table */
406         key = (row/bs)*Nbs+(col/bs)+1;
407         h1  = HASH(size,key,tmp);
408 
409 
410         idx = h1;
411 #if defined(USE_PETSC_BOPT_g)
412         insert_ct++;
413         total_ct++;
414         if (HT[idx] != key) {
415           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
416           if (idx == size) {
417             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
418             if (idx == h1) {
419               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
420             }
421           }
422         }
423 #else
424         if (HT[idx] != key) {
425           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
426           if (idx == size) {
427             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
428             if (idx == h1) {
429               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
430             }
431           }
432         }
433 #endif
434         /* A HASH table entry is found, so insert the values at the correct address */
435         if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
436         else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
437       }
438     } else {
439       if (roworiented && !baij->donotstash) {
440         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
441       } else {
442         if (!baij->donotstash) {
443           row = im[i];
444 	  for ( j=0; j<n; j++ ) {
445 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
446           }
447         }
448       }
449     }
450   }
451 #if defined(USE_PETSC_BOPT_g)
452   baij->ht_total_ct = total_ct;
453   baij->ht_insert_ct = insert_ct;
454 #endif
455   PetscFunctionReturn(0);
456 }
457 
458 #undef __FUNC__
459 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
460 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
461 {
462   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
463   int         ierr,i,j,ii,jj,row,col,k,l;
464   int         roworiented = baij->roworiented,rstart=baij->rstart ;
465   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
466   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
467 
468   int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
469   double      tmp;
470   Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
471 
472   PetscFunctionBegin;
473 
474   if (roworiented) {
475     stepval = (n-1)*bs;
476   } else {
477     stepval = (m-1)*bs;
478   }
479   for ( i=0; i<m; i++ ) {
480 #if defined(USE_PETSC_BOPT_g)
481     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
482     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
483 #endif
484     row   = im[i];
485     v_t   = v + i*bs2;
486     if (row >= rstart && row < rend) {
487       for ( j=0; j<n; j++ ) {
488         col = in[j];
489 
490         /* Look up into the Hash Table */
491         key = row*Nbs+col+1;
492         h1  = HASH(size,key,tmp);
493 
494         idx = h1;
495 #if defined(USE_PETSC_BOPT_g)
496         total_ct++;
497         insert_ct++;
498        if (HT[idx] != key) {
499           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
500           if (idx == size) {
501             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
502             if (idx == h1) {
503               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
504             }
505           }
506         }
507 #else
508         if (HT[idx] != key) {
509           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
510           if (idx == size) {
511             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
512             if (idx == h1) {
513               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
514             }
515           }
516         }
517 #endif
518         baij_a = HD[idx];
519         if (roworiented) {
520           /*value = v + i*(stepval+bs)*bs + j*bs;*/
521           /* value = v + (i*(stepval+bs)+j)*bs; */
522           value = v_t;
523           v_t  += bs;
524           if (addv == ADD_VALUES) {
525             for ( ii=0; ii<bs; ii++,value+=stepval) {
526               for ( jj=ii; jj<bs2; jj+=bs ) {
527                 baij_a[jj]  += *value++;
528               }
529             }
530           } else {
531             for ( ii=0; ii<bs; ii++,value+=stepval) {
532               for ( jj=ii; jj<bs2; jj+=bs ) {
533                 baij_a[jj]  = *value++;
534               }
535             }
536           }
537         } else {
538           value = v + j*(stepval+bs)*bs + i*bs;
539           if (addv == ADD_VALUES) {
540             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
541               for ( jj=0; jj<bs; jj++ ) {
542                 baij_a[jj]  += *value++;
543               }
544             }
545           } else {
546             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
547               for ( jj=0; jj<bs; jj++ ) {
548                 baij_a[jj]  = *value++;
549               }
550             }
551           }
552         }
553       }
554     } else {
555       if (!baij->donotstash) {
556         if (roworiented ) {
557           row   = im[i]*bs;
558           value = v + i*(stepval+bs)*bs;
559           for ( j=0; j<bs; j++,row++ ) {
560             for ( k=0; k<n; k++ ) {
561               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
562                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
563               }
564             }
565           }
566         } else {
567           for ( j=0; j<n; j++ ) {
568             value = v + j*(stepval+bs)*bs + i*bs;
569             col   = in[j]*bs;
570             for ( k=0; k<bs; k++,col++,value+=stepval) {
571               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
572                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
573               }
574             }
575           }
576         }
577       }
578     }
579   }
580 #if defined(USE_PETSC_BOPT_g)
581   baij->ht_total_ct = total_ct;
582   baij->ht_insert_ct = insert_ct;
583 #endif
584   PetscFunctionReturn(0);
585 }
586 #undef __FUNC__
587 #define __FUNC__ "MatGetValues_MPIBAIJ"
588 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
589 {
590   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
591   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
592   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
593 
594   for ( i=0; i<m; i++ ) {
595     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
596     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
597     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
598       row = idxm[i] - bsrstart;
599       for ( j=0; j<n; j++ ) {
600         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
601         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
602         if (idxn[j] >= bscstart && idxn[j] < bscend){
603           col = idxn[j] - bscstart;
604           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
605         } else {
606           if (!baij->colmap) {
607             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
608           }
609           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
610              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
611           else {
612             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
613             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
614           }
615         }
616       }
617     } else {
618       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
619     }
620   }
621  PetscFunctionReturn(0);
622 }
623 
624 #undef __FUNC__
625 #define __FUNC__ "MatNorm_MPIBAIJ"
626 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
627 {
628   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
629   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
630   int        ierr, i,bs2=baij->bs2;
631   double     sum = 0.0;
632   Scalar     *v;
633 
634   PetscFunctionBegin;
635   if (baij->size == 1) {
636     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
637   } else {
638     if (type == NORM_FROBENIUS) {
639       v = amat->a;
640       for (i=0; i<amat->nz*bs2; i++ ) {
641 #if defined(USE_PETSC_COMPLEX)
642         sum += real(conj(*v)*(*v)); v++;
643 #else
644         sum += (*v)*(*v); v++;
645 #endif
646       }
647       v = bmat->a;
648       for (i=0; i<bmat->nz*bs2; i++ ) {
649 #if defined(USE_PETSC_COMPLEX)
650         sum += real(conj(*v)*(*v)); v++;
651 #else
652         sum += (*v)*(*v); v++;
653 #endif
654       }
655       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
656       *norm = sqrt(*norm);
657     } else {
658       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
659     }
660   }
661   PetscFunctionReturn(0);
662 }
663 
664 #undef __FUNC__
665 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
666 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
667 {
668   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
669   MPI_Comm    comm = mat->comm;
670   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
671   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
672   MPI_Request *send_waits,*recv_waits;
673   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
674   InsertMode  addv;
675   Scalar      *rvalues,*svalues;
676 
677   PetscFunctionBegin;
678   /* make sure all processors are either in INSERTMODE or ADDMODE */
679   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
680   if (addv == (ADD_VALUES|INSERT_VALUES)) {
681     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
682   }
683   mat->insertmode = addv; /* in case this processor had no cache */
684 
685   /*  first count number of contributors to each processor */
686   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
687   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
688   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
689   for ( i=0; i<baij->stash.n; i++ ) {
690     idx = baij->stash.idx[i];
691     for ( j=0; j<size; j++ ) {
692       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
693         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
694       }
695     }
696   }
697   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
698 
699   /* inform other processors of number of messages and max length*/
700   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
701   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
702   nreceives = work[rank];
703   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
704   nmax      = work[rank];
705   PetscFree(work);
706 
707   /* post receives:
708        1) each message will consist of ordered pairs
709      (global index,value) we store the global index as a double
710      to simplify the message passing.
711        2) since we don't know how long each individual message is we
712      allocate the largest needed buffer for each receive. Potentially
713      this is a lot of wasted space.
714 
715 
716        This could be done better.
717   */
718   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
719   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
720   for ( i=0; i<nreceives; i++ ) {
721     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
722   }
723 
724   /* do sends:
725       1) starts[i] gives the starting index in svalues for stuff going to
726          the ith processor
727   */
728   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
729   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
730   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
731   starts[0] = 0;
732   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
733   for ( i=0; i<baij->stash.n; i++ ) {
734     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
735     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
736     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
737   }
738   PetscFree(owner);
739   starts[0] = 0;
740   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
741   count = 0;
742   for ( i=0; i<size; i++ ) {
743     if (procs[i]) {
744       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
745     }
746   }
747   PetscFree(starts); PetscFree(nprocs);
748 
749   /* Free cache space */
750   PLogInfo(0,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
751   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
752 
753   baij->svalues    = svalues;    baij->rvalues    = rvalues;
754   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
755   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
756   baij->rmax       = nmax;
757 
758   PetscFunctionReturn(0);
759 }
760 
761 /*
762   Creates the hash table, and sets the table
763   This table is created only once.
764   If new entried need to be added to the matrix
765   then the hash table has to be destroyed and
766   recreated.
767 */
768 #undef __FUNC__
769 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
770 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
771 {
772   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
773   Mat         A = baij->A, B=baij->B;
774   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
775   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
776   int         size,ct=0,max=0,bs2=baij->bs2,rstart=baij->rstart;
777   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
778   int         *HT,key;
779   Scalar      **HD;
780   double      tmp;
781 
782   PetscFunctionBegin;
783   baij->ht_size=(int)(factor*nz);
784   size = baij->ht_size;
785 
786   if (baij->ht) {
787     PetscFunctionReturn(0);
788   }
789 
790   /* Allocate Memory for Hash Table */
791   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
792   baij->ht = (int*)(baij->hd + size);
793   HD = baij->hd;
794   HT = baij->ht;
795 
796 
797   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
798 
799 
800   /* Loop Over A */
801   for ( i=0; i<a->mbs; i++ ) {
802     for ( j=ai[i]; j<ai[i+1]; j++ ) {
803       row = i+rstart;
804       col = aj[j]+cstart;
805 
806       key = row*Nbs + col + 1;
807       h1  = HASH(size,key,tmp);
808       for ( k=0; k<size; k++ ){
809         if (HT[(h1+k)%size] == 0.0) {
810           HT[(h1+k)%size] = key;
811           HD[(h1+k)%size] = a->a + j*bs2;
812           break;
813 #if defined(USE_PETSC_BOPT_g)
814         } else {
815           ct++;
816 #endif
817         }
818       }
819 #if defined(USE_PETSC_BOPT_g)
820       if (k> max) max = k;
821 #endif
822     }
823   }
824   /* Loop Over B */
825   for ( i=0; i<b->mbs; i++ ) {
826     for ( j=bi[i]; j<bi[i+1]; j++ ) {
827       row = i+rstart;
828       col = garray[bj[j]];
829       key = row*Nbs + col + 1;
830       h1  = HASH(size,key,tmp);
831       for ( k=0; k<size; k++ ){
832         if (HT[(h1+k)%size] == 0.0) {
833           HT[(h1+k)%size] = key;
834           HD[(h1+k)%size] = b->a + j*bs2;
835           break;
836 #if defined(USE_PETSC_BOPT_g)
837         } else {
838           ct++;
839 #endif
840         }
841       }
842 #if defined(USE_PETSC_BOPT_g)
843       if (k> max) max = k;
844 #endif
845     }
846   }
847 
848   /* Print Summary */
849 #if defined(USE_PETSC_BOPT_g)
850   for ( i=0,j=0; i<size; i++)
851     if (HT[i]) {j++;}
852   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
853            (j== 0)? 0.0:((double)(ct+j))/j,max);
854 #endif
855   PetscFunctionReturn(0);
856 }
857 
#undef __FUNC__
#define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   MatAssemblyEnd_MPIBAIJ - Completes the communication started in
   MatAssemblyBegin: receives the stashed off-processor (row,col,value)
   triples, inserts each into the diagonal block (A) or off-diagonal
   block (B), waits on the outstanding sends, then assembles the two
   sequential submatrices.  On final assembly it may also install the
   hash-table based MatSetValues routines (option -use_hash).
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled,flg;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space; each entry is a
       (row, col, value) triple packed as three Scalars */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      /* row/col arrive as global point-wise indices encoded in Scalars */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* column lies in the locally-owned range: goes into diagonal block A */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        /* off-diagonal block B: translate the global column to B's local
           (compressed) column numbering via colmap */
        if (mat->was_assembled) {
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
          /* colmap stores i*bs+1, so 0 means "column not present" */
          col = (baij->colmap[col/bs]) - 1 + col%bs;
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            /* new nonzero location: revert to global numbering so it
               can be inserted, and mark the matrix disassembled */
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble. */
  ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  /* the scatter for matrix-vector products is built on first final assembly */
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

#if defined(USE_PETSC_BOPT_g)
  /* report and reset the hash-table probe statistics (debug builds only) */
  if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
             ((double)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  /* optionally build the hash table and switch to the fast MatSetValues paths */
  ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
  if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    double fact = 1.39;
    ierr = OptionsGetDouble(PETSC_NULL,"-use_hash",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Hash table Factor used %5.2f\n",fact);
    ierr = MatCreateHashTable_MPIBAIJ_Private(mat,fact); CHKERRQ(ierr);
    mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }

  /* MatGetRow work space is invalid after an assembly; free it */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
948 
949 #undef __FUNC__
950 #define __FUNC__ "MatView_MPIBAIJ_Binary"
951 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
952 {
953   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
954   int          ierr;
955 
956   PetscFunctionBegin;
957   if (baij->size == 1) {
958     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
959   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
960   PetscFunctionReturn(0);
961 }
962 
#undef __FUNC__
#define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
/*
   MatView_MPIBAIJ_ASCIIorDraworMatlab - Viewer backend for the ASCII, Draw
   and Matlab viewers.  For the info formats it prints per-processor
   statistics; for ASCII_FILE_VIEWER each processor prints its own part in
   sequence; otherwise the whole matrix is gathered onto processor 0 and
   viewed there as a single matrix.
*/
static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  int          ierr, format,rank,bs = baij->bs;
  FILE         *fd;
  ViewerType   vtype;

  PetscFunctionBegin;
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerGetFormat(viewer,&format);
    if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
      /* print local nonzero counts and memory use, one processor at a time */
      MatInfo info;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);
      PetscSequentialPhaseBegin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
              baij->bs,(int)info.memory);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
      fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
      fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      fflush(fd);
      PetscSequentialPhaseEnd(mat->comm,1);
      ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == VIEWER_FORMAT_ASCII_INFO) {
      PetscPrintf(mat->comm,"  block size is %d\n",bs);
      PetscFunctionReturn(0);
    }
  }

  if (vtype == DRAW_VIEWER) {
    /* nothing to do when the Draw object is the null device */
    Draw       draw;
    PetscTruth isnull;
    ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
    ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (vtype == ASCII_FILE_VIEWER) {
    /* each processor prints its ownership ranges and its A and B parts */
    ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
            baij->cstart*bs,baij->cend*bs);
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  } else {
    int size = baij->size;
    rank = baij->rank;
    if (size == 1) {
      ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    } else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqBAIJ *Aloc;
      int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
      int         mbs=baij->mbs;
      Scalar      *a;

      /* processor 0 owns every row of the gathered copy; the rest own none */
      if (!rank) {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      } else {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_SeqBAIJ*) baij->A->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart;
      rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);

      for ( i=0; i<mbs; i++ ) {
        /* rvals holds the bs consecutive global point-wise rows of block row i */
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = (baij->cstart+aj[j])*bs;
          for (k=0; k<bs; k++ ) {
            /* insert the block one bs-long column at a time */
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      /* copy over the B part */
      Aloc = (Mat_SeqBAIJ*) baij->B->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart*bs;
      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          /* garray maps B's local block columns back to global block columns */
          col = baij->garray[aj[j]]*bs;
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      PetscFree(rvals);
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      /*
         Everyone has to call to draw the matrix since the graphics waits are
         synchronized across all processors that share the Draw object
      */
      if (!rank || vtype == DRAW_VIEWER) {
        ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1083 
1084 
1085 
1086 #undef __FUNC__
1087 #define __FUNC__ "MatView_MPIBAIJ"
1088 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
1089 {
1090   Mat         mat = (Mat) obj;
1091   int         ierr;
1092   ViewerType  vtype;
1093 
1094   PetscFunctionBegin;
1095   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1096   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1097       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1098     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1099   } else if (vtype == BINARY_FILE_VIEWER) {
1100     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1101   }
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 #undef __FUNC__
1106 #define __FUNC__ "MatDestroy_MPIBAIJ"
1107 int MatDestroy_MPIBAIJ(PetscObject obj)
1108 {
1109   Mat         mat = (Mat) obj;
1110   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1111   int         ierr;
1112 
1113   PetscFunctionBegin;
1114 #if defined(USE_PETSC_LOG)
1115   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
1116 #endif
1117 
1118   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1119   PetscFree(baij->rowners);
1120   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1121   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1122   if (baij->colmap) PetscFree(baij->colmap);
1123   if (baij->garray) PetscFree(baij->garray);
1124   if (baij->lvec)   VecDestroy(baij->lvec);
1125   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1126   if (baij->rowvalues) PetscFree(baij->rowvalues);
1127   if (baij->barray) PetscFree(baij->barray);
1128   if (baij->hd) PetscFree(baij->hd);
1129   PetscFree(baij);
1130   PLogObjectDestroy(mat);
1131   PetscHeaderDestroy(mat);
1132   PetscFunctionReturn(0);
1133 }
1134 
1135 #undef __FUNC__
1136 #define __FUNC__ "MatMult_MPIBAIJ"
1137 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1138 {
1139   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1140   int         ierr, nt;
1141 
1142   PetscFunctionBegin;
1143   VecGetLocalSize_Fast(xx,nt);
1144   if (nt != a->n) {
1145     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1146   }
1147   VecGetLocalSize_Fast(yy,nt);
1148   if (nt != a->m) {
1149     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1150   }
1151   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1152   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1153   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1154   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1155   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1156   PetscFunctionReturn(0);
1157 }
1158 
#undef __FUNC__
#define __FUNC__ "MatMultAdd_MPIBAIJ"
/*
   MatMultAdd_MPIBAIJ - Computes zz = yy + A*xx, overlapping the scatter of
   the ghost values of xx with the multiply by the local diagonal block.
*/
int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int        ierr;

  PetscFunctionBegin;
  /* start communicating the off-processor entries of xx into a->lvec */
  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  /* zz = yy + (diagonal block)*xx using purely local data */
  ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* finish the scatter, then add the off-diagonal contribution */
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1173 
#undef __FUNC__
#define __FUNC__ "MatMultTrans_MPIBAIJ"
/*
   MatMultTrans_MPIBAIJ - Computes yy = A'*xx.  The off-diagonal transpose
   product is computed first and scattered (reverse, adding) back to the
   owning processors while the local transpose product proceeds.
*/
int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in yy until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1194 
#undef __FUNC__
#define __FUNC__ "MatMultTransAdd_MPIBAIJ"
/*
   MatMultTransAdd_MPIBAIJ - Computes zz = yy + A'*xx.  Same communication
   pattern as MatMultTrans_MPIBAIJ, with the result accumulated into zz.
*/
int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1215 
1216 /*
1217   This only works correctly for square matrices where the subblock A->A is the
1218    diagonal block
1219 */
1220 #undef __FUNC__
1221 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1222 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1223 {
1224   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1225   int         ierr;
1226 
1227   PetscFunctionBegin;
1228   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1229   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1230   PetscFunctionReturn(0);
1231 }
1232 
1233 #undef __FUNC__
1234 #define __FUNC__ "MatScale_MPIBAIJ"
1235 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1236 {
1237   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1238   int         ierr;
1239 
1240   PetscFunctionBegin;
1241   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1242   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNC__
1247 #define __FUNC__ "MatGetSize_MPIBAIJ"
1248 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1249 {
1250   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1251 
1252   PetscFunctionBegin;
1253   if (m) *m = mat->M;
1254   if (n) *n = mat->N;
1255   PetscFunctionReturn(0);
1256 }
1257 
1258 #undef __FUNC__
1259 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1260 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1261 {
1262   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1263 
1264   PetscFunctionBegin;
1265   *m = mat->m; *n = mat->N;
1266   PetscFunctionReturn(0);
1267 }
1268 
1269 #undef __FUNC__
1270 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1271 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1272 {
1273   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1274 
1275   PetscFunctionBegin;
1276   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1281 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1282 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   MatGetRow_MPIBAIJ - Returns the column indices and values of one locally
   owned point-wise row, merged from the diagonal (A) and off-diagonal (B)
   parts in increasing global column order.  MatRestoreRow_MPIBAIJ must be
   called when the caller is done with the row.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be "checked out" at a time */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      /* total number of block entries (A plus B) in block row i */
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* single buffer: values first, then (aliased after them) the indices */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* request from A and B only the pieces the caller asked for */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        /* B entries whose global block column precedes the diagonal range
           come first; imark is the count of such entries */
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        /* then all A entries, then the trailing B entries */
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark already computed in the values pass above */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          /* v was NULL, so compute imark here */
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        /* A columns are local; shift by cstart*bs to get global numbering */
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1364 
1365 #undef __FUNC__
1366 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1367 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1368 {
1369   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1370 
1371   PetscFunctionBegin;
1372   if (baij->getrowactive == PETSC_FALSE) {
1373     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1374   }
1375   baij->getrowactive = PETSC_FALSE;
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 #undef __FUNC__
1380 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1381 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1382 {
1383   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1384 
1385   PetscFunctionBegin;
1386   *bs = baij->bs;
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 #undef __FUNC__
1391 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1392 int MatZeroEntries_MPIBAIJ(Mat A)
1393 {
1394   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1395   int         ierr;
1396 
1397   PetscFunctionBegin;
1398   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1399   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 #undef __FUNC__
1404 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1405 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1406 {
1407   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1408   Mat         A = a->A, B = a->B;
1409   int         ierr;
1410   double      isend[5], irecv[5];
1411 
1412   PetscFunctionBegin;
1413   info->block_size     = (double)a->bs;
1414   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1415   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1416   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1417   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1418   if (flag == MAT_LOCAL) {
1419     info->nz_used      = isend[0];
1420     info->nz_allocated = isend[1];
1421     info->nz_unneeded  = isend[2];
1422     info->memory       = isend[3];
1423     info->mallocs      = isend[4];
1424   } else if (flag == MAT_GLOBAL_MAX) {
1425     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1426     info->nz_used      = irecv[0];
1427     info->nz_allocated = irecv[1];
1428     info->nz_unneeded  = irecv[2];
1429     info->memory       = irecv[3];
1430     info->mallocs      = irecv[4];
1431   } else if (flag == MAT_GLOBAL_SUM) {
1432     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1433     info->nz_used      = irecv[0];
1434     info->nz_allocated = irecv[1];
1435     info->nz_unneeded  = irecv[2];
1436     info->memory       = irecv[3];
1437     info->mallocs      = irecv[4];
1438   }
1439   info->rows_global       = (double)a->M;
1440   info->columns_global    = (double)a->N;
1441   info->rows_local        = (double)a->m;
1442   info->columns_local     = (double)a->N;
1443   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1444   info->fill_ratio_needed = 0;
1445   info->factor_mallocs    = 0;
1446   PetscFunctionReturn(0);
1447 }
1448 
1449 #undef __FUNC__
1450 #define __FUNC__ "MatSetOption_MPIBAIJ"
1451 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1452 {
1453   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1454 
1455   PetscFunctionBegin;
1456   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1457       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1458       op == MAT_COLUMNS_UNSORTED ||
1459       op == MAT_COLUMNS_SORTED ||
1460       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1461       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1462         MatSetOption(a->A,op);
1463         MatSetOption(a->B,op);
1464   } else if (op == MAT_ROW_ORIENTED) {
1465         a->roworiented = 1;
1466         MatSetOption(a->A,op);
1467         MatSetOption(a->B,op);
1468   } else if (op == MAT_ROWS_SORTED ||
1469              op == MAT_ROWS_UNSORTED ||
1470              op == MAT_SYMMETRIC ||
1471              op == MAT_STRUCTURALLY_SYMMETRIC ||
1472              op == MAT_YES_NEW_DIAGONALS)
1473     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1474   else if (op == MAT_COLUMN_ORIENTED) {
1475     a->roworiented = 0;
1476     MatSetOption(a->A,op);
1477     MatSetOption(a->B,op);
1478   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1479     a->donotstash = 1;
1480   } else if (op == MAT_NO_NEW_DIAGONALS) {
1481     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1482   } else {
1483     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1484   }
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 #undef __FUNC__
1489 #define __FUNC__ "MatTranspose_MPIBAIJ("
1490 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1491 {
1492   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1493   Mat_SeqBAIJ *Aloc;
1494   Mat        B;
1495   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1496   int        bs=baij->bs,mbs=baij->mbs;
1497   Scalar     *a;
1498 
1499   PetscFunctionBegin;
1500   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1501   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1502   CHKERRQ(ierr);
1503 
1504   /* copy over the A part */
1505   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1506   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1507   row = baij->rstart;
1508   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1509 
1510   for ( i=0; i<mbs; i++ ) {
1511     rvals[0] = bs*(baij->rstart + i);
1512     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1513     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1514       col = (baij->cstart+aj[j])*bs;
1515       for (k=0; k<bs; k++ ) {
1516         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1517         col++; a += bs;
1518       }
1519     }
1520   }
1521   /* copy over the B part */
1522   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1523   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1524   row = baij->rstart*bs;
1525   for ( i=0; i<mbs; i++ ) {
1526     rvals[0] = bs*(baij->rstart + i);
1527     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1528     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1529       col = baij->garray[aj[j]]*bs;
1530       for (k=0; k<bs; k++ ) {
1531         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1532         col++; a += bs;
1533       }
1534     }
1535   }
1536   PetscFree(rvals);
1537   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1538   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1539 
1540   if (matout != PETSC_NULL) {
1541     *matout = B;
1542   } else {
1543     /* This isn't really an in-place transpose .... but free data structures from baij */
1544     PetscFree(baij->rowners);
1545     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1546     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1547     if (baij->colmap) PetscFree(baij->colmap);
1548     if (baij->garray) PetscFree(baij->garray);
1549     if (baij->lvec) VecDestroy(baij->lvec);
1550     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1551     PetscFree(baij);
1552     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1553     PetscHeaderDestroy(B);
1554   }
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 #undef __FUNC__
1559 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1560 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1561 {
1562   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1563   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1564   int ierr,s1,s2,s3;
1565 
1566   PetscFunctionBegin;
1567   if (ll)  {
1568     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1569     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1570     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1571     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1572     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1573   }
1574   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 #undef __FUNC__
1579 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1580 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1581 {
1582   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1583   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1584   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1585   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1586   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1587   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1588   MPI_Comm       comm = A->comm;
1589   MPI_Request    *send_waits,*recv_waits;
1590   MPI_Status     recv_status,*send_status;
1591   IS             istmp;
1592 
1593   PetscFunctionBegin;
1594   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1595   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1596 
1597   /*  first count number of contributors to each processor */
1598   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1599   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1600   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1601   for ( i=0; i<N; i++ ) {
1602     idx = rows[i];
1603     found = 0;
1604     for ( j=0; j<size; j++ ) {
1605       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1606         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1607       }
1608     }
1609     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1610   }
1611   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1612 
1613   /* inform other processors of number of messages and max length*/
1614   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1615   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1616   nrecvs = work[rank];
1617   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1618   nmax   = work[rank];
1619   PetscFree(work);
1620 
1621   /* post receives:   */
1622   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1623   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1624   for ( i=0; i<nrecvs; i++ ) {
1625     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1626   }
1627 
1628   /* do sends:
1629      1) starts[i] gives the starting index in svalues for stuff going to
1630      the ith processor
1631   */
1632   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1633   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1634   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1635   starts[0] = 0;
1636   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1637   for ( i=0; i<N; i++ ) {
1638     svalues[starts[owner[i]]++] = rows[i];
1639   }
1640   ISRestoreIndices(is,&rows);
1641 
1642   starts[0] = 0;
1643   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1644   count = 0;
1645   for ( i=0; i<size; i++ ) {
1646     if (procs[i]) {
1647       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1648     }
1649   }
1650   PetscFree(starts);
1651 
1652   base = owners[rank]*bs;
1653 
1654   /*  wait on receives */
1655   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1656   source = lens + nrecvs;
1657   count  = nrecvs; slen = 0;
1658   while (count) {
1659     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1660     /* unpack receives into our local space */
1661     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1662     source[imdex]  = recv_status.MPI_SOURCE;
1663     lens[imdex]  = n;
1664     slen += n;
1665     count--;
1666   }
1667   PetscFree(recv_waits);
1668 
1669   /* move the data into the send scatter */
1670   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1671   count = 0;
1672   for ( i=0; i<nrecvs; i++ ) {
1673     values = rvalues + i*nmax;
1674     for ( j=0; j<lens[i]; j++ ) {
1675       lrows[count++] = values[j] - base;
1676     }
1677   }
1678   PetscFree(rvalues); PetscFree(lens);
1679   PetscFree(owner); PetscFree(nprocs);
1680 
1681   /* actually zap the local rows */
1682   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1683   PLogObjectParent(A,istmp);
1684 
1685   ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1686   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1687   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1688 
1689   if (diag) {
1690     for ( i = 0; i < slen; i++ ) {
1691       row = lrows[i] + rstart_bs;
1692       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1693     }
1694     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1695     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1696   }
1697   PetscFree(lrows);
1698 
1699   /* wait on sends */
1700   if (nsends) {
1701     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1702     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1703     PetscFree(send_status);
1704   }
1705   PetscFree(send_waits); PetscFree(svalues);
1706 
1707   PetscFunctionReturn(0);
1708 }
1709 extern int MatPrintHelp_SeqBAIJ(Mat);
1710 #undef __FUNC__
1711 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1712 int MatPrintHelp_MPIBAIJ(Mat A)
1713 {
1714   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1715   int         ierr;
1716 
1717   PetscFunctionBegin;
1718   if (!a->rank) {
1719     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 #undef __FUNC__
1725 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1726 int MatSetUnfactored_MPIBAIJ(Mat A)
1727 {
1728   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1729   int         ierr;
1730 
1731   PetscFunctionBegin;
1732   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1737 
1738 /* -------------------------------------------------------------------*/
/* Function table for the MPIBAIJ matrix type.  The initializer is
   positional: the slot order is dictated by struct _MatOps (declared in
   the matrix implementation headers, not visible here), so entries must
   not be reordered.  A 0 marks an operation this type does not support. */
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
  0,0,0,0,
  0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
  0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,MatConvertSameType_MPIBAIJ,0,0,
  0,0,0,MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
  0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1754 
1755 
1756 #undef __FUNC__
1757 #define __FUNC__ "MatCreateMPIBAIJ"
1758 /*@C
1759    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1760    (block compressed row).  For good matrix assembly performance
1761    the user should preallocate the matrix storage by setting the parameters
1762    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1763    performance can be increased by more than a factor of 50.
1764 
1765    Input Parameters:
1766 .  comm - MPI communicator
.  bs   - size of block
1768 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1769            This value should be the same as the local size used in creating the
1770            y vector for the matrix-vector product y = Ax.
1771 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1772            This value should be the same as the local size used in creating the
1773            x vector for the matrix-vector product y = Ax.
1774 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1775 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1776 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1777            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local submatrix (possibly different for each block
           row) or PETSC_NULL.  You must leave room for the diagonal entry even if
           it is zero.
1782 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1783            submatrix (same for all local rows).
.  o_nnz - array containing the number of block nonzeros in the various block rows of the
1785            off-diagonal portion of the local submatrix (possibly different for
1786            each block row) or PETSC_NULL.
1787 
1788    Output Parameter:
1789 .  A - the matrix
1790 
1791    Notes:
1792    The user MUST specify either the local or global matrix dimensions
1793    (possibly both).
1794 
1795    Storage Information:
1796    For a square global matrix we define each processor's diagonal portion
1797    to be its local rows and the corresponding columns (a square submatrix);
1798    each processor's off-diagonal portion encompasses the remainder of the
1799    local matrix (a rectangular submatrix).
1800 
1801    The user can specify preallocated storage for the diagonal part of
1802    the local submatrix with either d_nz or d_nnz (not both).  Set
1803    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1804    memory allocation.  Likewise, specify preallocated storage for the
1805    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1806 
1807    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1808    the figure below we depict these three local rows and all columns (0-11).
1809 
1810 $          0 1 2 3 4 5 6 7 8 9 10 11
1811 $         -------------------
1812 $  row 3  |  o o o d d d o o o o o o
1813 $  row 4  |  o o o d d d o o o o o o
1814 $  row 5  |  o o o d d d o o o o o o
1815 $         -------------------
1816 $
1817 
1818    Thus, any entries in the d locations are stored in the d (diagonal)
1819    submatrix, and any entries in the o locations are stored in the
1820    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1821    stored simply in the MATSEQBAIJ format for compressed row storage.
1822 
1823    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1824    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1825    In general, for PDE problems in which most nonzeros are near the diagonal,
1826    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1827    or you will get TERRIBLE performance; see the users' manual chapter on
1828    matrices.
1829 
1830 .keywords: matrix, block, aij, compressed row, sparse, parallel
1831 
1832 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1833 @*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  MPI_Comm_size(comm,&size);
  /* On one process there is no off-diagonal part: create a sequential
     BAIJ matrix instead (o_nz/o_nnz are ignored on this path). */
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));

  B->destroy    = MatDestroy_MPIBAIJ;
  B->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* Sanity checks: per-row preallocation requires a known local row count,
     and at least one of each local/global size pair must be supplied. */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* If both local and global sizes were supplied, the local ones win:
     reset the global size so it is recomputed from the local sum below. */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* Compute any missing global size as the sum of the local sizes. */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* Compute any missing local size by dividing the global block rows/cols
     as evenly as possible; the first (Mbs % size) processes get one extra
     block row (likewise for columns). */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  /* cache the sizes both on the generic Mat and on the implementation data;
     lower case = local, upper case = global, *bs suffix = in block units */
  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships: rowners[i] is the first
     block row owned by process i (prefix sums of the gathered local counts);
     cowners likewise for block columns; the *_bs values are the same bounds
     converted to point (unblocked) indices */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  b->rstart_bs = b->rstart * bs;
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* create the local pieces: A is the diagonal block (m x n); B holds the
     off-diagonal part and is created with the full global column width
     (m x N) -- presumably compacted during assembly, see
     MatSetUpMultiply_MPIBAIJ; TODO confirm */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  *A = B;
  PetscFunctionReturn(0);
}
1976 
1977 #undef __FUNC__
1978 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1979 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1980 {
1981   Mat         mat;
1982   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1983   int         ierr, len=0, flg;
1984 
1985   PetscFunctionBegin;
1986   *newmat       = 0;
1987   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1988   PLogObjectCreate(mat);
1989   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1990   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1991   mat->destroy    = MatDestroy_MPIBAIJ;
1992   mat->view       = MatView_MPIBAIJ;
1993   mat->factor     = matin->factor;
1994   mat->assembled  = PETSC_TRUE;
1995 
1996   a->m = mat->m   = oldmat->m;
1997   a->n = mat->n   = oldmat->n;
1998   a->M = mat->M   = oldmat->M;
1999   a->N = mat->N   = oldmat->N;
2000 
2001   a->bs  = oldmat->bs;
2002   a->bs2 = oldmat->bs2;
2003   a->mbs = oldmat->mbs;
2004   a->nbs = oldmat->nbs;
2005   a->Mbs = oldmat->Mbs;
2006   a->Nbs = oldmat->Nbs;
2007 
2008   a->rstart       = oldmat->rstart;
2009   a->rend         = oldmat->rend;
2010   a->cstart       = oldmat->cstart;
2011   a->cend         = oldmat->cend;
2012   a->size         = oldmat->size;
2013   a->rank         = oldmat->rank;
2014   mat->insertmode = NOT_SET_VALUES;
2015   a->rowvalues    = 0;
2016   a->getrowactive = PETSC_FALSE;
2017   a->barray       = 0;
2018 
2019   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2020   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2021   a->cowners = a->rowners + a->size + 2;
2022   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2023   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2024   if (oldmat->colmap) {
2025     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2026     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2027     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2028   } else a->colmap = 0;
2029   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2030     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2031     PLogObjectMemory(mat,len*sizeof(int));
2032     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2033   } else a->garray = 0;
2034 
2035   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2036   PLogObjectParent(mat,a->lvec);
2037   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2038   PLogObjectParent(mat,a->Mvctx);
2039   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
2040   PLogObjectParent(mat,a->A);
2041   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
2042   PLogObjectParent(mat,a->B);
2043   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2044   if (flg) {
2045     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2046   }
2047   *newmat = mat;
2048   PetscFunctionReturn(0);
2049 }
2050 
2051 #include "sys.h"
2052 
2053 #undef __FUNC__
2054 #define __FUNC__ "MatLoad_MPIBAIJ"
/*
    MatLoad_MPIBAIJ - Loads a matrix in PETSc binary format from a viewer
  and distributes it as an MPIBAIJ matrix.  Process 0 reads the file and
  ships each process its rows; the matrix is padded with unit-diagonal
  rows so the global size is divisible by the block size.
  (The 'type' argument is not used by this routine.)
*/
int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  /* the block size is a load-time option, default 1 */
  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;   /* NOTE(review): bs2 appears unused below */

  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  /* process 0 reads and validates the binary header */
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize; each padded row receives a single
     unit diagonal entry (set further below)
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);
  if (extra_rows == bs) extra_rows = 0;
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows: block rows are divided as evenly as
     possible, the first (Mbs % size) processes getting one extra; the
     browners[] table is the same partition in point (unblocked) rows */
  mbs = Mbs/size + ((Mbs % size) > rank);
  m   = mbs * bs;
  rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
  browners = rowners + size + 1;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
  for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors (point rows, not block rows) */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;   /* padded rows hold one entry each */
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices; with a single process
       the padded entries are not in the file, so read fewer and append
       the diagonal indices of the padded rows */
    nz = procsnz[0];
    ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;   /* diagonal indices of padded rows */
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries;
     mask[] flags block columns already counted for the current block row,
     masked1[]/masked2[] remember which flags to clear afterwards so the
     mask need not be rezeroed wholesale */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,3*Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        tmp = mycols[nzcount++]/bs;
        if (!mask[tmp]) {
          mask[tmp] = 1;
          /* block column tested against this process's block-row range;
             valid because the matrix is square (M == N checked above) */
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix with exact per-block-row preallocation */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
         CHKERRQ(ierr);
  A = *newmat;
  /* NOTE(review): assumes the file stores column indices in ascending
     order within each row -- confirm against the binary format */
  MatSetOption(A,MAT_COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
    /* read in my part of the matrix numerical values  */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;   /* padded values are not in the file */
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }   /* unit diagonal for padding */

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
    /* read in other processors (except the last one) and ship out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc; note i == size-1 here on exit from the loop above */
    if ( size != 1 ){
      nz   = procsnz[i] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;   /* unit diagonal for padding */
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(procsnz);
  } else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2281 
2282 
2283