xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 44e6884e7f997852a6037c5fe784cff0d02aba6e)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.94 1998/01/12 17:10:22 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
18    storage of the matrix.  This is done in a non scable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
/* Number of extra block slots appended to a row's storage each time an
   insertion finds the row full and must reallocate. */
#define CHUNKSIZE  10

/*
   MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) - inserts or adds one
   scalar at point indices (row,col) into the diagonal block matrix a
   (baij->A).  Written as a macro (not a function) so the hot inner loop of
   MatSetValues_MPIBAIJ avoids call overhead; it reads and updates the
   caller's locals (rp,ap,bap,brow,bcol,ridx,cidx,low,high,t,_i,ii,N,nrow,
   rmax and the cached aa/ai/aj pointers).  Flow: binary-search-assisted
   scan of the block row for block column bcol; if found, set/add in place;
   otherwise (subject to a->nonew) grow the row by CHUNKSIZE blocks if full,
   shift later entries up, and insert the new block.  On reallocation the
   caller's aa/ai/aj aliases are re-pointed at the new storage.
   NOTE(review): a->nz is incremented only on the reallocation path, not on
   every new-block insertion — confirm against the SeqBAIJ bookkeeping.
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
113 
/*
   MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) - identical in
   structure to MatSetValues_SeqBAIJ_A_Private above, but operates on the
   off-diagonal block matrix b (baij->B) and its cached ba/bi/bj/bimax/bilen
   pointers.  Like its A counterpart it relies on the caller's local
   variables and re-points the cached aliases after a reallocation.
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
188 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   MatSetValues_MPIBAIJ - inserts or adds individual (point-indexed)
   values into the parallel block matrix.  im[]/in[] are global point
   row/column indices; v is m*n values, row- or column-oriented per
   baij->roworiented.  Locally owned entries go straight into baij->A
   (diagonal block) or baij->B (off-diagonal) via the insertion macros
   above; off-process rows are stashed for communication during assembly.
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      /* row owned by this process */
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column falls in the diagonal block A */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* column falls in the off-diagonal block B */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
            /* colmap holds local offset + 1; 0 (=> col < 0) means block absent */
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              /* new off-diagonal block after assembly: disassemble and retry
                 with the global column index */
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* off-process row: stash entries for assembly-time communication */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
	  for ( j=0; j<n; j++ ) {
	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1)))
377 
378 #undef __FUNC__
379 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
380 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
381 {
382   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
383   int         ierr,i,j,row,col;
384   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
385   int         rend_orig=baij->rend_bs;
386 
387   int         h1,key,size=baij->ht_size,k,bs=baij->bs;
388   double      * HT  = baij->ht;
389   Scalar      ** HD = (Scalar **)(HT + size),value;
390 
391 
392   PetscFunctionBegin;
393 
394   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
395 
396   for ( i=0; i<m; i++ ) {
397 #if defined(USE_PETSC_BOPT_g)
398     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
399     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
400 #endif
401     if (im[i] >= rstart_orig && im[i] < rend_orig) {
402       row = im[i];
403       for ( j=0; j<n; j++ ) {
404         col = in[j];
405         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
406         /* Look up into the Hash Table */
407         key = (row/bs)*baij->Nbs+(col/bs)+1;
408         h1  = HASH1(size,key);
409 
410         for ( k=0; k<size; k++ ){
411           if (HT[(h1+k)%size] == key) {
412             if (addv == ADD_VALUES) *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs) += value;
413             else                    *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs)  = value;
414             break;
415           }
416         }
417         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
418       }
419     } else {
420       if (roworiented && !baij->donotstash) {
421         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
422       } else {
423         if (!baij->donotstash) {
424           row = im[i];
425 	  for ( j=0; j<n; j++ ) {
426 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
427           }
428         }
429       }
430     }
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNC__
436 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
437 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
438 {
439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
440   int         ierr,i,j,ii,jj,row,col,k,l;
441   int         roworiented = baij->roworiented,rstart=baij->rstart ;
442   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
443 
444   int         h1,key,size=baij->ht_size;
445   double      * HT  = baij->ht;
446   Scalar      ** HD = (Scalar **)(HT + size),*value,*baij_a;
447 
448 
449   if (roworiented) {
450     stepval = (n-1)*bs;
451   } else {
452     stepval = (m-1)*bs;
453   }
454   for ( i=0; i<m; i++ ) {
455 #if defined(USE_PETSC_BOPT_g)
456     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
457     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
458 #endif
459     if (im[i] >= rstart && im[i] < rend) {
460       row = im[i];
461       for ( j=0; j<n; j++ ) {
462         col = in[j];
463 
464         /* Look up into the Hash Table */
465         key = row*baij->Nbs+col+1;
466         h1  = HASH1(size,key);
467 
468         for ( k=0; k<size; k++ ){
469           if (HT[(h1+k)%size] == key) {
470             baij_a = HD[(h1+k)%size];
471 
472             if (roworiented) {
473               value = v + i*(stepval+bs)*bs + j*bs;
474               for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
475                 for ( jj=0; jj<bs2; jj+=bs ) {
476                   if (addv == ADD_VALUES) baij_a[jj]  += *value++;
477                   else                    baij_a[jj]   = *value++;
478                 }
479               }
480 
481             } else {
482               value = v + j*(stepval+bs)*bs + i*bs;
483               for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
484                 for ( jj=0; jj<bs; jj++ ) {
485                   if (addv == ADD_VALUES) baij_a[jj]  += *value++;
486                   else                    baij_a[jj]   = *value++;
487                 }
488               }
489             }
490             break;
491           }
492         }
493         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
494       }
495     } else {
496       if (!baij->donotstash) {
497         if (roworiented ) {
498           row   = im[i]*bs;
499           value = v + i*(stepval+bs)*bs;
500           for ( j=0; j<bs; j++,row++ ) {
501             for ( k=0; k<n; k++ ) {
502               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
503                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
504               }
505             }
506           }
507         } else {
508           for ( j=0; j<n; j++ ) {
509             value = v + j*(stepval+bs)*bs + i*bs;
510             col   = in[j]*bs;
511             for ( k=0; k<bs; k++,col++,value+=stepval) {
512               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
513                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
514               }
515             }
516           }
517         }
518       }
519     }
520   }
521   PetscFunctionReturn(0);
522 }
523 #undef __FUNC__
524 #define __FUNC__ "MatGetValues_MPIBAIJ"
525 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
526 {
527   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
528   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
529   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
530 
531   for ( i=0; i<m; i++ ) {
532     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
533     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
534     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
535       row = idxm[i] - bsrstart;
536       for ( j=0; j<n; j++ ) {
537         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
538         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
539         if (idxn[j] >= bscstart && idxn[j] < bscend){
540           col = idxn[j] - bscstart;
541           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
542         } else {
543           if (!baij->colmap) {
544             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
545           }
546           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
547              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
548           else {
549             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
550             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
551           }
552         }
553       }
554     } else {
555       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
556     }
557   }
558   PetscFunctionReturn(0);
559 }
560 
561 #undef __FUNC__
562 #define __FUNC__ "MatNorm_MPIBAIJ"
563 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
564 {
565   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
566   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
567   int        ierr, i,bs2=baij->bs2;
568   double     sum = 0.0;
569   Scalar     *v;
570 
571   PetscFunctionBegin;
572   if (baij->size == 1) {
573     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
574   } else {
575     if (type == NORM_FROBENIUS) {
576       v = amat->a;
577       for (i=0; i<amat->nz*bs2; i++ ) {
578 #if defined(USE_PETSC_COMPLEX)
579         sum += real(conj(*v)*(*v)); v++;
580 #else
581         sum += (*v)*(*v); v++;
582 #endif
583       }
584       v = bmat->a;
585       for (i=0; i<bmat->nz*bs2; i++ ) {
586 #if defined(USE_PETSC_COMPLEX)
587         sum += real(conj(*v)*(*v)); v++;
588 #else
589         sum += (*v)*(*v); v++;
590 #endif
591       }
592       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
593       *norm = sqrt(*norm);
594     } else {
595       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
596     }
597   }
598   PetscFunctionReturn(0);
599 }
600 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   MatAssemblyBegin_MPIBAIJ - starts assembly by routing all stashed
   off-process entries to their owners with non-blocking MPI messages.
   Each entry is shipped as a triple of Scalars (global row, global col,
   value); the row/col indices are stored as Scalars to keep the message
   a single homogeneous buffer.  The matching waits and local insertion
   are performed in MatAssemblyEnd_MPIBAIJ().
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    /* owners[] holds block-row ownership ranges; scale by bs for point rows */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack (row,col,value) triples; starts[] is advanced as entries are packed */
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* rebuild starts[] (it was consumed by the packing loop above) */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",rank,baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* stash the communication state for MatAssemblyEnd_MPIBAIJ */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
697 
/*
   CreateHashTable - builds the open-addressed hash table used by the
   *_HT MatSetValues variants.  For every stored block in A and B it
   records the key (global block row)*Nbs + (global block col) + 1 in
   baij->ht and the pointer to that block's values in the HD array that
   follows the keys in the same allocation.  Collisions are resolved by
   linear probing.  Table size is factor*(total local nonzero blocks).
   NOTE(review): ht_size is overwritten before the early return below, so
   calling with a different factor after the table exists leaves ht_size
   inconsistent with the existing table — confirm callers always use the
   same factor.
*/
int CreateHashTable(Mat mat,double factor)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Mat         A = baij->A, B=baij->B;
  Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
  int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  int         size,ct=0,max1=0,max2=0,bs2=baij->bs2,rstart=baij->rstart;
  int         cstart=baij->cstart,*garray=baij->garray,row,col;
  double      *HT,key;
  extern int  PetscGlobalRank;
  Scalar      **HD;
  /*
     Scalar      *aa=a->a,*ba=b->a;
     static double *HT;
     static      int flag=1;
     */
  PetscFunctionBegin;
  baij->ht_size=(int)(factor*nz);
  size = baij->ht_size;
  /* Allocate Memory for Hash Table */
  if (baij->ht) {
    PetscFunctionReturn(0);
  }

  baij->ht = (double*)PetscMalloc(size*(sizeof(double)+sizeof(Scalar*))+1); CHKPTRQ(baij->ht);
  HT = baij->ht;
  HD = (Scalar**)(HT + size);
  /* NOTE(review): this zeroes all of HT but only one Scalar* worth of the
     HD region; apparently benign since HD slots are written before they
     are read, but confirm the size expression is intentional */
  PetscMemzero(HT,size*sizeof(double)+sizeof(Scalar*));


  /* Loop Over A */
  for ( i=0; i<a->mbs; i++ ) {
    for ( j=ai[i]; j<ai[i+1]; j++ ) {
      row = i+rstart;
      col = aj[j]+cstart;

      /* key is biased by +1 so that 0 can mark an empty slot */
      key = row*baij->Nbs + col + 1;
      /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
      h1  = HASH1(size,key);

      /* linear probing; NOTE(review): if the table is full (k reaches
         size) the block is silently dropped — confirm factor is always
         chosen > 1 so this cannot happen */
      for ( k=0; k<size; k++ ){
        if (HT[(h1+k)%size] == 0.0) {
          HT[(h1+k)%size] = key;
          HD[(h1+k)%size] = a->a + j*bs2;
          break;
        } else ct++;
      }
      if (k> max1) max1 = k;
    }
  }
  /* Loop Over B */
  for ( i=0; i<b->mbs; i++ ) {
    for ( j=bi[i]; j<bi[i+1]; j++ ) {
      row = i+rstart;
      col = garray[bj[j]];   /* global block column via the garray map */
      key = row*baij->Nbs + col + 1;
      /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
      h1  = HASH1(size,key);
      for ( k=0; k<size; k++ ){
        if (HT[(h1+k)%size] == 0.0) {
          HT[(h1+k)%size] = key;
          HD[(h1+k)%size] = b->a + j*bs2;
          break;
        } else ct++;
      }
      if (k> max2) max2 = k;
    }
  }

  /* Print Summary: j counts occupied slots, ct counts probe collisions */
  for ( i=0,key=0.0,j=0; i<size; i++)
    if (HT[i]) {j++;}

  /*printf("[%d] fact = %3.2f max1 = %5d max2 = %5d Size %5d - Searches %5d Avg %5.2f Keys %5d\n",
         PetscGlobalRank,factor,max1,max2,size,ct+j,((float)ct+j),j); */
  fflush(stdout);
  PetscFunctionReturn(0);
}
776 
777 #undef __FUNC__
778 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   MatAssemblyEnd_MPIBAIJ - Completes assembly of a parallel BAIJ matrix:
   drains the off-processor stash messages posted by MatAssemblyBegin,
   inserts the received values into the local diagonal (A) or off-diagonal
   (B) block, then assembles both blocks.

   Each received message is a sequence of (row, col, value) triples packed
   into a Scalar array (hence the 3*i indexing below).
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled,flg;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;  /* n scalars received -> n/3 (row,col,value) triples */
    for ( i=0; i<n; i++ ) {
      /* row/col arrive as global point indices encoded in Scalars;
         convert row to a local index relative to this process's range */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* column lies in the diagonal block; shift to its local numbering */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        /* column belongs to the off-diagonal block B */
        if (mat->was_assembled) {
          /* on reassembly the global->local column map is needed to
             translate the global column index; build it lazily */
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
          /* colmap stores i*bs+1 (1-based) so 0 means "not present";
             a negative result below means a new nonzero block */
          col = (baij->colmap[col/bs]) - 1 + col%bs;
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            /* new off-diagonal nonzero location: fall back to the
               disassembled (global column numbering) form of B */
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselfs, in order that we may reassemble. */
  ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  /* first final assembly: set up the scatter context for MatMult etc. */
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

  /* optionally switch to the hash-table based MatSetValues kernels */
  ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
  if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    double fact = 1.39;  /* hash table size factor */
    CreateHashTable(mat,fact);
    mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }
  /* scratch space used by MatGetRow, if any, is no longer valid */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
855 
856 #undef __FUNC__
857 #define __FUNC__ "MatView_MPIBAIJ_Binary"
858 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
859 {
860   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
861   int          ierr;
862 
863   PetscFunctionBegin;
864   if (baij->size == 1) {
865     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
866   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
867   PetscFunctionReturn(0);
868 }
869 
870 #undef __FUNC__
871 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
/*
   MatView_MPIBAIJ_ASCIIorDraworMatlab - Views a parallel BAIJ matrix with
   an ASCII, draw, or Matlab viewer.  Short info formats are handled
   per-process; otherwise the whole matrix is gathered to process 0
   (via a temporary MPIBAIJ matrix) and viewed there.
*/
static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  int          ierr, format,rank,bs = baij->bs;
  FILE         *fd;
  ViewerType   vtype;

  PetscFunctionBegin;
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerGetFormat(viewer,&format);
    if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
      /* per-process nonzero/memory statistics, printed one process at a time */
      MatInfo info;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);
      PetscSequentialPhaseBegin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
              baij->bs,(int)info.memory);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
      fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
      fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      fflush(fd);
      PetscSequentialPhaseEnd(mat->comm,1);
      ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == VIEWER_FORMAT_ASCII_INFO) {
      PetscPrintf(mat->comm,"  block size is %d\n",bs);
      PetscFunctionReturn(0);
    }
  }

  if (vtype == DRAW_VIEWER) {
    /* nothing to draw on a null draw context */
    Draw       draw;
    PetscTruth isnull;
    ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
    ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (vtype == ASCII_FILE_VIEWER) {
    /* every process prints its own portion, sequentially */
    ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
            baij->cstart*bs,baij->cend*bs);
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  } else {
    int size = baij->size;
    rank = baij->rank;
    if (size == 1) {
      ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    } else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqBAIJ *Aloc;
      int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
      int         mbs=baij->mbs;
      Scalar      *a;

      /* process 0 gets all M x N rows/cols locally; the rest get none */
      if (!rank) {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      } else {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_SeqBAIJ*) baij->A->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart;
      /* rvals holds the bs consecutive global point rows of one block row */
      rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);

      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = (baij->cstart+aj[j])*bs;
          /* insert the bs x bs block one (column-of-block) slice at a time */
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      /* copy over the B part */
      Aloc = (Mat_SeqBAIJ*) baij->B->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart*bs;
      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          /* garray maps B's compact column index back to a global block column */
          col = baij->garray[aj[j]]*bs;
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      PetscFree(rvals);
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      /*
         Everyone has to call to draw the matrix since the graphics waits are
         synchronized across all processors that share the Draw object
      */
      if (!rank || vtype == DRAW_VIEWER) {
        ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
990 
991 
992 
993 #undef __FUNC__
994 #define __FUNC__ "MatView_MPIBAIJ"
995 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
996 {
997   Mat         mat = (Mat) obj;
998   int         ierr;
999   ViewerType  vtype;
1000 
1001   PetscFunctionBegin;
1002   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1003   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1004       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1005     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1006   } else if (vtype == BINARY_FILE_VIEWER) {
1007     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1008   }
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 #undef __FUNC__
1013 #define __FUNC__ "MatDestroy_MPIBAIJ"
/*
   MatDestroy_MPIBAIJ - Frees all storage owned by a parallel BAIJ matrix:
   the stash, the two sequential blocks, the communication structures,
   and the optional work arrays, then the header itself.
*/
int MatDestroy_MPIBAIJ(PetscObject obj)
{
  Mat         mat = (Mat) obj;
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  PetscFunctionBegin;
#if defined(USE_PETSC_LOG)
  PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
  PetscFree(baij->rowners);
  /* destroy the diagonal and off-diagonal sequential blocks */
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
  /* the following are created lazily and may be absent */
  if (baij->colmap) PetscFree(baij->colmap);
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  if (baij->barray) PetscFree(baij->barray);
  if (baij->ht) PetscFree(baij->ht);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}
1041 
1042 #undef __FUNC__
1043 #define __FUNC__ "MatMult_MPIBAIJ"
1044 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1045 {
1046   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1047   int         ierr, nt;
1048 
1049   PetscFunctionBegin;
1050   VecGetLocalSize_Fast(xx,nt);
1051   if (nt != a->n) {
1052     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1053   }
1054   VecGetLocalSize_Fast(yy,nt);
1055   if (nt != a->m) {
1056     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1057   }
1058   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1059   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1060   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1061   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1062   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 #undef __FUNC__
1067 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1068 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1069 {
1070   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1071   int        ierr;
1072 
1073   PetscFunctionBegin;
1074   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1075   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1076   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1077   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 #undef __FUNC__
1082 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1083 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1084 {
1085   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1086   int         ierr;
1087 
1088   PetscFunctionBegin;
1089   /* do nondiagonal part */
1090   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1091   /* send it on its way */
1092   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1093   /* do local part */
1094   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1095   /* receive remote parts: note this assumes the values are not actually */
1096   /* inserted in yy until the next line, which is true for my implementation*/
1097   /* but is not perhaps always true. */
1098   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 #undef __FUNC__
1103 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
/*
   MatMultTransAdd_MPIBAIJ - Computes zz = A^T*xx + yy using the split
   diagonal-block (A) plus off-diagonal-block (B) storage.
*/
int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part: a->lvec = B^T * xx */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way: reverse scatter adds lvec into zz on the owners */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  /* do local part: zz = A^T*xx + yy on the diagonal block */
  ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1122 
1123 /*
1124   This only works correctly for square matrices where the subblock A->A is the
1125    diagonal block
1126 */
1127 #undef __FUNC__
1128 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1129 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1130 {
1131   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1132   int         ierr;
1133 
1134   PetscFunctionBegin;
1135   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1136   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1137   PetscFunctionReturn(0);
1138 }
1139 
1140 #undef __FUNC__
1141 #define __FUNC__ "MatScale_MPIBAIJ"
1142 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1143 {
1144   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1145   int         ierr;
1146 
1147   PetscFunctionBegin;
1148   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1149   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 #undef __FUNC__
1154 #define __FUNC__ "MatGetSize_MPIBAIJ"
1155 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1156 {
1157   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1158 
1159   PetscFunctionBegin;
1160   if (m) *m = mat->M;
1161   if (n) *n = mat->N;
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 #undef __FUNC__
1166 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1167 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1168 {
1169   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1170 
1171   PetscFunctionBegin;
1172   *m = mat->m; *n = mat->N;
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNC__
1177 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1178 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1179 {
1180   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1181 
1182   PetscFunctionBegin;
1183   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1184   PetscFunctionReturn(0);
1185 }
1186 
1187 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1188 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1189 
1190 #undef __FUNC__
1191 #define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   MatGetRow_MPIBAIJ - Returns one locally-owned row of the parallel matrix
   as a merged, column-sorted list of (index, value) pairs drawn from the
   diagonal (A) and off-diagonal (B) blocks.

   Only rows in this process's ownership range are supported.  The output
   arrays point into per-matrix scratch storage (mat->rowvalues /
   mat->rowindices), sized for the longest local row on first use.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* guard against nested MatGetRow calls, which would share scratch space */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      /* combined block count of row i across both blocks */
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds both the value and the index scratch arrays */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* pass null output pointers down when the caller did not request them */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;  /* maps B's compact block columns to global block columns */
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      /* imark splits B's entries into those left of (i < imark) and right
         of (i >= imark) the diagonal block's column range */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark already found while copying values above */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1271 
1272 #undef __FUNC__
1273 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1274 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1275 {
1276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1277 
1278   PetscFunctionBegin;
1279   if (baij->getrowactive == PETSC_FALSE) {
1280     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1281   }
1282   baij->getrowactive = PETSC_FALSE;
1283   PetscFunctionReturn(0);
1284 }
1285 
1286 #undef __FUNC__
1287 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1288 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1289 {
1290   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1291 
1292   PetscFunctionBegin;
1293   *bs = baij->bs;
1294   PetscFunctionReturn(0);
1295 }
1296 
1297 #undef __FUNC__
1298 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1299 int MatZeroEntries_MPIBAIJ(Mat A)
1300 {
1301   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1302   int         ierr;
1303 
1304   PetscFunctionBegin;
1305   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1306   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1307   PetscFunctionReturn(0);
1308 }
1309 
1310 #undef __FUNC__
1311 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1312 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1313 {
1314   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1315   Mat         A = a->A, B = a->B;
1316   int         ierr;
1317   double      isend[5], irecv[5];
1318 
1319   PetscFunctionBegin;
1320   info->block_size     = (double)a->bs;
1321   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1322   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1323   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1324   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1325   if (flag == MAT_LOCAL) {
1326     info->nz_used      = isend[0];
1327     info->nz_allocated = isend[1];
1328     info->nz_unneeded  = isend[2];
1329     info->memory       = isend[3];
1330     info->mallocs      = isend[4];
1331   } else if (flag == MAT_GLOBAL_MAX) {
1332     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1333     info->nz_used      = irecv[0];
1334     info->nz_allocated = irecv[1];
1335     info->nz_unneeded  = irecv[2];
1336     info->memory       = irecv[3];
1337     info->mallocs      = irecv[4];
1338   } else if (flag == MAT_GLOBAL_SUM) {
1339     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1340     info->nz_used      = irecv[0];
1341     info->nz_allocated = irecv[1];
1342     info->nz_unneeded  = irecv[2];
1343     info->memory       = irecv[3];
1344     info->mallocs      = irecv[4];
1345   }
1346   info->rows_global       = (double)a->M;
1347   info->columns_global    = (double)a->N;
1348   info->rows_local        = (double)a->m;
1349   info->columns_local     = (double)a->N;
1350   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1351   info->fill_ratio_needed = 0;
1352   info->factor_mallocs    = 0;
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 #undef __FUNC__
1357 #define __FUNC__ "MatSetOption_MPIBAIJ"
1358 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1359 {
1360   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1361 
1362   PetscFunctionBegin;
1363   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1364       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1365       op == MAT_COLUMNS_UNSORTED ||
1366       op == MAT_COLUMNS_SORTED ||
1367       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1368       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1369         MatSetOption(a->A,op);
1370         MatSetOption(a->B,op);
1371   } else if (op == MAT_ROW_ORIENTED) {
1372         a->roworiented = 1;
1373         MatSetOption(a->A,op);
1374         MatSetOption(a->B,op);
1375   } else if (op == MAT_ROWS_SORTED ||
1376              op == MAT_ROWS_UNSORTED ||
1377              op == MAT_SYMMETRIC ||
1378              op == MAT_STRUCTURALLY_SYMMETRIC ||
1379              op == MAT_YES_NEW_DIAGONALS)
1380     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1381   else if (op == MAT_COLUMN_ORIENTED) {
1382     a->roworiented = 0;
1383     MatSetOption(a->A,op);
1384     MatSetOption(a->B,op);
1385   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1386     a->donotstash = 1;
1387   } else if (op == MAT_NO_NEW_DIAGONALS) {
1388     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1389   } else {
1390     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1391   }
1392   PetscFunctionReturn(0);
1393 }
1394 
1395 #undef __FUNC__
1396 #define __FUNC__ "MatTranspose_MPIBAIJ("
1397 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1398 {
1399   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1400   Mat_SeqBAIJ *Aloc;
1401   Mat        B;
1402   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1403   int        bs=baij->bs,mbs=baij->mbs;
1404   Scalar     *a;
1405 
1406   PetscFunctionBegin;
1407   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1408   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1409   CHKERRQ(ierr);
1410 
1411   /* copy over the A part */
1412   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1413   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1414   row = baij->rstart;
1415   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1416 
1417   for ( i=0; i<mbs; i++ ) {
1418     rvals[0] = bs*(baij->rstart + i);
1419     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1420     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1421       col = (baij->cstart+aj[j])*bs;
1422       for (k=0; k<bs; k++ ) {
1423         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1424         col++; a += bs;
1425       }
1426     }
1427   }
1428   /* copy over the B part */
1429   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1430   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1431   row = baij->rstart*bs;
1432   for ( i=0; i<mbs; i++ ) {
1433     rvals[0] = bs*(baij->rstart + i);
1434     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1435     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1436       col = baij->garray[aj[j]]*bs;
1437       for (k=0; k<bs; k++ ) {
1438         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1439         col++; a += bs;
1440       }
1441     }
1442   }
1443   PetscFree(rvals);
1444   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1445   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1446 
1447   if (matout != PETSC_NULL) {
1448     *matout = B;
1449   } else {
1450     /* This isn't really an in-place transpose .... but free data structures from baij */
1451     PetscFree(baij->rowners);
1452     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1453     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1454     if (baij->colmap) PetscFree(baij->colmap);
1455     if (baij->garray) PetscFree(baij->garray);
1456     if (baij->lvec) VecDestroy(baij->lvec);
1457     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1458     PetscFree(baij);
1459     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1460     PetscHeaderDestroy(B);
1461   }
1462   PetscFunctionReturn(0);
1463 }
1464 
1465 #undef __FUNC__
1466 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1467 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1468 {
1469   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1470   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1471   int ierr,s1,s2,s3;
1472 
1473   PetscFunctionBegin;
1474   if (ll)  {
1475     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1476     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1477     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1478     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1479     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1480   }
1481   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1482   PetscFunctionReturn(0);
1483 }
1484 
1485 #undef __FUNC__
1486 #define __FUNC__ "MatZeroRows_MPIBAIJ"
/*
   MatZeroRows_MPIBAIJ - Zeros the (global point) rows listed in is; if
   diag is non-null the diagonal entry of each zeroed row is set to *diag.

   Row indices may refer to rows owned by other processes, so the routine
   first routes each index to its owner with a full send/receive exchange,
   then zeros the locally-owned rows in both blocks.
*/
int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* rowners holds block-row boundaries, hence the *bs conversion */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank];
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack each row index into the segment of its owning process */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* rebuild starts (the packing loop above advanced them) and send */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  /* first global point row owned by this process */
  base = owners[rank]*bs;

  /*  wait on receives */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  /* collect all received global rows, converted to local numbering */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);

  /* diag is handled below via MatSetValues, so pass 0 here */
  ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  if (diag) {
    /* reinstate the diagonal entries and reassemble */
    for ( i = 0; i < slen; i++ ) {
      row = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }
  PetscFree(lrows);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}
1616 extern int MatPrintHelp_SeqBAIJ(Mat);
1617 #undef __FUNC__
1618 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1619 int MatPrintHelp_MPIBAIJ(Mat A)
1620 {
1621   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1622   int         ierr;
1623 
1624   PetscFunctionBegin;
1625   if (!a->rank) {
1626     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1627   }
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 #undef __FUNC__
1632 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1633 int MatSetUnfactored_MPIBAIJ(Mat A)
1634 {
1635   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1636   int         ierr;
1637 
1638   PetscFunctionBegin;
1639   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1640   PetscFunctionReturn(0);
1641 }
1642 
static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);

/* -------------------------------------------------------------------*/
/*
    Function table for the MPIBAIJ matrix type.  The initializer is
  POSITIONAL: each entry must sit in exactly the slot that struct _MatOps
  declares for that operation, and a 0 marks an operation this type does
  not support.  Do not reorder or insert entries without consulting the
  _MatOps declaration in the Mat implementation headers.
*/
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
  0,0,0,0,
  0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
  0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,MatConvertSameType_MPIBAIJ,0,0,
  0,0,0,MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
  0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1661 
1662 
1663 #undef __FUNC__
1664 #define __FUNC__ "MatCreateMPIBAIJ"
1665 /*@C
1666    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1667    (block compressed row).  For good matrix assembly performance
1668    the user should preallocate the matrix storage by setting the parameters
1669    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1670    performance can be increased by more than a factor of 50.
1671 
1672    Input Parameters:
1673 .  comm - MPI communicator
.  bs   - size of block
1675 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1676            This value should be the same as the local size used in creating the
1677            y vector for the matrix-vector product y = Ax.
1678 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1679            This value should be the same as the local size used in creating the
1680            x vector for the matrix-vector product y = Ax.
1681 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1682 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1683 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1684            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local submatrix (possibly different for each block
           row) or PETSC_NULL.  You must leave room for the diagonal entry even if
           it is zero.
1689 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1690            submatrix (same for all local rows).
.  o_nnz - array containing the number of block nonzeros in the various block rows of the
1692            off-diagonal portion of the local submatrix (possibly different for
1693            each block row) or PETSC_NULL.
1694 
1695    Output Parameter:
1696 .  A - the matrix
1697 
1698    Notes:
1699    The user MUST specify either the local or global matrix dimensions
1700    (possibly both).
1701 
1702    Storage Information:
1703    For a square global matrix we define each processor's diagonal portion
1704    to be its local rows and the corresponding columns (a square submatrix);
1705    each processor's off-diagonal portion encompasses the remainder of the
1706    local matrix (a rectangular submatrix).
1707 
1708    The user can specify preallocated storage for the diagonal part of
1709    the local submatrix with either d_nz or d_nnz (not both).  Set
1710    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1711    memory allocation.  Likewise, specify preallocated storage for the
1712    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1713 
1714    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1715    the figure below we depict these three local rows and all columns (0-11).
1716 
1717 $          0 1 2 3 4 5 6 7 8 9 10 11
1718 $         -------------------
1719 $  row 3  |  o o o d d d o o o o o o
1720 $  row 4  |  o o o d d d o o o o o o
1721 $  row 5  |  o o o d d d o o o o o o
1722 $         -------------------
1723 $
1724 
1725    Thus, any entries in the d locations are stored in the d (diagonal)
1726    submatrix, and any entries in the o locations are stored in the
1727    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1728    stored simply in the MATSEQBAIJ format for compressed row storage.
1729 
1730    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1731    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1732    In general, for PDE problems in which most nonzeros are near the diagonal,
1733    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1734    or you will get TERRIBLE performance; see the users' manual chapter on
1735    matrices.
1736 
1737 .keywords: matrix, block, aij, compressed row, sparse, parallel
1738 
1739 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1740 @*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  MPI_Comm_size(comm,&size);
  /* Uniprocessor case: fall back to a plain sequential BAIJ matrix.
     Note the off-diagonal preallocation (o_nz/o_nnz) is not used here,
     since a one-process matrix has no off-diagonal part. */
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  /* allocate and zero the implementation struct so every field starts 0 */
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));

  B->destroy    = MatDestroy_MPIBAIJ;
  B->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* sanity checks: per-row preallocation arrays require known local sizes,
     and at least one of each (local,global) dimension pair must be given */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* when both local and global sizes are given, the local sizes win:
     the global size is discarded and recomputed by summation below */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* compute any missing global sizes as the sum of the local sizes */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* compute any missing local sizes by distributing the global block rows
     as evenly as possible (low ranks get one extra block) */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  /* record sizes both in the implementation struct and the Mat header */
  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  /* gather per-process block counts into rowners[1..size], then prefix-sum;
     rowners[1] already holds rank 0's count, so the scan starts at i=2 */
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  b->rstart_bs = b->rstart * bs;   /* ownership range in point (unblocked) indices */
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* create the two local pieces: A is the m x n diagonal block, B the
     m x N off-diagonal block (stored with global column width N) */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht          = 0;
  b->ht_size     = 0;

  *A = B;
  PetscFunctionReturn(0);
}
1880 
1881 #undef __FUNC__
1882 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1883 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1884 {
1885   Mat         mat;
1886   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1887   int         ierr, len=0, flg;
1888 
1889   PetscFunctionBegin;
1890   *newmat       = 0;
1891   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1892   PLogObjectCreate(mat);
1893   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1894   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1895   mat->destroy    = MatDestroy_MPIBAIJ;
1896   mat->view       = MatView_MPIBAIJ;
1897   mat->factor     = matin->factor;
1898   mat->assembled  = PETSC_TRUE;
1899 
1900   a->m = mat->m   = oldmat->m;
1901   a->n = mat->n   = oldmat->n;
1902   a->M = mat->M   = oldmat->M;
1903   a->N = mat->N   = oldmat->N;
1904 
1905   a->bs  = oldmat->bs;
1906   a->bs2 = oldmat->bs2;
1907   a->mbs = oldmat->mbs;
1908   a->nbs = oldmat->nbs;
1909   a->Mbs = oldmat->Mbs;
1910   a->Nbs = oldmat->Nbs;
1911 
1912   a->rstart       = oldmat->rstart;
1913   a->rend         = oldmat->rend;
1914   a->cstart       = oldmat->cstart;
1915   a->cend         = oldmat->cend;
1916   a->size         = oldmat->size;
1917   a->rank         = oldmat->rank;
1918   mat->insertmode = NOT_SET_VALUES;
1919   a->rowvalues    = 0;
1920   a->getrowactive = PETSC_FALSE;
1921   a->barray       = 0;
1922 
1923   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1924   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1925   a->cowners = a->rowners + a->size + 2;
1926   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1927   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1928   if (oldmat->colmap) {
1929     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1930     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1931     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1932   } else a->colmap = 0;
1933   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1934     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1935     PLogObjectMemory(mat,len*sizeof(int));
1936     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1937   } else a->garray = 0;
1938 
1939   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1940   PLogObjectParent(mat,a->lvec);
1941   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1942   PLogObjectParent(mat,a->Mvctx);
1943   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1944   PLogObjectParent(mat,a->A);
1945   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1946   PLogObjectParent(mat,a->B);
1947   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1948   if (flg) {
1949     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1950   }
1951   *newmat = mat;
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 #include "sys.h"
1956 
#undef __FUNC__
#define __FUNC__ "MatLoad_MPIBAIJ"
/*
    MatLoad_MPIBAIJ - Loads an MPIBAIJ matrix from a PETSc binary viewer.

    Protocol: process 0 reads the file header, the row lengths, the column
  indices and the numerical values, and ships each process its portion
  (column indices with the viewer tag, values with the matrix tag); the
  other processes only receive.  If the global size is not divisible by
  the -matload_block_size, extra identity rows/columns are appended so
  that it is.  Only square matrices are supported (enforced below).
*/
int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  /* block size is a runtime option; defaults to 1 */
  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;

  /* only process 0 touches the file; it validates the header first */
  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);   /* = bs - (M mod bs) */
  if (extra_rows == bs) extra_rows = 0;
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows: Mbs block rows split evenly,
     low ranks receiving one extra block */
  mbs = Mbs/size + ((Mbs % size) > rank);
  m   = mbs * bs;
  /* one allocation holds both tables: rowners (block indices, entries
     0..size) and browners (point indices, entries 0..size) */
  rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
  browners = rowners + size + 1;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
  for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];   /* ownership range in block-row indices */
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    /* each padding row will hold a single (diagonal) entry */
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices  */
    nz = procsnz[0];
    ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* on one process the padding entries are not on disk; synthesize them */
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  /* mask marks block columns already counted for the current block row;
     masked1/masked2 record which entries to clear afterwards */
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,3*Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        tmp = mycols[nzcount++]/bs;   /* block column of this entry */
        if (!mask[tmp]) {
          mask[tmp] = 1;
          /* classify against the ROW ownership range -- valid because the
             matrix is square with identical row/column partitioning */
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
         CHKERRQ(ierr);
  A = *newmat;
  /* file stores each row's columns in ascending order */
  MatSetOption(A,MAT_COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
    /* read in my part of the matrix numerical values  */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    /* padding rows get a unit diagonal to keep the matrix nonsingular */
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
    /* read in other processors (except the last one) and ship out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc */
    if ( size != 1 ){
      nz   = procsnz[i] - extra_rows;   /* i == size-1 after the loop above */
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(procsnz);
  } else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2185 
2186 
2187