xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 4a15367f0f054a95d8a3f3a066947446a2fbaceb)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.102 1998/02/01 07:42:11 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
   storage of the matrix.  This is done in a non-scalable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
#define CHUNKSIZE  10

/*
   MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) - inserts one scalar
   at local point indices (row,col) into the "diagonal" sequential matrix A.
   Expects the following to be in scope (set up by MatSetValues_MPIBAIJ):
   a (Mat_SeqBAIJ*), A (Mat), aa/ai/aj/aimax/ailen (matrix arrays), bs, bs2,
   and scratch variables rp,ap,bap,brow,bcol,ridx,cidx,low,high,t,_i,ii,N,
   nrow,rmax.  Search is a truncated binary search followed by a linear
   scan; if the block is absent and insertion is allowed (a->nonew), the
   row is grown by CHUNKSIZE block slots via a full reallocation.
   NOTE(review): a->nz is incremented only on the reallocation path, not for
   every newly inserted block -- confirm nonzero accounting is finalized
   elsewhere (e.g. during assembly).
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
113 
/*
   MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) - identical in
   structure to MatSetValues_SeqBAIJ_A_Private above, but operates on the
   "off-diagonal" sequential matrix B (b, B, ba/bi/bj/bimax/bilen).
   Kept as a separate macro because the two expansions bind to different
   in-scope variables; any fix made to one macro should be mirrored in
   the other.
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
188 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   MatSetValues_MPIBAIJ - point-wise (not block-wise) insertion into the
   parallel BAIJ matrix.  im[]/in[] are global point indices.  Entries in
   locally owned rows go directly into the diagonal (A) or off-diagonal (B)
   sequential matrices via the macros above; entries belonging to other
   processors are stashed and communicated at assembly time.
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    /* row owned by this processor? (point-index ownership range) */
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column in the diagonal block: insert into A */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* off-diagonal column: insert into B, translating the global
             column through colmap once the matrix has been assembled */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
            /* colmap is 1-based, hence the -1; col < 0 means "new column" */
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* row owned by another processor: stash for assembly-time send */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
	  for ( j=0; j<n; j++ ) {
	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
#include <math.h>
/*
   Multiplicative hashing: HASH_KEY is (approximately) the fractional part
   of the golden ratio.  HASH(size,key,tmp) maps key into [0,size) using
   the fractional part of key*HASH_KEY; tmp is a caller-supplied double
   scratch variable (the macro assigns to it, so the argument must be an
   lvalue and must not alias key or size).
*/
#define HASH_KEY 0.6180339887
/* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ_HT"
/*
   MatSetValues_MPIBAIJ_HT - point-wise insertion using the hash table
   built by MatCreateHashTable_MPIBAIJ_Private().  Only PREEXISTING
   nonzero blocks can be addressed: a key miss after a full (wrapping)
   linear probe is an error.  Keys encode (block row, block col) as
   (row/bs)*Nbs + (col/bs) + 1; HD[idx] points at the bs x bs value block.
*/
int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
  int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
  double      tmp;
  Scalar      ** HD = baij->hd,value;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
#endif

  PetscFunctionBegin;

  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
      row = im[i];
    if (row >= rstart_orig && row < rend_orig) {
      for ( j=0; j<n; j++ ) {
        col = in[j];
        if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
        /* Look up into the Hash Table */
        key = (row/bs)*Nbs+(col/bs)+1;
        h1  = HASH(size,key,tmp);


        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        /* debug version additionally counts probe/insert statistics */
        insert_ct++;
        total_ct++;
        if (HT[idx] != key) {
          /* linear probe to end of table, then wrap around to h1 */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          /* linear probe to end of table, then wrap around to h1 */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        /* A HASH table entry is found, so insert the values at the correct address */
        if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
        else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
      }
    } else {
      /* row owned by another processor: stash for assembly-time send */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
	  for ( j=0; j<n; j++ ) {
	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
458 
#undef __FUNC__
#define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
/*
   MatSetValuesBlocked_MPIBAIJ_HT - block-wise insertion using the hash
   table built by MatCreateHashTable_MPIBAIJ_Private().  im[]/in[] are
   block indices; keys encode row*Nbs + col + 1.  Only preexisting blocks
   can be addressed (key miss after a full wrapping probe is an error).
   Values are scattered column-major into the bs x bs block pointed to by
   HD[idx]; off-processor rows are stashed point-wise.
   NOTE(review): v_t assumes contiguous bs2-sized blocks per i, which
   matches the roworiented path only when n == 1 -- confirm callers.
*/
int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,ii,jj,row,col,k,l;
  int         roworiented = baij->roworiented,rstart=baij->rstart ;
  int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
  int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
  double      tmp;
  Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
#endif

  PetscFunctionBegin;

  /* stride between consecutive block rows of v for the chosen orientation */
  if (roworiented) {
    stepval = (n-1)*bs;
  } else {
    stepval = (m-1)*bs;
  }
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    row   = im[i];
    v_t   = v + i*bs2;
    if (row >= rstart && row < rend) {
      for ( j=0; j<n; j++ ) {
        col = in[j];

        /* Look up into the Hash Table */
        key = row*Nbs+col+1;
        h1  = HASH(size,key,tmp);

        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        /* debug version additionally counts probe/insert statistics */
        total_ct++;
        insert_ct++;
       if (HT[idx] != key) {
          /* linear probe to end of table, then wrap around to h1 */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          /* linear probe to end of table, then wrap around to h1 */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        /* scatter the bs x bs block into the stored (column-major) block */
        baij_a = HD[idx];
        if (roworiented) {
          /*value = v + i*(stepval+bs)*bs + j*bs;*/
          /* value = v + (i*(stepval+bs)+j)*bs; */
          value = v_t;
          v_t  += bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        } else {
          value = v + j*(stepval+bs)*bs + i*bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        }
      }
    } else {
      /* block row owned elsewhere: stash each scalar (point indices) */
      if (!baij->donotstash) {
        if (roworiented ) {
          row   = im[i]*bs;
          value = v + i*(stepval+bs)*bs;
          for ( j=0; j<bs; j++,row++ ) {
            for ( k=0; k<n; k++ ) {
              for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        } else {
          for ( j=0; j<n; j++ ) {
            value = v + j*(stepval+bs)*bs + i*bs;
            col   = in[j]*bs;
            for ( k=0; k<bs; k++,col++,value+=stepval) {
              for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
588 #undef __FUNC__
589 #define __FUNC__ "MatGetValues_MPIBAIJ"
590 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
591 {
592   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
593   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
594   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
595 
596   for ( i=0; i<m; i++ ) {
597     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
598     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
599     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
600       row = idxm[i] - bsrstart;
601       for ( j=0; j<n; j++ ) {
602         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
603         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
604         if (idxn[j] >= bscstart && idxn[j] < bscend){
605           col = idxn[j] - bscstart;
606           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
607         } else {
608           if (!baij->colmap) {
609             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
610           }
611           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
612              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
613           else {
614             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
615             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
616           }
617         }
618       }
619     } else {
620       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
621     }
622   }
623  PetscFunctionReturn(0);
624 }
625 
#undef __FUNC__
#define __FUNC__ "MatNorm_MPIBAIJ"
/*
   MatNorm_MPIBAIJ - computes a norm of the parallel matrix.  On a single
   process the computation is delegated to the sequential matrix A; in
   parallel only NORM_FROBENIUS is supported: each process sums the squares
   of its A and B entries and the partial sums are combined with an
   MPI_Allreduce, so every process receives the same result in *norm.
*/
int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
  int        ierr, i,bs2=baij->bs2;
  double     sum = 0.0;
  Scalar     *v;

  PetscFunctionBegin;
  if (baij->size == 1) {
    ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
  } else {
    if (type == NORM_FROBENIUS) {
      /* nz counts blocks, so nz*bs2 is the number of stored scalars */
      v = amat->a;
      for (i=0; i<amat->nz*bs2; i++ ) {
#if defined(USE_PETSC_COMPLEX)
        sum += real(conj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      v = bmat->a;
      for (i=0; i<bmat->nz*bs2; i++ ) {
#if defined(USE_PETSC_COMPLEX)
        sum += real(conj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
      *norm = sqrt(*norm);
    } else {
      SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
    }
  }
  PetscFunctionReturn(0);
}
665 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   MatAssemblyBegin_MPIBAIJ - starts communication of stashed off-processor
   values.  Each stashed entry is shipped as a triple (global row, global
   col, value), with the indices stored as Scalars to allow a single
   MPIU_SCALAR message.  Nonblocking sends/receives are posted here and
   stored in the matrix; AssemblyEnd is expected to complete them.
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    /* rowners holds block-row ownership, hence the *bs scaling */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack (row, col, value) triples; note starts[] is consumed here ... */
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* ... and rebuilt before posting the sends */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(0,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* stash the outstanding communication state for MatAssemblyEnd */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
762 
763 /*
764   Creates the hash table, and sets the table
765   This table is created only once.
766   If new entried need to be added to the matrix
767   then the hash table has to be destroyed and
768   recreated.
769 */
770 #undef __FUNC__
771 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
772 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
773 {
774   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
775   Mat         A = baij->A, B=baij->B;
776   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
777   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
778   int         size,bs2=baij->bs2,rstart=baij->rstart;
779   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
780   int         *HT,key;
781   Scalar      **HD;
782   double      tmp;
783 #if defined(USE_PETSC_BOPT_g)
784   int         ct=0,max=0;
785 #endif
786 
787   PetscFunctionBegin;
788   baij->ht_size=(int)(factor*nz);
789   size = baij->ht_size;
790 
791   if (baij->ht) {
792     PetscFunctionReturn(0);
793   }
794 
795   /* Allocate Memory for Hash Table */
796   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
797   baij->ht = (int*)(baij->hd + size);
798   HD = baij->hd;
799   HT = baij->ht;
800 
801 
802   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
803 
804 
805   /* Loop Over A */
806   for ( i=0; i<a->mbs; i++ ) {
807     for ( j=ai[i]; j<ai[i+1]; j++ ) {
808       row = i+rstart;
809       col = aj[j]+cstart;
810 
811       key = row*Nbs + col + 1;
812       h1  = HASH(size,key,tmp);
813       for ( k=0; k<size; k++ ){
814         if (HT[(h1+k)%size] == 0.0) {
815           HT[(h1+k)%size] = key;
816           HD[(h1+k)%size] = a->a + j*bs2;
817           break;
818 #if defined(USE_PETSC_BOPT_g)
819         } else {
820           ct++;
821 #endif
822         }
823       }
824 #if defined(USE_PETSC_BOPT_g)
825       if (k> max) max = k;
826 #endif
827     }
828   }
829   /* Loop Over B */
830   for ( i=0; i<b->mbs; i++ ) {
831     for ( j=bi[i]; j<bi[i+1]; j++ ) {
832       row = i+rstart;
833       col = garray[bj[j]];
834       key = row*Nbs + col + 1;
835       h1  = HASH(size,key,tmp);
836       for ( k=0; k<size; k++ ){
837         if (HT[(h1+k)%size] == 0.0) {
838           HT[(h1+k)%size] = key;
839           HD[(h1+k)%size] = b->a + j*bs2;
840           break;
841 #if defined(USE_PETSC_BOPT_g)
842         } else {
843           ct++;
844 #endif
845         }
846       }
847 #if defined(USE_PETSC_BOPT_g)
848       if (k> max) max = k;
849 #endif
850     }
851   }
852 
853   /* Print Summary */
854 #if defined(USE_PETSC_BOPT_g)
855   for ( i=0,j=0; i<size; i++)
856     if (HT[i]) {j++;}
857   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
858            (j== 0)? 0.0:((double)(ct+j))/j,max);
859 #endif
860   PetscFunctionReturn(0);
861 }
862 
#undef __FUNC__
#define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   Completes the assembly started in MatAssemblyBegin_MPIBAIJ: drains the
   posted receives of stashed off-processor (row,col,value) triples and
   inserts them into the local A (diagonal) or B (off-diagonal) block,
   completes the sends, then assembles the two sequential blocks.
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled,flg;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space; each message is a sequence of
       (row,col,value) triples packed as Scalars */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      /* global point indices arrive as Scalars; row is converted to local */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* column falls in this processor's diagonal block */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        /* off-diagonal block: translate global column via colmap
           (built lazily; colmap stores local offset + 1, 0 means absent) */
        if (mat->was_assembled) {
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
          col = (baij->colmap[col/bs]) - 1 + col%bs;
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            /* column not present in the assembled structure: disassemble
               and fall back to the global column index */
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble. */
  ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

#if defined(USE_PETSC_BOPT_g)
  /* report and reset hash-table search statistics (debug builds) */
  if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
             ((double)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  /* optionally switch to hash-table based MatSetValues for later insertions */
  ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
  if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    double fact = 1.39;
    ierr = OptionsGetDouble(PETSC_NULL,"-use_hash",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Hash table Factor used %5.2f\n",fact);
    ierr = MatCreateHashTable_MPIBAIJ_Private(mat,fact); CHKERRQ(ierr);
    mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }

  /* scratch space used by MatGetRow is invalid after reassembly */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
953 
954 #undef __FUNC__
955 #define __FUNC__ "MatView_MPIBAIJ_Binary"
956 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
957 {
958   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
959   int          ierr;
960 
961   PetscFunctionBegin;
962   if (baij->size == 1) {
963     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
964   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
965   PetscFunctionReturn(0);
966 }
967 
#undef __FUNC__
#define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
/*
   ASCII/Draw/Matlab viewer for MPIBAIJ matrices.  Short info formats are
   handled per-processor; the general case gathers the entire matrix onto
   processor 0 and views it there.
*/
static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  int          ierr, format,rank,bs = baij->bs;
  FILE         *fd;
  ViewerType   vtype;

  PetscFunctionBegin;
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  /* info formats: print per-processor statistics and return early */
  if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerGetFormat(viewer,&format);
    if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
      MatInfo info;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);
      /* sequential phases keep the per-rank output from interleaving */
      PetscSequentialPhaseBegin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
              baij->bs,(int)info.memory);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
      fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
      fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
      fflush(fd);
      PetscSequentialPhaseEnd(mat->comm,1);
      ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == VIEWER_FORMAT_ASCII_INFO) {
      PetscPrintf(mat->comm,"  block size is %d\n",bs);
      PetscFunctionReturn(0);
    }
  }

  if (vtype == DRAW_VIEWER) {
    Draw       draw;
    PetscTruth isnull;
    ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
    /* a null draw context means graphics are disabled: nothing to do */
    ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (vtype == ASCII_FILE_VIEWER) {
    /* each processor prints its own local blocks in rank order */
    ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
            baij->cstart*bs,baij->cend*bs);
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  } else {
    int size = baij->size;
    rank = baij->rank;
    if (size == 1) {
      ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    } else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqBAIJ *Aloc;
      int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
      int         mbs=baij->mbs;
      Scalar      *a;

      /* processor 0 owns all M rows of the gathered copy; others own none */
      if (!rank) {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      } else {
        ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_SeqBAIJ*) baij->A->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart;
      /* rvals holds the bs consecutive global point-row indices of block row i */
      rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);

      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = (baij->cstart+aj[j])*bs;
          /* insert the bs x bs block one column at a time */
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      /* copy over the B part */
      Aloc = (Mat_SeqBAIJ*) baij->B->data;
      ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = baij->rstart*bs;
      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          /* garray maps local off-diagonal block columns to global */
          col = baij->garray[aj[j]]*bs;
          for (k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
            col++; a += bs;
          }
        }
      }
      PetscFree(rvals);
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      /*
         Everyone has to call to draw the matrix since the graphics waits are
         synchronized across all processors that share the Draw object
      */
      if (!rank || vtype == DRAW_VIEWER) {
        ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1088 
1089 
1090 
1091 #undef __FUNC__
1092 #define __FUNC__ "MatView_MPIBAIJ"
1093 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
1094 {
1095   Mat         mat = (Mat) obj;
1096   int         ierr;
1097   ViewerType  vtype;
1098 
1099   PetscFunctionBegin;
1100   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1101   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1102       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1103     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1104   } else if (vtype == BINARY_FILE_VIEWER) {
1105     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1106   }
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 #undef __FUNC__
1111 #define __FUNC__ "MatDestroy_MPIBAIJ"
1112 int MatDestroy_MPIBAIJ(PetscObject obj)
1113 {
1114   Mat         mat = (Mat) obj;
1115   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1116   int         ierr;
1117 
1118   PetscFunctionBegin;
1119 #if defined(USE_PETSC_LOG)
1120   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
1121 #endif
1122 
1123   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1124   PetscFree(baij->rowners);
1125   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1126   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1127   if (baij->colmap) PetscFree(baij->colmap);
1128   if (baij->garray) PetscFree(baij->garray);
1129   if (baij->lvec)   VecDestroy(baij->lvec);
1130   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1131   if (baij->rowvalues) PetscFree(baij->rowvalues);
1132   if (baij->barray) PetscFree(baij->barray);
1133   if (baij->hd) PetscFree(baij->hd);
1134   PetscFree(baij);
1135   PLogObjectDestroy(mat);
1136   PetscHeaderDestroy(mat);
1137   PetscFunctionReturn(0);
1138 }
1139 
1140 #undef __FUNC__
1141 #define __FUNC__ "MatMult_MPIBAIJ"
1142 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1143 {
1144   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1145   int         ierr, nt;
1146 
1147   PetscFunctionBegin;
1148   VecGetLocalSize_Fast(xx,nt);
1149   if (nt != a->n) {
1150     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1151   }
1152   VecGetLocalSize_Fast(yy,nt);
1153   if (nt != a->m) {
1154     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1155   }
1156   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1157   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1158   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1159   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1160   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 #undef __FUNC__
1165 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1166 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1167 {
1168   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1169   int        ierr;
1170 
1171   PetscFunctionBegin;
1172   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1173   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1174   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1175   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 #undef __FUNC__
1180 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1181 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1182 {
1183   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1184   int         ierr;
1185 
1186   PetscFunctionBegin;
1187   /* do nondiagonal part */
1188   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1189   /* send it on its way */
1190   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1191   /* do local part */
1192   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1193   /* receive remote parts: note this assumes the values are not actually */
1194   /* inserted in yy until the next line, which is true for my implementation*/
1195   /* but is not perhaps always true. */
1196   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1197   PetscFunctionReturn(0);
1198 }
1199 
#undef __FUNC__
#define __FUNC__ "MatMultTransAdd_MPIBAIJ"
/*
   zz = mat' * xx + yy.  Same structure as MatMultTrans_MPIBAIJ with the
   result accumulated into zz.
*/
int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part: lvec = B' * xx */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way (reverse scatter, adding into zz) */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  /* do local part: zz = A' * xx + yy */
  ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for the current   */
  /* scatter implementation but is not perhaps always true.              */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1220 
1221 /*
1222   This only works correctly for square matrices where the subblock A->A is the
1223    diagonal block
1224 */
1225 #undef __FUNC__
1226 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1227 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1228 {
1229   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1230   int         ierr;
1231 
1232   PetscFunctionBegin;
1233   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1234   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1235   PetscFunctionReturn(0);
1236 }
1237 
1238 #undef __FUNC__
1239 #define __FUNC__ "MatScale_MPIBAIJ"
1240 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1241 {
1242   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1243   int         ierr;
1244 
1245   PetscFunctionBegin;
1246   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1247   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNC__
1252 #define __FUNC__ "MatGetSize_MPIBAIJ"
1253 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1254 {
1255   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1256 
1257   PetscFunctionBegin;
1258   if (m) *m = mat->M;
1259   if (n) *n = mat->N;
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 #undef __FUNC__
1264 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1265 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1266 {
1267   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1268 
1269   PetscFunctionBegin;
1270   *m = mat->m; *n = mat->N;
1271   PetscFunctionReturn(0);
1272 }
1273 
1274 #undef __FUNC__
1275 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1276 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1277 {
1278   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1279 
1280   PetscFunctionBegin;
1281   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1282   PetscFunctionReturn(0);
1283 }
1284 
1285 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1286 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1287 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   Returns one global point row of the matrix, merging the diagonal (A)
   and off-diagonal (B) parts into a single list sorted by global column.
   Only rows owned by this processor may be requested; the caller must
   pair this with MatRestoreRow.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be "gotten" at a time */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      /* combined block count of row i over A and B */
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds the values followed by the column indices */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* fetch only what the caller asked for from the sequential blocks */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      /* imark = number of B entries whose global column precedes the A block */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark was already determined during the values pass above */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1369 
1370 #undef __FUNC__
1371 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1372 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1373 {
1374   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1375 
1376   PetscFunctionBegin;
1377   if (baij->getrowactive == PETSC_FALSE) {
1378     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1379   }
1380   baij->getrowactive = PETSC_FALSE;
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 #undef __FUNC__
1385 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1386 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1387 {
1388   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1389 
1390   PetscFunctionBegin;
1391   *bs = baij->bs;
1392   PetscFunctionReturn(0);
1393 }
1394 
1395 #undef __FUNC__
1396 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1397 int MatZeroEntries_MPIBAIJ(Mat A)
1398 {
1399   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1400   int         ierr;
1401 
1402   PetscFunctionBegin;
1403   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1404   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 #undef __FUNC__
1409 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1410 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1411 {
1412   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1413   Mat         A = a->A, B = a->B;
1414   int         ierr;
1415   double      isend[5], irecv[5];
1416 
1417   PetscFunctionBegin;
1418   info->block_size     = (double)a->bs;
1419   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1420   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1421   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1422   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1423   if (flag == MAT_LOCAL) {
1424     info->nz_used      = isend[0];
1425     info->nz_allocated = isend[1];
1426     info->nz_unneeded  = isend[2];
1427     info->memory       = isend[3];
1428     info->mallocs      = isend[4];
1429   } else if (flag == MAT_GLOBAL_MAX) {
1430     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1431     info->nz_used      = irecv[0];
1432     info->nz_allocated = irecv[1];
1433     info->nz_unneeded  = irecv[2];
1434     info->memory       = irecv[3];
1435     info->mallocs      = irecv[4];
1436   } else if (flag == MAT_GLOBAL_SUM) {
1437     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1438     info->nz_used      = irecv[0];
1439     info->nz_allocated = irecv[1];
1440     info->nz_unneeded  = irecv[2];
1441     info->memory       = irecv[3];
1442     info->mallocs      = irecv[4];
1443   }
1444   info->rows_global       = (double)a->M;
1445   info->columns_global    = (double)a->N;
1446   info->rows_local        = (double)a->m;
1447   info->columns_local     = (double)a->N;
1448   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1449   info->fill_ratio_needed = 0;
1450   info->factor_mallocs    = 0;
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 #undef __FUNC__
1455 #define __FUNC__ "MatSetOption_MPIBAIJ"
1456 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1457 {
1458   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1459 
1460   PetscFunctionBegin;
1461   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1462       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1463       op == MAT_COLUMNS_UNSORTED ||
1464       op == MAT_COLUMNS_SORTED ||
1465       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1466       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1467         MatSetOption(a->A,op);
1468         MatSetOption(a->B,op);
1469   } else if (op == MAT_ROW_ORIENTED) {
1470         a->roworiented = 1;
1471         MatSetOption(a->A,op);
1472         MatSetOption(a->B,op);
1473   } else if (op == MAT_ROWS_SORTED ||
1474              op == MAT_ROWS_UNSORTED ||
1475              op == MAT_SYMMETRIC ||
1476              op == MAT_STRUCTURALLY_SYMMETRIC ||
1477              op == MAT_YES_NEW_DIAGONALS)
1478     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1479   else if (op == MAT_COLUMN_ORIENTED) {
1480     a->roworiented = 0;
1481     MatSetOption(a->A,op);
1482     MatSetOption(a->B,op);
1483   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1484     a->donotstash = 1;
1485   } else if (op == MAT_NO_NEW_DIAGONALS) {
1486     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1487   } else {
1488     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1489   }
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 #undef __FUNC__
1494 #define __FUNC__ "MatTranspose_MPIBAIJ("
1495 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1496 {
1497   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1498   Mat_SeqBAIJ *Aloc;
1499   Mat        B;
1500   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1501   int        bs=baij->bs,mbs=baij->mbs;
1502   Scalar     *a;
1503 
1504   PetscFunctionBegin;
1505   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1506   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1507   CHKERRQ(ierr);
1508 
1509   /* copy over the A part */
1510   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1511   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1512   row = baij->rstart;
1513   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1514 
1515   for ( i=0; i<mbs; i++ ) {
1516     rvals[0] = bs*(baij->rstart + i);
1517     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1518     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1519       col = (baij->cstart+aj[j])*bs;
1520       for (k=0; k<bs; k++ ) {
1521         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1522         col++; a += bs;
1523       }
1524     }
1525   }
1526   /* copy over the B part */
1527   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1528   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1529   row = baij->rstart*bs;
1530   for ( i=0; i<mbs; i++ ) {
1531     rvals[0] = bs*(baij->rstart + i);
1532     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1533     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1534       col = baij->garray[aj[j]]*bs;
1535       for (k=0; k<bs; k++ ) {
1536         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1537         col++; a += bs;
1538       }
1539     }
1540   }
1541   PetscFree(rvals);
1542   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1543   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1544 
1545   if (matout != PETSC_NULL) {
1546     *matout = B;
1547   } else {
1548     /* This isn't really an in-place transpose .... but free data structures from baij */
1549     PetscFree(baij->rowners);
1550     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1551     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1552     if (baij->colmap) PetscFree(baij->colmap);
1553     if (baij->garray) PetscFree(baij->garray);
1554     if (baij->lvec) VecDestroy(baij->lvec);
1555     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1556     PetscFree(baij);
1557     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1558     PetscHeaderDestroy(B);
1559   }
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 #undef __FUNC__
1564 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1565 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1566 {
1567   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1568   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1569   int ierr,s1,s2,s3;
1570 
1571   PetscFunctionBegin;
1572   if (ll)  {
1573     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1574     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1575     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1576     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1577     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1578   }
1579   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1580   PetscFunctionReturn(0);
1581 }
1582 
1583 #undef __FUNC__
1584 #define __FUNC__ "MatZeroRows_MPIBAIJ"
/*
     MatZeroRows_MPIBAIJ - Zeros the (possibly off-processor) rows listed
   in the index set 'is'; if 'diag' is non-null, the value *diag is then
   inserted on the diagonal of each zeroed row.

   Communication pattern:
     1) each processor determines the owner of every row index it holds;
     2) two MPI_Allreduce calls tell every processor how many messages it
        will receive (nrecvs) and the maximum message length (nmax);
     3) row indices are shipped to their owners with nonblocking
        sends/receives;
     4) each owner zeros its local rows in both the diagonal (l->A) and
        off-diagonal (l->B) blocks.

   Note: 'is' contains unblocked (point) row indices, while l->rowners[]
   is kept in block units -- hence the scaling by bs below.
*/
int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  /* nprocs[j] = number of my indices owned by proc j; procs[j] = 1 iff
     I send anything to proc j (the two arrays share one allocation) */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* owners[] holds block-row boundaries; scale by bs to compare
       against the point-row index idx */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  /* after the SUM reduce, work[rank] = number of procs sending to me;
     after the MAX reduce, work[rank] = longest message I will receive */
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank];
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  PetscFree(work);

  /* post receives:   */
  /* posted before the sends below so no message can arrive unexpected;
     each incoming message gets its own nmax-sized slot in rvalues */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* bucket the indices by destination; this pass advances starts[],
     so it is rebuilt immediately below before the actual sends */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  /* first point row this processor owns */
  base = owners[rank]*bs;

  /*  wait on receives */
  /* lens[] and source[] share one allocation; slen accumulates the total
     number of row indices received */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  /* convert received global point rows to local point rows (minus base) */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  /* diag is passed as 0 here; the diagonal (if requested) is inserted
     afterwards via MatSetValues so it lands in the correct block */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);

  ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  if (diag) {
    /* reinsert the diagonal entries; MatSetValues with global indices
       followed by a full assembly */
    for ( i = 0; i < slen; i++ ) {
      row = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }
  PetscFree(lrows);

  /* wait on sends */
  /* svalues must stay alive until all Isends complete */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}
1714 extern int MatPrintHelp_SeqBAIJ(Mat);
1715 #undef __FUNC__
1716 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1717 int MatPrintHelp_MPIBAIJ(Mat A)
1718 {
1719   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1720   int         ierr;
1721 
1722   PetscFunctionBegin;
1723   if (!a->rank) {
1724     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1725   }
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 #undef __FUNC__
1730 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1731 int MatSetUnfactored_MPIBAIJ(Mat A)
1732 {
1733   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1734   int         ierr;
1735 
1736   PetscFunctionBegin;
1737   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1738   PetscFunctionReturn(0);
1739 }
1740 
1741 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1742 
1743 /* -------------------------------------------------------------------*/
/*
   Function table for the MPIBAIJ matrix type.  The initializer is
   positional: each slot must line up with the corresponding field of
   struct _MatOps, and a 0 marks an operation this type does not support.
   NOTE(review): slot order must match the _MatOps declaration in the
   matrix implementation header -- verify there before adding/moving
   any entry.
*/
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
  0,0,0,0,
  0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
  0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,MatConvertSameType_MPIBAIJ,0,0,
  0,0,0,MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
  0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1759 
1760 
1761 #undef __FUNC__
1762 #define __FUNC__ "MatCreateMPIBAIJ"
1763 /*@C
1764    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1765    (block compressed row).  For good matrix assembly performance
1766    the user should preallocate the matrix storage by setting the parameters
1767    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1768    performance can be increased by more than a factor of 50.
1769 
1770    Input Parameters:
1771 .  comm - MPI communicator
.  bs   - size of block
1773 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1774            This value should be the same as the local size used in creating the
1775            y vector for the matrix-vector product y = Ax.
1776 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1777            This value should be the same as the local size used in creating the
1778            x vector for the matrix-vector product y = Ax.
1779 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1780 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1781 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1782            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           of the diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.  You must leave room for the diagonal
           entry even if it is zero.
1787 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1788            submatrix (same for all local rows).
.  o_nnz - array containing the number of block nonzeros in the various block rows of the
1790            off-diagonal portion of the local submatrix (possibly different for
1791            each block row) or PETSC_NULL.
1792 
1793    Output Parameter:
1794 .  A - the matrix
1795 
1796    Notes:
1797    The user MUST specify either the local or global matrix dimensions
1798    (possibly both).
1799 
1800    Storage Information:
1801    For a square global matrix we define each processor's diagonal portion
1802    to be its local rows and the corresponding columns (a square submatrix);
1803    each processor's off-diagonal portion encompasses the remainder of the
1804    local matrix (a rectangular submatrix).
1805 
1806    The user can specify preallocated storage for the diagonal part of
1807    the local submatrix with either d_nz or d_nnz (not both).  Set
1808    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1809    memory allocation.  Likewise, specify preallocated storage for the
1810    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1811 
1812    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1813    the figure below we depict these three local rows and all columns (0-11).
1814 
1815 $          0 1 2 3 4 5 6 7 8 9 10 11
1816 $         -------------------
1817 $  row 3  |  o o o d d d o o o o o o
1818 $  row 4  |  o o o d d d o o o o o o
1819 $  row 5  |  o o o d d d o o o o o o
1820 $         -------------------
1821 $
1822 
1823    Thus, any entries in the d locations are stored in the d (diagonal)
1824    submatrix, and any entries in the o locations are stored in the
1825    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1826    stored simply in the MATSEQBAIJ format for compressed row storage.
1827 
1828    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1829    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1830    In general, for PDE problems in which most nonzeros are near the diagonal,
1831    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1832    or you will get TERRIBLE performance; see the users' manual chapter on
1833    matrices.
1834 
1835 .keywords: matrix, block, aij, compressed row, sparse, parallel
1836 
1837 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1838 @*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  /* on one processor the matrix degenerates to a sequential BAIJ matrix;
     the off-diagonal preallocation arguments are irrelevant there */
  MPI_Comm_size(comm,&size);
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  /* zero the context so every field not explicitly set below starts 0/NULL */
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));

  B->destroy    = MatDestroy_MPIBAIJ;
  B->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* validate the local/global size combinations; when both the local and
     the global size are given, the global one is recomputed from the
     local sizes (set to PETSC_DECIDE below) */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* determine any global sizes left as PETSC_DECIDE by summing the local
     sizes over the communicator */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* determine any local sizes left as PETSC_DECIDE by splitting the
     global block count as evenly as possible (the first Mbs%size ranks
     get one extra block row) */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  /* point and block sizes, kept both in the Mat header and the context */
  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships */
  /* rowners/cowners share one allocation; entries are in block units,
     the *_bs variants below are the point-index equivalents */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  b->rstart_bs = b->rstart * bs;
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* create the two local sequential blocks: A is the m x n diagonal part,
     B is m x N and holds the off-diagonal part (compacted later by
     MatSetUpMultiply) */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  *A = B;
  PetscFunctionReturn(0);
}
1981 
1982 #undef __FUNC__
1983 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1984 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1985 {
1986   Mat         mat;
1987   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1988   int         ierr, len=0, flg;
1989 
1990   PetscFunctionBegin;
1991   *newmat       = 0;
1992   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1993   PLogObjectCreate(mat);
1994   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1995   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1996   mat->destroy    = MatDestroy_MPIBAIJ;
1997   mat->view       = MatView_MPIBAIJ;
1998   mat->factor     = matin->factor;
1999   mat->assembled  = PETSC_TRUE;
2000 
2001   a->m = mat->m   = oldmat->m;
2002   a->n = mat->n   = oldmat->n;
2003   a->M = mat->M   = oldmat->M;
2004   a->N = mat->N   = oldmat->N;
2005 
2006   a->bs  = oldmat->bs;
2007   a->bs2 = oldmat->bs2;
2008   a->mbs = oldmat->mbs;
2009   a->nbs = oldmat->nbs;
2010   a->Mbs = oldmat->Mbs;
2011   a->Nbs = oldmat->Nbs;
2012 
2013   a->rstart       = oldmat->rstart;
2014   a->rend         = oldmat->rend;
2015   a->cstart       = oldmat->cstart;
2016   a->cend         = oldmat->cend;
2017   a->size         = oldmat->size;
2018   a->rank         = oldmat->rank;
2019   mat->insertmode = NOT_SET_VALUES;
2020   a->rowvalues    = 0;
2021   a->getrowactive = PETSC_FALSE;
2022   a->barray       = 0;
2023 
2024   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2025   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2026   a->cowners = a->rowners + a->size + 2;
2027   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2028   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2029   if (oldmat->colmap) {
2030     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2031     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2032     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2033   } else a->colmap = 0;
2034   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2035     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2036     PLogObjectMemory(mat,len*sizeof(int));
2037     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2038   } else a->garray = 0;
2039 
2040   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2041   PLogObjectParent(mat,a->lvec);
2042   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2043   PLogObjectParent(mat,a->Mvctx);
2044   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
2045   PLogObjectParent(mat,a->A);
2046   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
2047   PLogObjectParent(mat,a->B);
2048   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2049   if (flg) {
2050     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2051   }
2052   *newmat = mat;
2053   PetscFunctionReturn(0);
2054 }
2055 
2056 #include "sys.h"
2057 
2058 #undef __FUNC__
2059 #define __FUNC__ "MatLoad_MPIBAIJ"
/*
   MatLoad_MPIBAIJ - Loads a square matrix from a binary viewer into a
   parallel MPIBAIJ matrix.

   Processor 0 reads the header, the row lengths, the column indices and
   the numerical values from the file and ships each processor its part
   with MPI point-to-point messages.  If the global size is not divisible
   by the block size (-matload_block_size), extra rows/columns containing
   only a unit diagonal entry are appended to pad the matrix.
*/
int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;

  /* only processor 0 touches the file; header[1..3] is broadcast below */
  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);
  if (extra_rows == bs) extra_rows = 0;
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows */
  /* block rows are split as evenly as possible; rowners[] is in block
     units and browners[] is the point-row equivalent */
  mbs = Mbs/size + ((Mbs % size) > rank);
  m   = mbs * bs;
  rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
  browners = rowners + size + 1;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
  for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  /* padding rows have length 1 (their unit diagonal entry) */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices  */
    /* the extra (padding) entries are not on disk: on a single processor
       they are synthesized here as column indices M..M+extra_rows-1 */
    nz = procsnz[0];
    ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    /* the padding entries belong to the last processor and are appended
       to its message */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries */
  /* mask[] marks block columns already seen in the current block row;
     masked1/masked2 record which entries to clear so the O(Mbs) mask
     need not be rezeroed every block row */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,3*Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        /* tmp is the block column; rstart/rend are block-row bounds,
           valid for the column test because the matrix is square */
        tmp = mycols[nzcount++]/bs;
        if (!mask[tmp]) {
          mask[tmp] = 1;
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
         CHKERRQ(ierr);
  A = *newmat;
  MatSetOption(A,MAT_COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
    /* read in my part of the matrix numerical values  */
    /* padding rows get the value 1.0 on their diagonal */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
    /* read in other processors (except the last one) and ship out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc */
    /* note: procsnz[i] relies on i == size-1 left over from the loop
       above; the inner loop reuses i only after nz has been read */
    if ( size != 1 ){
      nz   = procsnz[i] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(procsnz);
  } else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2286 
2287 
2288