xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision fd16b1779ca341c090fa91c269e2c2dd5dd0b73d)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.177 1999/09/15 16:33:20 balay Exp bsmith $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"   /*I  "mat.h"  I*/
6 #include "src/vec/vecimpl.h"
7 
8 extern int MatSetUpMultiply_MPIBAIJ(Mat);
9 extern int DisAssemble_MPIBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
12 extern int MatGetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *);
13 extern int MatSetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *,InsertMode);
14 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
15 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
16 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
17 extern int MatPrintHelp_SeqBAIJ(Mat);
18 
19 EXTERN_C_BEGIN
20 #undef __FUNC__
21 #define __FUNC__ "MatStoreValues_MPIBAIJ"
22 int MatStoreValues_MPIBAIJ(Mat mat)
23 {
24   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *)mat->data;
25   int         ierr;
26 
27   PetscFunctionBegin;
28   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
29   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
30   PetscFunctionReturn(0);
31 }
32 EXTERN_C_END
33 
34 EXTERN_C_BEGIN
35 #undef __FUNC__
36 #define __FUNC__ "MatRetrieveValues_MPIBAIJ"
37 int MatRetrieveValues_MPIBAIJ(Mat mat)
38 {
39   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *)mat->data;
40   int         ierr;
41 
42   PetscFunctionBegin;
43   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
44   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
45   PetscFunctionReturn(0);
46 }
47 EXTERN_C_END
48 
49 /*
50      Local utility routine that creates a mapping from the global column
51    number to the local number in the off-diagonal part of the local
52    storage of the matrix.  This is done in a non-scalable way since the
53    length of colmap equals the global number of block columns.
54 */
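/*
   A minimal lookup sketch (illustrative only, mirroring the use in
   MatSetValues_MPIBAIJ() below): for a global point column gcol,

     lcol = baij->colmap[gcol/bs] - 1 + gcol%bs;

   gives the local point column in the off-diagonal part B.  The map stores
   the local block offset plus one, so an entry of 0 marks a block column
   that is not present in B.  (With PETSC_USE_CTABLE the same arithmetic is
   done via TableFind().)
*/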
55 #undef __FUNC__
56 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
57 static int CreateColmap_MPIBAIJ_Private(Mat mat)
58 {
59   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
60   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
61   int         nbs = B->nbs,i,bs=B->bs,ierr;
62 
63   PetscFunctionBegin;
64 #if defined (PETSC_USE_CTABLE)
65   ierr = TableCreate(baij->nbs/5,&baij->colmap);CHKERRQ(ierr);
66   for ( i=0; i<nbs; i++ ){
67     ierr = TableAdd(baij->colmap,baij->garray[i]+1,i*bs+1);CHKERRQ(ierr);
68   }
69 #else
70   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
71   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
72   ierr = PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));CHKERRQ(ierr);
73   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
74 #endif
75   PetscFunctionReturn(0);
76 }
77 
78 #define CHUNKSIZE  10
79 
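/*
   The two macros below insert a single point value at (row,col) of the local
   diagonal block A or the off-diagonal block B, mirroring what a call to
   MatSetValues_SeqBAIJ() would do but inlined for speed.  They rely on the
   temporaries declared in MatSetValues_MPIBAIJ() (rp, ap, bap, nrow, rmax,
   low, high, t, _i, ii, N, brow, bcol, ridx, cidx, bs, bs2, ierr).  If a row
   runs out of preallocated space, the storage is enlarged by CHUNKSIZE block
   entries and the old data is copied over.
*/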
80 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
81 { \
82  \
83     brow = row/bs;  \
84     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
85     rmax = aimax[brow]; nrow = ailen[brow]; \
86       bcol = col/bs; \
87       ridx = row % bs; cidx = col % bs; \
88       low = 0; high = nrow; \
89       while (high-low > 3) { \
90         t = (low+high)/2; \
91         if (rp[t] > bcol) high = t; \
92         else              low  = t; \
93       } \
94       for ( _i=low; _i<high; _i++ ) { \
95         if (rp[_i] > bcol) break; \
96         if (rp[_i] == bcol) { \
97           bap  = ap +  bs2*_i + bs*cidx + ridx; \
98           if (addv == ADD_VALUES) *bap += value;  \
99           else                    *bap  = value;  \
100           goto a_noinsert; \
101         } \
102       } \
103       if (a->nonew == 1) goto a_noinsert; \
104       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
105       if (nrow >= rmax) { \
106         /* there is no extra room in row, therefore enlarge */ \
107         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
108         Scalar *new_a; \
109  \
110         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
111  \
112         /* malloc new storage space */ \
113         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
114         new_a   = (Scalar *) PetscMalloc( len );CHKPTRQ(new_a); \
115         new_j   = (int *) (new_a + bs2*new_nz); \
116         new_i   = new_j + new_nz; \
117  \
118         /* copy over old data into new slots */ \
119         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
120         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
121         ierr = PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
122         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
123         ierr = PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
124                           len*sizeof(int));CHKERRQ(ierr); \
125         ierr = PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));CHKERRQ(ierr); \
126         ierr = PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));CHKERRQ(ierr); \
127         ierr = PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
128                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));CHKERRQ(ierr);  \
129         /* free up old matrix storage */ \
130         ierr = PetscFree(a->a);CHKERRQ(ierr);  \
131         if (!a->singlemalloc) { \
132           ierr = PetscFree(a->i);CHKERRQ(ierr); \
133           ierr = PetscFree(a->j);CHKERRQ(ierr);\
134         } \
135         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
136         a->singlemalloc = 1; \
137  \
138         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
139         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
140         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
141         a->maxnz += bs2*CHUNKSIZE; \
142         a->reallocs++; \
143         a->nz++; \
144       } \
145       N = nrow++ - 1;  \
146       /* shift up all the later entries in this row */ \
147       for ( ii=N; ii>=_i; ii-- ) { \
148         rp[ii+1] = rp[ii]; \
149         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));CHKERRQ(ierr); \
150       } \
151       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));CHKERRQ(ierr); }  \
152       rp[_i]                      = bcol;  \
153       ap[bs2*_i + bs*cidx + ridx] = value;  \
154       a_noinsert:; \
155     ailen[brow] = nrow; \
156 }
157 
158 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
159 { \
160  \
161     brow = row/bs;  \
162     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
163     rmax = bimax[brow]; nrow = bilen[brow]; \
164       bcol = col/bs; \
165       ridx = row % bs; cidx = col % bs; \
166       low = 0; high = nrow; \
167       while (high-low > 3) { \
168         t = (low+high)/2; \
169         if (rp[t] > bcol) high = t; \
170         else              low  = t; \
171       } \
172       for ( _i=low; _i<high; _i++ ) { \
173         if (rp[_i] > bcol) break; \
174         if (rp[_i] == bcol) { \
175           bap  = ap +  bs2*_i + bs*cidx + ridx; \
176           if (addv == ADD_VALUES) *bap += value;  \
177           else                    *bap  = value;  \
178           goto b_noinsert; \
179         } \
180       } \
181       if (b->nonew == 1) goto b_noinsert; \
182       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
183       if (nrow >= rmax) { \
184         /* there is no extra room in row, therefore enlarge */ \
185         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
186         Scalar *new_a; \
187  \
188         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
189  \
190         /* malloc new storage space */ \
191         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
192         new_a   = (Scalar *) PetscMalloc( len );CHKPTRQ(new_a); \
193         new_j   = (int *) (new_a + bs2*new_nz); \
194         new_i   = new_j + new_nz; \
195  \
196         /* copy over old data into new slots */ \
197         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
198         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
199         ierr = PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
200         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
201         ierr = PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
202                            len*sizeof(int));CHKERRQ(ierr); \
203         ierr = PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar));CHKERRQ(ierr); \
204         ierr = PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));CHKERRQ(ierr); \
205         ierr = PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
206                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));CHKERRQ(ierr);  \
207         /* free up old matrix storage */ \
208         ierr = PetscFree(b->a);CHKERRQ(ierr);  \
209         if (!b->singlemalloc) { \
210           ierr = PetscFree(b->i);CHKERRQ(ierr); \
211           ierr = PetscFree(b->j);CHKERRQ(ierr); \
212         } \
213         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
214         b->singlemalloc = 1; \
215  \
216         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
217         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
218         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
219         b->maxnz += bs2*CHUNKSIZE; \
220         b->reallocs++; \
221         b->nz++; \
222       } \
223       N = nrow++ - 1;  \
224       /* shift up all the later entries in this row */ \
225       for ( ii=N; ii>=_i; ii-- ) { \
226         rp[ii+1] = rp[ii]; \
227         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));CHKERRQ(ierr); \
228       } \
229       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));CHKERRQ(ierr);}  \
230       rp[_i]                      = bcol;  \
231       ap[bs2*_i + bs*cidx + ridx] = value;  \
232       b_noinsert:; \
233     bilen[brow] = nrow; \
234 }
235 
236 #undef __FUNC__
237 #define __FUNC__ "MatSetValues_MPIBAIJ"
238 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
239 {
240   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
241   Scalar      value;
242   int         ierr,i,j,row,col;
243   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
244   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
245   int         cend_orig=baij->cend_bs,bs=baij->bs;
246 
247   /* Some variables required by the MatSetValues_SeqBAIJ_{A,B}_Private() macros */
248   Mat         A = baij->A;
249   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
250   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
251   Scalar      *aa=a->a;
252 
253   Mat         B = baij->B;
254   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
255   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
256   Scalar      *ba=b->a;
257 
258   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
259   int         low,high,t,ridx,cidx,bs2=a->bs2;
260   Scalar      *ap,*bap;
261 
262   PetscFunctionBegin;
263   for ( i=0; i<m; i++ ) {
264     if (im[i] < 0) continue;
265 #if defined(PETSC_USE_BOPT_g)
266     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
267 #endif
268     if (im[i] >= rstart_orig && im[i] < rend_orig) {
269       row = im[i] - rstart_orig;
270       for ( j=0; j<n; j++ ) {
271         if (in[j] >= cstart_orig && in[j] < cend_orig){
272           col = in[j] - cstart_orig;
273           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
274           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
275           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
276         } else if (in[j] < 0) continue;
277 #if defined(PETSC_USE_BOPT_g)
278         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
279 #endif
280         else {
281           if (mat->was_assembled) {
282             if (!baij->colmap) {
283               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
284             }
285 #if defined (PETSC_USE_CTABLE)
286             ierr = TableFind(baij->colmap, in[j]/bs + 1,&col);CHKERRQ(ierr);
287             col  = col - 1 + in[j]%bs;
288 #else
289             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
290 #endif
291             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
292               ierr = DisAssemble_MPIBAIJ(mat);CHKERRQ(ierr);
293               col =  in[j];
294               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
295               B = baij->B;
296               b = (Mat_SeqBAIJ *) (B)->data;
297               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
298               ba=b->a;
299             }
300           } else col = in[j];
301           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
302           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
303           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
304         }
305       }
306     } else {
307       if ( !baij->donotstash) {
308         if (roworiented) {
309           ierr = MatStashValuesRow_Private(&mat->stash,im[i],n,in,v+i*n);CHKERRQ(ierr);
310         } else {
311           ierr = MatStashValuesCol_Private(&mat->stash,im[i],n,in,v+i,m);CHKERRQ(ierr);
312         }
313       }
314     }
315   }
316   PetscFunctionReturn(0);
317 }
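/*
   A minimal usage sketch (assuming mat was created with MatCreateMPIBAIJ()
   and rows/columns are global point indices):

     int    rows[1] = {0},cols[2] = {0,1};
     Scalar vals[2] = {1.0,2.0};
     ierr = MatSetValues(mat,1,rows,2,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

   Entries destined for other processors are stashed above and communicated
   during assembly.
*/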
318 
319 #undef __FUNC__
320 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
321 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
322 {
323   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
324   Scalar      *value,*barray=baij->barray;
325   int         ierr,i,j,ii,jj,row,col;
326   int         roworiented = baij->roworiented,rstart=baij->rstart ;
327   int         rend=baij->rend,cstart=baij->cstart,stepval;
328   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
329 
330   PetscFunctionBegin;
331   if(!barray) {
332     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar));CHKPTRQ(barray);
333   }
334 
335   if (roworiented) {
336     stepval = (n-1)*bs;
337   } else {
338     stepval = (m-1)*bs;
339   }
340   for ( i=0; i<m; i++ ) {
341     if (im[i] < 0) continue;
342 #if defined(PETSC_USE_BOPT_g)
343     if (im[i] >= baij->Mbs) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large, row %d max %d",im[i],baij->Mbs);
344 #endif
345     if (im[i] >= rstart && im[i] < rend) {
346       row = im[i] - rstart;
347       for ( j=0; j<n; j++ ) {
348         /* If n == 1 (row-oriented) or m == 1 (column-oriented) the input block is contiguous, so no copy is required */
349         if ((roworiented) && (n == 1)) {
350           barray = v + i*bs2;
351         } else if((!roworiented) && (m == 1)) {
352           barray = v + j*bs2;
353         } else { /* Here a copy is required */
354           if (roworiented) {
355             value = v + i*(stepval+bs)*bs + j*bs;
356           } else {
357             value = v + j*(stepval+bs)*bs + i*bs;
358           }
359           for ( ii=0; ii<bs; ii++,value+=stepval ) {
360             for (jj=0; jj<bs; jj++ ) {
361               *barray++  = *value++;
362             }
363           }
364           barray -=bs2;
365         }
366 
367         if (in[j] >= cstart && in[j] < cend){
368           col  = in[j] - cstart;
369           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
370         }
371         else if (in[j] < 0) continue;
372 #if defined(PETSC_USE_BOPT_g)
373         else if (in[j] >= baij->Nbs) {SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large, col %d max %d",in[j],baij->Nbs);}
374 #endif
375         else {
376           if (mat->was_assembled) {
377             if (!baij->colmap) {
378               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
379             }
380 
381 #if defined(PETSC_USE_BOPT_g)
382 #if defined (PETSC_USE_CTABLE)
383             { int data;
384             ierr = TableFind(baij->colmap,in[j]+1,&data);CHKERRQ(ierr);
385             if ((data - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
387             }
388 #else
389             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
390 #endif
391 #endif
392 #if defined (PETSC_USE_CTABLE)
393             ierr = TableFind(baij->colmap,in[j]+1,&col);CHKERRQ(ierr);
394             col  = (col - 1)/bs;
395 #else
396             col = (baij->colmap[in[j]] - 1)/bs;
397 #endif
398             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
399               ierr = DisAssemble_MPIBAIJ(mat);CHKERRQ(ierr);
400               col =  in[j];
401             }
402           }
403           else col = in[j];
404           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
405         }
406       }
407     } else {
408       if (!baij->donotstash) {
409         if (roworiented) {
410           ierr = MatStashValuesRowBlocked_Private(&mat->bstash,im[i],n,in,v,m,n,i);CHKERRQ(ierr);
411         } else {
412           ierr = MatStashValuesColBlocked_Private(&mat->bstash,im[i],n,in,v,m,n,i);CHKERRQ(ierr);
413         }
414       }
415     }
416   }
417   PetscFunctionReturn(0);
418 }
419 #define HASH_KEY 0.6180339887
420 /* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
421 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
422 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
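/*
   HASH() is multiplicative hashing with the golden-ratio constant: the
   fractional part of key*HASH_KEY, scaled by the table size, gives the
   starting slot.  A small worked example (illustrative only): with size = 10
   and key = 7, tmp = 7*0.6180339887 = 4.3262..., so HASH(10,7,tmp) =
   (int)(10*0.3262...) = 3.  Collisions are resolved by linear probing with
   wrap-around in the routines below.
*/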
423 #undef __FUNC__
424 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
425 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
426 {
427   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
428   int         ierr,i,j,row,col;
429   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
430   int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
431   int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
432   double      tmp;
433   Scalar      ** HD = baij->hd,value;
434 #if defined(PETSC_USE_BOPT_g)
435   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
436 #endif
437 
438   PetscFunctionBegin;
439 
440   for ( i=0; i<m; i++ ) {
441 #if defined(PETSC_USE_BOPT_g)
442     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
443     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
444 #endif
445     row = im[i];
446     if (row >= rstart_orig && row < rend_orig) {
447       for ( j=0; j<n; j++ ) {
448         col = in[j];
449         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
450         /* Look up the entry in the hash table */
451         key = (row/bs)*Nbs+(col/bs)+1;
452         h1  = HASH(size,key,tmp);
453 
454 
455         idx = h1;
456 #if defined(PETSC_USE_BOPT_g)
457         insert_ct++;
458         total_ct++;
459         if (HT[idx] != key) {
460           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
461           if (idx == size) {
462             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
463             if (idx == h1) {
464               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
465             }
466           }
467         }
468 #else
469         if (HT[idx] != key) {
470           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
471           if (idx == size) {
472             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
473             if (idx == h1) {
474               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
475             }
476           }
477         }
478 #endif
479         /* A HASH table entry is found, so insert the values at the correct address */
480         if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
481         else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
482       }
483     } else {
484       if (!baij->donotstash) {
485         if (roworiented) {
486           ierr = MatStashValuesRow_Private(&mat->stash,im[i],n,in,v+i*n);CHKERRQ(ierr);
487         } else {
488           ierr = MatStashValuesCol_Private(&mat->stash,im[i],n,in,v+i,m);CHKERRQ(ierr);
489         }
490       }
491     }
492   }
493 #if defined(PETSC_USE_BOPT_g)
494   baij->ht_total_ct = total_ct;
495   baij->ht_insert_ct = insert_ct;
496 #endif
497   PetscFunctionReturn(0);
498 }
499 
500 #undef __FUNC__
501 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
502 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
503 {
504   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
505   int         ierr,i,j,ii,jj,row,col;
506   int         roworiented = baij->roworiented,rstart=baij->rstart ;
507   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
508   int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
509   double      tmp;
510   Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
511 #if defined(PETSC_USE_BOPT_g)
512   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
513 #endif
514 
515   PetscFunctionBegin;
516 
517   if (roworiented) {
518     stepval = (n-1)*bs;
519   } else {
520     stepval = (m-1)*bs;
521   }
522   for ( i=0; i<m; i++ ) {
523 #if defined(PETSC_USE_BOPT_g)
524     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
525     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
526 #endif
527     row   = im[i];
528     v_t   = v + i*bs2;
529     if (row >= rstart && row < rend) {
530       for ( j=0; j<n; j++ ) {
531         col = in[j];
532 
533         /* Look up the entry in the hash table */
534         key = row*Nbs+col+1;
535         h1  = HASH(size,key,tmp);
536 
537         idx = h1;
538 #if defined(PETSC_USE_BOPT_g)
539         total_ct++;
540         insert_ct++;
541         if (HT[idx] != key) {
542           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
543           if (idx == size) {
544             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
545             if (idx == h1) {
546               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
547             }
548           }
549         }
550 #else
551         if (HT[idx] != key) {
552           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
553           if (idx == size) {
554             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
555             if (idx == h1) {
556               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
557             }
558           }
559         }
560 #endif
561         baij_a = HD[idx];
562         if (roworiented) {
563           /*value = v + i*(stepval+bs)*bs + j*bs;*/
564           /* value = v + (i*(stepval+bs)+j)*bs; */
565           value = v_t;
566           v_t  += bs;
567           if (addv == ADD_VALUES) {
568             for ( ii=0; ii<bs; ii++,value+=stepval) {
569               for ( jj=ii; jj<bs2; jj+=bs ) {
570                 baij_a[jj]  += *value++;
571               }
572             }
573           } else {
574             for ( ii=0; ii<bs; ii++,value+=stepval) {
575               for ( jj=ii; jj<bs2; jj+=bs ) {
576                 baij_a[jj]  = *value++;
577               }
578             }
579           }
580         } else {
581           value = v + j*(stepval+bs)*bs + i*bs;
582           if (addv == ADD_VALUES) {
583             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
584               for ( jj=0; jj<bs; jj++ ) {
585                 baij_a[jj]  += *value++;
586               }
587             }
588           } else {
589             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
590               for ( jj=0; jj<bs; jj++ ) {
591                 baij_a[jj]  = *value++;
592               }
593             }
594           }
595         }
596       }
597     } else {
598       if (!baij->donotstash) {
599         if (roworiented) {
600           ierr = MatStashValuesRowBlocked_Private(&mat->bstash,im[i],n,in,v,m,n,i);CHKERRQ(ierr);
601         } else {
602           ierr = MatStashValuesColBlocked_Private(&mat->bstash,im[i],n,in,v,m,n,i);CHKERRQ(ierr);
603         }
604       }
605     }
606   }
607 #if defined(PETSC_USE_BOPT_g)
608   baij->ht_total_ct = total_ct;
609   baij->ht_insert_ct = insert_ct;
610 #endif
611   PetscFunctionReturn(0);
612 }
613 
614 #undef __FUNC__
615 #define __FUNC__ "MatGetValues_MPIBAIJ"
616 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
617 {
618   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
619   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
620   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col,data;
621 
622   PetscFunctionBegin;
623   for ( i=0; i<m; i++ ) {
624     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
625     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
626     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
627       row = idxm[i] - bsrstart;
628       for ( j=0; j<n; j++ ) {
629         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
630         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
631         if (idxn[j] >= bscstart && idxn[j] < bscend){
632           col = idxn[j] - bscstart;
633           ierr = MatGetValues_SeqBAIJ(baij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
634         } else {
635           if (!baij->colmap) {
636             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
637           }
638 #if defined (PETSC_USE_CTABLE)
639           ierr = TableFind(baij->colmap,idxn[j]/bs+1,&data);CHKERRQ(ierr);
640           data --;
641 #else
642           data = baij->colmap[idxn[j]/bs]-1;
643 #endif
644           if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
645           else {
646             col  = data + idxn[j]%bs;
647             ierr = MatGetValues_SeqBAIJ(baij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
648           }
649         }
650       }
651     } else {
652       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
658 #undef __FUNC__
659 #define __FUNC__ "MatNorm_MPIBAIJ"
660 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
661 {
662   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
663   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
664   int        ierr, i,bs2=baij->bs2;
665   double     sum = 0.0;
666   Scalar     *v;
667 
668   PetscFunctionBegin;
669   if (baij->size == 1) {
670     ierr =  MatNorm(baij->A,type,norm);CHKERRQ(ierr);
671   } else {
672     if (type == NORM_FROBENIUS) {
673       v = amat->a;
674       for (i=0; i<amat->nz*bs2; i++ ) {
675 #if defined(PETSC_USE_COMPLEX)
676         sum += PetscReal(PetscConj(*v)*(*v)); v++;
677 #else
678         sum += (*v)*(*v); v++;
679 #endif
680       }
681       v = bmat->a;
682       for (i=0; i<bmat->nz*bs2; i++ ) {
683 #if defined(PETSC_USE_COMPLEX)
684         sum += PetscReal(PetscConj(*v)*(*v)); v++;
685 #else
686         sum += (*v)*(*v); v++;
687 #endif
688       }
689       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
690       *norm = sqrt(*norm);
691     } else {
692       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
693     }
694   }
695   PetscFunctionReturn(0);
696 }
697 
698 
699 /*
700   Creates and fills the hash table.  The table is created only once.
701   If new entries need to be added to the matrix, then the hash table
702   has to be destroyed and recreated.
705 */
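/*
   A sketch of the table layout (illustrative): an occupied slot idx holds

     HT[idx] = (block row)*Nbs + (block col) + 1    (0 marks an empty slot)
     HD[idx] = address of the bs2 values of that block in a->a or b->a

   so that MatSetValues_MPIBAIJ_HT() can write straight into the final
   storage without searching the row.
*/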
706 #undef __FUNC__
707 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
708 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
709 {
710   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
711   Mat         A = baij->A, B=baij->B;
712   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
713   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
714   int         size,bs2=baij->bs2,rstart=baij->rstart,ierr;
715   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
716   int         *HT,key;
717   Scalar      **HD;
718   double      tmp;
719 #if defined(PETSC_USE_BOPT_g)
720   int         ct=0,max=0;
721 #endif
722 
723   PetscFunctionBegin;
724   baij->ht_size=(int)(factor*nz);
725   size = baij->ht_size;
726 
727   if (baij->ht) {
728     PetscFunctionReturn(0);
729   }
730 
731   /* Allocate Memory for Hash Table */
732   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1);CHKPTRQ(baij->hd);
733   baij->ht = (int*)(baij->hd + size);
734   HD = baij->hd;
735   HT = baij->ht;
736 
737 
738   ierr = PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));CHKERRQ(ierr);
739 
740 
741   /* Loop Over A */
742   for ( i=0; i<a->mbs; i++ ) {
743     for ( j=ai[i]; j<ai[i+1]; j++ ) {
744       row = i+rstart;
745       col = aj[j]+cstart;
746 
747       key = row*Nbs + col + 1;
748       h1  = HASH(size,key,tmp);
749       for ( k=0; k<size; k++ ){
750         if (HT[(h1+k)%size] == 0.0) {
751           HT[(h1+k)%size] = key;
752           HD[(h1+k)%size] = a->a + j*bs2;
753           break;
754 #if defined(PETSC_USE_BOPT_g)
755         } else {
756           ct++;
757 #endif
758         }
759       }
760 #if defined(PETSC_USE_BOPT_g)
761       if (k> max) max = k;
762 #endif
763     }
764   }
765   /* Loop Over B */
766   for ( i=0; i<b->mbs; i++ ) {
767     for ( j=bi[i]; j<bi[i+1]; j++ ) {
768       row = i+rstart;
769       col = garray[bj[j]];
770       key = row*Nbs + col + 1;
771       h1  = HASH(size,key,tmp);
772       for ( k=0; k<size; k++ ){
773         if (HT[(h1+k)%size] == 0.0) {
774           HT[(h1+k)%size] = key;
775           HD[(h1+k)%size] = b->a + j*bs2;
776           break;
777 #if defined(PETSC_USE_BOPT_g)
778         } else {
779           ct++;
780 #endif
781         }
782       }
783 #if defined(PETSC_USE_BOPT_g)
784       if (k> max) max = k;
785 #endif
786     }
787   }
788 
789   /* Print Summary */
790 #if defined(PETSC_USE_BOPT_g)
791   for ( i=0,j=0; i<size; i++)
792     if (HT[i]) {j++;}
793   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
794            (j== 0)? 0.0:((double)(ct+j))/j,max);
795 #endif
796   PetscFunctionReturn(0);
797 }
798 
799 #undef __FUNC__
800 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
801 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
802 {
803   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
804   int         ierr,nstash,reallocs;
805   InsertMode  addv;
806 
807   PetscFunctionBegin;
808   if (baij->donotstash) {
809     PetscFunctionReturn(0);
810   }
811 
812   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
813   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
814   if (addv == (ADD_VALUES|INSERT_VALUES)) {
815     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
816   }
817   mat->insertmode = addv; /* in case this processor had no cache */
818 
819   ierr = MatStashScatterBegin_Private(&mat->stash,baij->rowners_bs);CHKERRQ(ierr);
820   ierr = MatStashScatterBegin_Private(&mat->bstash,baij->rowners);CHKERRQ(ierr);
821   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
822   PLogInfo(0,"MatAssemblyBegin_MPIBAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
823   ierr = MatStashGetInfo_Private(&mat->bstash,&nstash,&reallocs);CHKERRQ(ierr);
824   PLogInfo(0,"MatAssemblyBegin_MPIBAIJ:Block-Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
825   PetscFunctionReturn(0);
826 }
827 
828 #undef __FUNC__
829 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
830 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
831 {
832   Mat_MPIBAIJ *baij=(Mat_MPIBAIJ *) mat->data;
833   Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)baij->A->data,*b=(Mat_SeqBAIJ*)baij->B->data;
834   int         i,j,rstart,ncols,n,ierr,flg,bs2=baij->bs2;
835   int         *row,*col,other_disassembled,r1,r2,r3;
836   Scalar      *val;
837   InsertMode  addv = mat->insertmode;
838 
839   PetscFunctionBegin;
840   if (!baij->donotstash) {
841     while (1) {
842       ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
843       if (!flg) break;
844 
845       for ( i=0; i<n; ) {
846         /* Now identify the consecutive vals belonging to the same row */
847         for ( j=i,rstart=row[j]; j<n; j++ ) { if (row[j] != rstart) break; }
848         if (j < n) ncols = j-i;
849         else       ncols = n-i;
850         /* Now assemble all these values with a single function call */
851         ierr = MatSetValues_MPIBAIJ(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
852         i = j;
853       }
854     }
855     ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
856     /* Now process the block-stash. Since the values are stashed column-oriented,
857        temporarily clear the roworiented flags, and restore the original
858        flags after MatSetValuesBlocked() */
859     r1 = baij->roworiented;
860     r2 = a->roworiented;
861     r3 = b->roworiented;
862     baij->roworiented = 0;
863     a->roworiented    = 0;
864     b->roworiented    = 0;
865     while (1) {
866       ierr = MatStashScatterGetMesg_Private(&mat->bstash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
867       if (!flg) break;
868 
869       for ( i=0; i<n; ) {
870         /* Now identify the consecutive vals belonging to the same row */
871         for ( j=i,rstart=row[j]; j<n; j++ ) { if (row[j] != rstart) break; }
872         if (j < n) ncols = j-i;
873         else       ncols = n-i;
874         ierr = MatSetValuesBlocked_MPIBAIJ(mat,1,row+i,ncols,col+i,val+i*bs2,addv);CHKERRQ(ierr);
875         i = j;
876       }
877     }
878     ierr = MatStashScatterEnd_Private(&mat->bstash);CHKERRQ(ierr);
879     baij->roworiented = r1;
880     a->roworiented    = r2;
881     b->roworiented    = r3;
882   }
883 
884   ierr = MatAssemblyBegin(baij->A,mode);CHKERRQ(ierr);
885   ierr = MatAssemblyEnd(baij->A,mode);CHKERRQ(ierr);
886 
887   /* determine if any processor has disassembled; if so, we must
888      also disassemble ourselves, in order that we may reassemble. */
889   /*
890      if nonzero structure of submatrix B cannot change then we know that
891      no processor disassembled, and thus we can skip this check
892   */
893   if (!((Mat_SeqBAIJ*) baij->B->data)->nonew)  {
894     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
895     if (mat->was_assembled && !other_disassembled) {
896       ierr = DisAssemble_MPIBAIJ(mat);CHKERRQ(ierr);
897     }
898   }
899 
900   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
901     ierr = MatSetUpMultiply_MPIBAIJ(mat);CHKERRQ(ierr);
902   }
903   ierr = MatAssemblyBegin(baij->B,mode);CHKERRQ(ierr);
904   ierr = MatAssemblyEnd(baij->B,mode);CHKERRQ(ierr);
905 
906 #if defined(PETSC_USE_BOPT_g)
907   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
908     PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
909              ((double)baij->ht_total_ct)/baij->ht_insert_ct);
910     baij->ht_total_ct  = 0;
911     baij->ht_insert_ct = 0;
912   }
913 #endif
914   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
915     ierr = MatCreateHashTable_MPIBAIJ_Private(mat,baij->ht_fact);CHKERRQ(ierr);
916     mat->ops->setvalues        = MatSetValues_MPIBAIJ_HT;
917     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
918   }
919 
920   if (baij->rowvalues) {
921     ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);
922     baij->rowvalues = 0;
923   }
924   PetscFunctionReturn(0);
925 }
926 
927 /*
928 #undef __FUNC__
929 #define __FUNC__ "MatView_MPIBAIJ_Binary"
930 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
931 {
932   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
933   int          ierr;
934 
935   PetscFunctionBegin;
936   if (baij->size == 1) {
937     ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
938   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
939   PetscFunctionReturn(0);
940 }
941 */
942 
943 #undef __FUNC__
944 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworSocket"
945 static int MatView_MPIBAIJ_ASCIIorDraworSocket(Mat mat,Viewer viewer)
946 {
947   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
948   int          ierr, format,bs = baij->bs, size = baij->size, rank = baij->rank;
949   FILE         *fd;
950   ViewerType   vtype;
951 
952   PetscFunctionBegin;
953   ierr = ViewerGetType(viewer,&vtype);CHKERRQ(ierr);
954   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
955     ierr = ViewerGetFormat(viewer,&format);CHKERRQ(ierr);
956     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
957       MatInfo info;
958       ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
959       ierr = ViewerASCIIGetPointer(viewer,&fd);CHKERRQ(ierr);
960       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
961       ierr = PetscSequentialPhaseBegin(mat->comm,1);CHKERRQ(ierr);
962       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
963               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
964               baij->bs,(int)info.memory);
965       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
966       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
967       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
968       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
969       fflush(fd);
970       ierr = PetscSequentialPhaseEnd(mat->comm,1);CHKERRQ(ierr);
971       ierr = VecScatterView(baij->Mvctx,viewer);CHKERRQ(ierr);
972       PetscFunctionReturn(0);
973     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
974       ierr = PetscPrintf(mat->comm,"  block size is %d\n",bs);CHKERRQ(ierr);
975       PetscFunctionReturn(0);
976     }
977   }
978 
979   if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
980     Draw       draw;
981     PetscTruth isnull;
982     ierr = ViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
983     ierr = DrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
984   }
985 
986   if (size == 1) {
987     ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
988   } else {
989     /* assemble the entire matrix onto first processor. */
990     Mat         A;
991     Mat_SeqBAIJ *Aloc;
992     int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
993     int         mbs = baij->mbs;
994     Scalar      *a;
995 
996     if (!rank) {
997       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
998     } else {
999       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1000     }
1001     PLogObjectParent(mat,A);
1002 
1003     /* copy over the A part */
1004     Aloc = (Mat_SeqBAIJ*) baij->A->data;
1005     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1006     rvals = (int *) PetscMalloc(bs*sizeof(int));CHKPTRQ(rvals);
1007 
1008     for ( i=0; i<mbs; i++ ) {
1009       rvals[0] = bs*(baij->rstart + i);
1010       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1011       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1012         col = (baij->cstart+aj[j])*bs;
1013         for (k=0; k<bs; k++ ) {
1014           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1015           col++; a += bs;
1016         }
1017       }
1018     }
1019     /* copy over the B part */
1020     Aloc = (Mat_SeqBAIJ*) baij->B->data;
1021     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1022     for ( i=0; i<mbs; i++ ) {
1023       rvals[0] = bs*(baij->rstart + i);
1024       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1025       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1026         col = baij->garray[aj[j]]*bs;
1027         for (k=0; k<bs; k++ ) {
1028           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1029           col++; a += bs;
1030         }
1031       }
1032     }
1033     ierr = PetscFree(rvals);CHKERRQ(ierr);
1034     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1035     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1036     /*
1037        Everyone has to call to draw the matrix since the graphics waits are
1038        synchronized across all processors that share the Draw object
1039     */
1040     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
1041       ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer);CHKERRQ(ierr);
1042     }
1043     ierr = MatDestroy(A);CHKERRQ(ierr);
1044   }
1045   PetscFunctionReturn(0);
1046 }
1047 
1048 
1049 
1050 #undef __FUNC__
1051 #define __FUNC__ "MatView_MPIBAIJ"
1052 int MatView_MPIBAIJ(Mat mat,Viewer viewer)
1053 {
1054   int         ierr;
1055   ViewerType  vtype;
1056 
1057   PetscFunctionBegin;
1058   ierr = ViewerGetType(viewer,&vtype);CHKERRQ(ierr);
1059   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
1060       PetscTypeCompare(vtype,SOCKET_VIEWER)|| PetscTypeCompare(vtype,BINARY_VIEWER)) {
1061     ierr = MatView_MPIBAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
1062     /*
1063   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
1064     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1065     */
1066   } else {
1067     SETERRQ(1,1,"Viewer type not supported by PETSc object");
1068   }
1069   PetscFunctionReturn(0);
1070 }
1071 
1072 #undef __FUNC__
1073 #define __FUNC__ "MatDestroy_MPIBAIJ"
1074 int MatDestroy_MPIBAIJ(Mat mat)
1075 {
1076   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1077   int         ierr;
1078 
1079   PetscFunctionBegin;
1080 
1081   if (mat->mapping) {
1082     ierr = ISLocalToGlobalMappingDestroy(mat->mapping);CHKERRQ(ierr);
1083   }
1084   if (mat->bmapping) {
1085     ierr = ISLocalToGlobalMappingDestroy(mat->bmapping);CHKERRQ(ierr);
1086   }
1087   if (mat->rmap) {
1088     ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
1089   }
1090   if (mat->cmap) {
1091     ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
1092   }
1093 #if defined(PETSC_USE_LOG)
1094   PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",baij->M,baij->N);
1095 #endif
1096 
1097   ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
1098   ierr = MatStashDestroy_Private(&mat->bstash);CHKERRQ(ierr);
1099 
1100   ierr = PetscFree(baij->rowners);CHKERRQ(ierr);
1101   ierr = MatDestroy(baij->A);CHKERRQ(ierr);
1102   ierr = MatDestroy(baij->B);CHKERRQ(ierr);
1103 #if defined (PETSC_USE_CTABLE)
1104   if (baij->colmap) {ierr = TableDelete(baij->colmap);CHKERRQ(ierr);}
1105 #else
1106   if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
1107 #endif
1108   if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
1109   if (baij->lvec)   {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
1110   if (baij->Mvctx)  {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
1111   if (baij->rowvalues) {ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);}
1112   if (baij->barray) {ierr = PetscFree(baij->barray);CHKERRQ(ierr);}
1113   if (baij->hd) {ierr = PetscFree(baij->hd);CHKERRQ(ierr);}
1114   ierr = PetscFree(baij);CHKERRQ(ierr);
1115   PLogObjectDestroy(mat);
1116   PetscHeaderDestroy(mat);
1117   PetscFunctionReturn(0);
1118 }
1119 
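/*
   MatMult and its variants overlap communication with computation: the
   scatter of the needed off-processor entries of xx into a->lvec is started,
   the local diagonal block a->A is applied, the scatter is completed, and
   then the off-diagonal block a->B is applied to the ghost values and added
   into the result.
*/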
1120 #undef __FUNC__
1121 #define __FUNC__ "MatMult_MPIBAIJ"
1122 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1123 {
1124   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1125   int         ierr, nt;
1126 
1127   PetscFunctionBegin;
1128   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1129   if (nt != a->n) {
1130     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1131   }
1132   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
1133   if (nt != a->m) {
1134     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and yy");
1135   }
1136   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1137   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
1138   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1139   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
1140   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNC__
1145 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1146 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1147 {
1148   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1149   int        ierr;
1150 
1151   PetscFunctionBegin;
1152   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1153   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
1154   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1155   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
1156   PetscFunctionReturn(0);
1157 }
1158 
1159 #undef __FUNC__
1160 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1161 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1162 {
1163   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1164   int         ierr;
1165 
1166   PetscFunctionBegin;
1167   /* do nondiagonal part */
1168   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec);CHKERRQ(ierr);
1169   /* send it on its way */
1170   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1171   /* do local part */
1172   ierr = (*a->A->ops->multtrans)(a->A,xx,yy);CHKERRQ(ierr);
1173   /* receive remote parts: note this assumes the values are not actually */
1174   /* inserted in yy until the next line, which is true for my implementation */
1175   /* but may not always be true. */
1176   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNC__
1181 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1182 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1183 {
1184   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1185   int         ierr;
1186 
1187   PetscFunctionBegin;
1188   /* do nondiagonal part */
1189   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec);CHKERRQ(ierr);
1190   /* send it on its way */
1191   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1192   /* do local part */
1193   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
1194   /* receive remote parts: note this assumes the values are not actually */
1195   /* inserted in zz until the next line, which is true for my implementation */
1196   /* but may not always be true. */
1197   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1198   PetscFunctionReturn(0);
1199 }
1200 
1201 /*
1202   This only works correctly for square matrices where the local subblock a->A
1203    is the diagonal block
1204 */
1205 #undef __FUNC__
1206 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1207 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1208 {
1209   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1210   int         ierr;
1211 
1212   PetscFunctionBegin;
1213   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1214   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1215   PetscFunctionReturn(0);
1216 }
1217 
1218 #undef __FUNC__
1219 #define __FUNC__ "MatScale_MPIBAIJ"
1220 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1221 {
1222   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1223   int         ierr;
1224 
1225   PetscFunctionBegin;
1226   ierr = MatScale(aa,a->A);CHKERRQ(ierr);
1227   ierr = MatScale(aa,a->B);CHKERRQ(ierr);
1228   PetscFunctionReturn(0);
1229 }
1230 
1231 #undef __FUNC__
1232 #define __FUNC__ "MatGetSize_MPIBAIJ"
1233 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1234 {
1235   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1236 
1237   PetscFunctionBegin;
1238   if (m) *m = mat->M;
1239   if (n) *n = mat->N;
1240   PetscFunctionReturn(0);
1241 }
1242 
1243 #undef __FUNC__
1244 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1245 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1246 {
1247   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1248 
1249   PetscFunctionBegin;
1250   *m = mat->m; *n = mat->n;
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 #undef __FUNC__
1255 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1256 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1257 {
1258   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1259 
1260   PetscFunctionBegin;
1261   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1262   PetscFunctionReturn(0);
1263 }
1264 
1265 #undef __FUNC__
1266 #define __FUNC__ "MatGetRow_MPIBAIJ"
1267 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1268 {
1269   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1270   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1271   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1272   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1273   int        *cmap, *idx_p,cstart = mat->cstart;
1274 
1275   PetscFunctionBegin;
1276   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1277   mat->getrowactive = PETSC_TRUE;
1278 
1279   if (!mat->rowvalues && (idx || v)) {
1280     /*
1281         allocate enough space to hold information from the longest row.
1282     */
1283     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1284     int     max = 1,mbs = mat->mbs,tmp;
1285     for ( i=0; i<mbs; i++ ) {
1286       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1287       if (max < tmp) { max = tmp; }
1288     }
1289     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));CHKPTRQ(mat->rowvalues);
1290     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1291   }
1292 
1293   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows");
1294   lrow = row - brstart;
1295 
1296   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1297   if (!v)   {pvA = 0; pvB = 0;}
1298   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1299   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1300   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1301   nztot = nzA + nzB;
1302 
1303   cmap  = mat->garray;
1304   if (v  || idx) {
1305     if (nztot) {
1306       /* Sort by increasing column numbers, assuming A and B already sorted */
1307       int imark = -1;
1308       if (v) {
1309         *v = v_p = mat->rowvalues;
1310         for ( i=0; i<nzB; i++ ) {
1311           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1312           else break;
1313         }
1314         imark = i;
1315         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1316         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1317       }
1318       if (idx) {
1319         *idx = idx_p = mat->rowindices;
1320         if (imark > -1) {
1321           for ( i=0; i<imark; i++ ) {
1322             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1323           }
1324         } else {
1325           for ( i=0; i<nzB; i++ ) {
1326             if (cmap[cworkB[i]/bs] < cstart)
1327               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1328             else break;
1329           }
1330           imark = i;
1331         }
1332         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1333         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1334       }
1335     } else {
1336       if (idx) *idx = 0;
1337       if (v)   *v   = 0;
1338     }
1339   }
1340   *nz = nztot;
1341   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1342   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #undef __FUNC__
1347 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1348 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1349 {
1350   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1351 
1352   PetscFunctionBegin;
1353   if (baij->getrowactive == PETSC_FALSE) {
1354     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1355   }
1356   baij->getrowactive = PETSC_FALSE;
1357   PetscFunctionReturn(0);
1358 }
1359 
1360 #undef __FUNC__
1361 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1362 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1363 {
1364   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1365 
1366   PetscFunctionBegin;
1367   *bs = baij->bs;
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 #undef __FUNC__
1372 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1373 int MatZeroEntries_MPIBAIJ(Mat A)
1374 {
1375   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1376   int         ierr;
1377 
1378   PetscFunctionBegin;
1379   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
1380   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 #undef __FUNC__
1385 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1386 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1387 {
1388   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1389   Mat         A = a->A, B = a->B;
1390   int         ierr;
1391   double      isend[5], irecv[5];
1392 
1393   PetscFunctionBegin;
1394   info->block_size     = (double)a->bs;
1395   ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
1396   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1397   isend[3] = info->memory;  isend[4] = info->mallocs;
1398   ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
1399   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1400   isend[3] += info->memory;  isend[4] += info->mallocs;
1401   if (flag == MAT_LOCAL) {
1402     info->nz_used      = isend[0];
1403     info->nz_allocated = isend[1];
1404     info->nz_unneeded  = isend[2];
1405     info->memory       = isend[3];
1406     info->mallocs      = isend[4];
1407   } else if (flag == MAT_GLOBAL_MAX) {
1408     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1409     info->nz_used      = irecv[0];
1410     info->nz_allocated = irecv[1];
1411     info->nz_unneeded  = irecv[2];
1412     info->memory       = irecv[3];
1413     info->mallocs      = irecv[4];
1414   } else if (flag == MAT_GLOBAL_SUM) {
1415     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1416     info->nz_used      = irecv[0];
1417     info->nz_allocated = irecv[1];
1418     info->nz_unneeded  = irecv[2];
1419     info->memory       = irecv[3];
1420     info->mallocs      = irecv[4];
1421   } else {
1422     SETERRQ1(1,1,"Unknown MatInfoType argument %d",flag);
1423   }
1424   info->rows_global       = (double)a->M;
1425   info->columns_global    = (double)a->N;
1426   info->rows_local        = (double)a->m;
1427   info->columns_local     = (double)a->N;
1428   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1429   info->fill_ratio_needed = 0;
1430   info->factor_mallocs    = 0;
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNC__
1435 #define __FUNC__ "MatSetOption_MPIBAIJ"
1436 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1437 {
1438   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1439   int         ierr;
1440 
1441   PetscFunctionBegin;
1442   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1443       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1444       op == MAT_COLUMNS_UNSORTED ||
1445       op == MAT_COLUMNS_SORTED ||
1446       op == MAT_NEW_NONZERO_ALLOCATION_ERR ||
1447       op == MAT_NEW_NONZERO_LOCATION_ERR) {
1448         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1449         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1450   } else if (op == MAT_ROW_ORIENTED) {
1451         a->roworiented = 1;
1452         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1453         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1454   } else if (op == MAT_ROWS_SORTED ||
1455              op == MAT_ROWS_UNSORTED ||
1456              op == MAT_SYMMETRIC ||
1457              op == MAT_STRUCTURALLY_SYMMETRIC ||
1458              op == MAT_YES_NEW_DIAGONALS ||
1459              op == MAT_USE_HASH_TABLE) {
1460     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1461   } else if (op == MAT_COLUMN_ORIENTED) {
1462     a->roworiented = 0;
1463     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1464     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1465   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1466     a->donotstash = 1;
1467   } else if (op == MAT_NO_NEW_DIAGONALS) {
1468     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1469   } else if (op == MAT_USE_HASH_TABLE) {
1470     a->ht_flag = 1;
1471   } else {
1472     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1473   }
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 #undef __FUNC__
1478 #define __FUNC__ "MatTranspose_MPIBAIJ"
1479 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1480 {
1481   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1482   Mat_SeqBAIJ *Aloc;
1483   Mat        B;
1484   int        ierr,M=baij->M,N=baij->N,*ai,*aj,i,*rvals,j,k,col;
1485   int        bs=baij->bs,mbs=baij->mbs;
1486   Scalar     *a;
1487 
1488   PetscFunctionBegin;
1489   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1490   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,baij->n,baij->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1492 
1493   /* copy over the A part */
1494   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1495   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1496   rvals = (int *) PetscMalloc(bs*sizeof(int));CHKPTRQ(rvals);
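  /* For each local block row i, rvals[] lists its bs global (point) row indices;
     each stored block is then written into B one block column at a time with the
     row and column roles exchanged, which produces the transpose */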
1497 
1498   for ( i=0; i<mbs; i++ ) {
1499     rvals[0] = bs*(baij->rstart + i);
1500     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1501     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1502       col = (baij->cstart+aj[j])*bs;
1503       for (k=0; k<bs; k++ ) {
1504         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1505         col++; a += bs;
1506       }
1507     }
1508   }
1509   /* copy over the B part */
1510   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1511   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1512   for ( i=0; i<mbs; i++ ) {
1513     rvals[0] = bs*(baij->rstart + i);
1514     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1515     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1516       col = baij->garray[aj[j]]*bs;
1517       for (k=0; k<bs; k++ ) {
1518         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1519         col++; a += bs;
1520       }
1521     }
1522   }
1523   ierr = PetscFree(rvals);CHKERRQ(ierr);
1524   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1525   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1526 
1527   if (matout != PETSC_NULL) {
1528     *matout = B;
1529   } else {
1530     PetscOps *Abops;
1531     MatOps   Aops;
1532 
1533     /* This isn't really an in-place transpose .... but free data structures from baij */
1534     ierr = PetscFree(baij->rowners); CHKERRQ(ierr);
1535     ierr = MatDestroy(baij->A);CHKERRQ(ierr);
1536     ierr = MatDestroy(baij->B);CHKERRQ(ierr);
1537 #if defined (PETSC_USE_CTABLE)
1538     if (baij->colmap) {ierr = TableDelete(baij->colmap);CHKERRQ(ierr);}
1539 #else
1540     if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
1541 #endif
1542     if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
1543     if (baij->lvec) {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
1544     if (baij->Mvctx) {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
1545     ierr = PetscFree(baij);CHKERRQ(ierr);
1546 
1547     /*
1548        This is horrible, horrible code. We need to keep the
1549       A pointers for the bops and ops but copy everything
1550       else from B.
1551     */
1552     Abops   = A->bops;
1553     Aops    = A->ops;
1554     ierr    = PetscMemcpy(A,B,sizeof(struct _p_Mat));CHKERRQ(ierr);
1555     A->bops = Abops;
1556     A->ops  = Aops;
1557 
1558     PetscHeaderDestroy(B);
1559   }
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 #undef __FUNC__
1564 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1565 int MatDiagonalScale_MPIBAIJ(Mat mat,Vec ll,Vec rr)
1566 {
1567   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1568   Mat         a = baij->A, b = baij->B;
1569   int         ierr,s1,s2,s3;
1570 
1571   PetscFunctionBegin;
1572   ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
1573   if (rr) {
1574     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1575     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,0,"right vector non-conforming local size");
1576     /* Overlap communication with computation. */
1577     ierr = VecScatterBegin(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1578   }
1579   if (ll) {
1580     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1581     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"left vector non-conforming local size");
1582     ierr = (*b->ops->diagonalscale)(b,ll,0);CHKERRQ(ierr);
1583   }
1584   /* scale  the diagonal block */
1585   ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);
1586 
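  /* The off-diagonal block holds columns owned by other processors, so its
     right scaling must wait for the ghost values of rr that the scatter above
     gathers into baij->lvec */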
1587   if (rr) {
1588     /* Do a scatter end and then right scale the off-diagonal block */
1589     ierr = VecScatterEnd(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1590     ierr = (*b->ops->diagonalscale)(b,0,baij->lvec);CHKERRQ(ierr);
1591   }
1592 
1593   PetscFunctionReturn(0);
1594 }
1595 
1596 #undef __FUNC__
1597 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1598 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1599 {
1600   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1601   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1602   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1603   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1604   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1605   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1606   MPI_Comm       comm = A->comm;
1607   MPI_Request    *send_waits,*recv_waits;
1608   MPI_Status     recv_status,*send_status;
1609   IS             istmp;
1610 
1611   PetscFunctionBegin;
1612   ierr = ISGetSize(is,&N);CHKERRQ(ierr);
1613   ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);
1614 
1615   /*  first count number of contributors to each processor */
1616   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) );CHKPTRQ(nprocs);
1617   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
1618   procs  = nprocs + size;
1619   owner  = (int *) PetscMalloc((N+1)*sizeof(int));CHKPTRQ(owner); /* see note*/
1620   for ( i=0; i<N; i++ ) {
1621     idx = rows[i];
1622     found = 0;
1623     for ( j=0; j<size; j++ ) {
1624       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1625         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1626       }
1627     }
1628     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1629   }
1630   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1631 
1632   /* inform other processors of number of messages and max length*/
1633   work   = (int *) PetscMalloc( size*sizeof(int) );CHKPTRQ(work);
1634   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1635   nrecvs = work[rank];
1636   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1637   nmax   = work[rank];
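  /* After the two reductions: nrecvs is the number of processors that will send
     rows to this processor, and nmax is the length of the longest single message
     this processor will receive, used below to size the receive buffers */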
1638   ierr = PetscFree(work);CHKERRQ(ierr);
1639 
1640   /* post receives:   */
1641   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int));CHKPTRQ(rvalues);
1642   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1643   for ( i=0; i<nrecvs; i++ ) {
1644     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1645   }
1646 
1647   /* do sends:
1648      1) starts[i] gives the starting index in svalues for stuff going to
1649      the ith processor
1650   */
1651   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) );CHKPTRQ(svalues);
1652   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1653   starts = (int *) PetscMalloc( (size+1)*sizeof(int) );CHKPTRQ(starts);
1654   starts[0] = 0;
1655   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1656   for ( i=0; i<N; i++ ) {
1657     svalues[starts[owner[i]]++] = rows[i];
1658   }
1659   ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);
1660 
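  /* the packing loop above advanced starts[], so rebuild the prefix sums before
     issuing one send per destination processor */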
1661   starts[0] = 0;
1662   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1663   count = 0;
1664   for ( i=0; i<size; i++ ) {
1665     if (procs[i]) {
1666       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1667     }
1668   }
1669   ierr = PetscFree(starts);CHKERRQ(ierr);
1670 
1671   base = owners[rank]*bs;
1672 
1673   /*  wait on receives */
1674   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) );CHKPTRQ(lens);
1675   source = lens + nrecvs;
1676   count  = nrecvs; slen = 0;
1677   while (count) {
1678     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1679     /* unpack receives into our local space */
1680     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1681     source[imdex]  = recv_status.MPI_SOURCE;
1682     lens[imdex]  = n;
1683     slen += n;
1684     count--;
1685   }
1686   ierr = PetscFree(recv_waits); CHKERRQ(ierr);
1687 
1688   /* move the data into the send scatter */
1689   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) );CHKPTRQ(lrows);
1690   count = 0;
1691   for ( i=0; i<nrecvs; i++ ) {
1692     values = rvalues + i*nmax;
1693     for ( j=0; j<lens[i]; j++ ) {
1694       lrows[count++] = values[j] - base;
1695     }
1696   }
1697   ierr = PetscFree(rvalues);CHKERRQ(ierr);
1698   ierr = PetscFree(lens);CHKERRQ(ierr);
1699   ierr = PetscFree(owner);CHKERRQ(ierr);
1700   ierr = PetscFree(nprocs);CHKERRQ(ierr);
1701 
1702   /* actually zap the local rows */
1703   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1704   PLogObjectParent(A,istmp);
1705 
1706   /*
1707         Zero the required rows. If the "diagonal block" of the matrix
1708      is square and the user wishes to set the diagonal we use separate
1709      code so that MatSetValues() is not called for each diagonal entry,
1710      which would allocate new memory on each call, triggering many mallocs and slowing things down.
1711 
1712        Contributed by: Mathew Knepley
1713   */
1714   /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
1715   ierr = MatZeroRows(l->B,istmp,0);CHKERRQ(ierr);
1716   if (diag && (l->A->M == l->A->N)) {
1717     ierr      = MatZeroRows(l->A,istmp,diag);CHKERRQ(ierr);
1718   } else if (diag) {
1719     ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
1720     if (((Mat_SeqBAIJ*)l->A->data)->nonew) {
1721       SETERRQ(PETSC_ERR_SUP,0,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
1722 MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
1723     }
1724     for ( i = 0; i < slen; i++ ) {
1725       row  = lrows[i] + rstart_bs;
1726       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
1727     }
1728     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1729     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1730   } else {
1731     ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
1732   }
1733 
1734   ierr = ISDestroy(istmp);CHKERRQ(ierr);
1735   ierr = PetscFree(lrows);CHKERRQ(ierr);
1736 
1737   /* wait on sends */
1738   if (nsends) {
1739     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1740     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1741     ierr = PetscFree(send_status);CHKERRQ(ierr);
1742   }
1743   ierr = PetscFree(send_waits);CHKERRQ(ierr);
1744   ierr = PetscFree(svalues);CHKERRQ(ierr);
1745 
1746   PetscFunctionReturn(0);
1747 }
1748 
1749 #undef __FUNC__
1750 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1751 int MatPrintHelp_MPIBAIJ(Mat A)
1752 {
1753   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1754   MPI_Comm    comm = A->comm;
1755   static int  called = 0;
1756   int         ierr;
1757 
1758   PetscFunctionBegin;
1759   if (!a->rank) {
1760     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1761   }
1762   if (called) {PetscFunctionReturn(0);} else called = 1;
1763   ierr = (*PetscHelpPrintf)(comm," Options for MATMPIBAIJ matrix format (the defaults):\n");CHKERRQ(ierr);
1764   ierr = (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");CHKERRQ(ierr);
1765   PetscFunctionReturn(0);
1766 }
1767 
1768 #undef __FUNC__
1769 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1770 int MatSetUnfactored_MPIBAIJ(Mat A)
1771 {
1772   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1773   int         ierr;
1774 
1775   PetscFunctionBegin;
1776   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 static int MatDuplicate_MPIBAIJ(Mat,MatDuplicateOption,Mat *);
1781 
1782 #undef __FUNC__
1783 #define __FUNC__ "MatEqual_MPIBAIJ"
1784 int MatEqual_MPIBAIJ(Mat A, Mat B, PetscTruth *flag)
1785 {
1786   Mat_MPIBAIJ *matB = (Mat_MPIBAIJ *) B->data,*matA = (Mat_MPIBAIJ *) A->data;
1787   Mat         a, b, c, d;
1788   PetscTruth  flg;
1789   int         ierr;
1790 
1791   PetscFunctionBegin;
1792   if (B->type != MATMPIBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1793   a = matA->A; b = matA->B;
1794   c = matB->A; d = matB->B;
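  /* two MPIBAIJ matrices are equal only if both their diagonal and off-diagonal
     blocks match on every processor, hence the logical AND reduction below */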
1795 
1796   ierr = MatEqual(a, c, &flg);CHKERRQ(ierr);
1797   if (flg == PETSC_TRUE) {
1798     ierr = MatEqual(b, d, &flg);CHKERRQ(ierr);
1799   }
1800   ierr = MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);CHKERRQ(ierr);
1801   PetscFunctionReturn(0);
1802 }
1803 
1804 /* -------------------------------------------------------------------*/
1805 static struct _MatOps MatOps_Values = {
1806   MatSetValues_MPIBAIJ,
1807   MatGetRow_MPIBAIJ,
1808   MatRestoreRow_MPIBAIJ,
1809   MatMult_MPIBAIJ,
1810   MatMultAdd_MPIBAIJ,
1811   MatMultTrans_MPIBAIJ,
1812   MatMultTransAdd_MPIBAIJ,
1813   0,
1814   0,
1815   0,
1816   0,
1817   0,
1818   0,
1819   0,
1820   MatTranspose_MPIBAIJ,
1821   MatGetInfo_MPIBAIJ,
1822   MatEqual_MPIBAIJ,
1823   MatGetDiagonal_MPIBAIJ,
1824   MatDiagonalScale_MPIBAIJ,
1825   MatNorm_MPIBAIJ,
1826   MatAssemblyBegin_MPIBAIJ,
1827   MatAssemblyEnd_MPIBAIJ,
1828   0,
1829   MatSetOption_MPIBAIJ,
1830   MatZeroEntries_MPIBAIJ,
1831   MatZeroRows_MPIBAIJ,
1832   0,
1833   0,
1834   0,
1835   0,
1836   MatGetSize_MPIBAIJ,
1837   MatGetLocalSize_MPIBAIJ,
1838   MatGetOwnershipRange_MPIBAIJ,
1839   0,
1840   0,
1841   0,
1842   0,
1843   MatDuplicate_MPIBAIJ,
1844   0,
1845   0,
1846   0,
1847   0,
1848   0,
1849   MatGetSubMatrices_MPIBAIJ,
1850   MatIncreaseOverlap_MPIBAIJ,
1851   MatGetValues_MPIBAIJ,
1852   0,
1853   MatPrintHelp_MPIBAIJ,
1854   MatScale_MPIBAIJ,
1855   0,
1856   0,
1857   0,
1858   MatGetBlockSize_MPIBAIJ,
1859   0,
1860   0,
1861   0,
1862   0,
1863   0,
1864   0,
1865   MatSetUnfactored_MPIBAIJ,
1866   0,
1867   MatSetValuesBlocked_MPIBAIJ,
1868   0,
1869   0,
1870   0,
1871   MatGetMaps_Petsc};
1872 
1873 
1874 EXTERN_C_BEGIN
1875 #undef __FUNC__
1876 #define __FUNC__ "MatGetDiagonalBlock_MPIBAIJ"
1877 int MatGetDiagonalBlock_MPIBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1878 {
1879   PetscFunctionBegin;
1880   *a      = ((Mat_MPIBAIJ *)A->data)->A;
1881   *iscopy = PETSC_FALSE;
1882   PetscFunctionReturn(0);
1883 }
1884 EXTERN_C_END
1885 
1886 #undef __FUNC__
1887 #define __FUNC__ "MatCreateMPIBAIJ"
1888 /*@C
1889    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1890    (block compressed row).  For good matrix assembly performance
1891    the user should preallocate the matrix storage by setting the parameters
1892    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1893    performance can be increased by more than a factor of 50.
1894 
1895    Collective on MPI_Comm
1896 
1897    Input Parameters:
1898 +  comm - MPI communicator
1899 .  bs   - size of block
1900 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1901            This value should be the same as the local size used in creating the
1902            y vector for the matrix-vector product y = Ax.
1903 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1904            This value should be the same as the local size used in creating the
1905            x vector for the matrix-vector product y = Ax.
1906 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1907 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1908 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1909            submatrix  (same for all local rows)
1910 .  d_nnz - array containing the number of block nonzeros in the various block rows
1911            in the diagonal portion of the local submatrix (possibly different for each block
1912            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1913 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1914            submatrix (same for all local rows).
1915 -  o_nnz - array containing the number of nonzeros in the various block rows of the
1916            off-diagonal portion of the local submatrix (possibly different for
1917            each block row) or PETSC_NULL.
1918 
1919    Output Parameter:
1920 .  A - the matrix
1921 
1922    Options Database Keys:
1923 .   -mat_no_unroll - uses code that does not unroll the loops in the
1924                      block calculations (much slower)
1925 .   -mat_block_size - size of the blocks to use
1926 .   -mat_mpi - use the parallel matrix data structures even on one processor
1927                (defaults to using SeqBAIJ format on one processor)
1928 
1929    Notes:
1930    The user MUST specify either the local or global matrix dimensions
1931    (possibly both).
1932 
1933    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1934    then it must be used on all processors that share the object for that argument.
1935 
1936    Storage Information:
1937    For a square global matrix we define each processor's diagonal portion
1938    to be its local rows and the corresponding columns (a square submatrix);
1939    each processor's off-diagonal portion encompasses the remainder of the
1940    local matrix (a rectangular submatrix).
1941 
1942    The user can specify preallocated storage for the diagonal part of
1943    the local submatrix with either d_nz or d_nnz (not both).  Set
1944    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1945    memory allocation.  Likewise, specify preallocated storage for the
1946    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1947 
1948    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1949    the figure below we depict these three local rows and all columns (0-11).
1950 
1951 .vb
1952            0 1 2 3 4 5 6 7 8 9 10 11
1953           -------------------
1954    row 3  |  o o o d d d o o o o o o
1955    row 4  |  o o o d d d o o o o o o
1956    row 5  |  o o o d d d o o o o o o
1957           -------------------
1958 .ve
1959 
1960    Thus, any entries in the d locations are stored in the d (diagonal)
1961    submatrix, and any entries in the o locations are stored in the
1962    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1963    stored simply in the MATSEQBAIJ format for compressed row storage.
1964 
1965    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1966    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1967    In general, for PDE problems in which most nonzeros are near the diagonal,
1968    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1969    or you will get TERRIBLE performance; see the users' manual chapter on
1970    matrices.
1971 
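   A minimal usage sketch (illustrative only; the local size and the preallocation
   counts below are invented for the example):

.vb
      Mat A;
      int ierr, bs = 2, m = 6;      /* three local block rows of size 2 */
      ierr = MatCreateMPIBAIJ(PETSC_COMM_WORLD,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,
                              3,PETSC_NULL,1,PETSC_NULL,&A);CHKERRQ(ierr);
      /* ... insert entries with MatSetValuesBlocked() or MatSetValues() ... */
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve
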
1972    Level: intermediate
1973 
1974 .keywords: matrix, block, aij, compressed row, sparse, parallel
1975 
1976 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatCreateMPIAIJ()
1977 @*/
1978 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1979                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1980 {
1981   Mat          B;
1982   Mat_MPIBAIJ  *b;
1983   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size,flg;
1984   int          flag1 = 0,flag2 = 0;
1985 
1986   PetscFunctionBegin;
1987   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1988 
1989   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
1990   if (d_nz < -2) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,0,"d_nz cannot be less than -2: value %d",d_nz);
1991   if (o_nz < -2) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,0,"o_nz cannot be less than -2: value %d",o_nz);
1992   if (d_nnz) {
1993     for (i=0; i<m/bs; i++) {
1994       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
1995     }
1996   }
1997   if (o_nnz) {
1998     for (i=0; i<m/bs; i++) {
1999       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
2000     }
2001   }
2002 
2003   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2004   ierr = OptionsHasName(PETSC_NULL,"-mat_mpibaij",&flag1);CHKERRQ(ierr);
2005   ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2);CHKERRQ(ierr);
2006   if (!flag1 && !flag2 && size == 1) {
2007     if (M == PETSC_DECIDE) M = m;
2008     if (N == PETSC_DECIDE) N = n;
2009     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A);CHKERRQ(ierr);
2010     PetscFunctionReturn(0);
2011   }
2012 
2013   *A = 0;
2014   PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",comm,MatDestroy,MatView);
2015   PLogObjectCreate(B);
2016   B->data = (void *) (b = PetscNew(Mat_MPIBAIJ));CHKPTRQ(b);
2017   ierr    = PetscMemzero(b,sizeof(Mat_MPIBAIJ));CHKERRQ(ierr);
2018   ierr    = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
2019 
2020   B->ops->destroy    = MatDestroy_MPIBAIJ;
2021   B->ops->view       = MatView_MPIBAIJ;
2022   B->mapping    = 0;
2023   B->factor     = 0;
2024   B->assembled  = PETSC_FALSE;
2025 
2026   B->insertmode = NOT_SET_VALUES;
2027   ierr = MPI_Comm_rank(comm,&b->rank);CHKERRQ(ierr);
2028   ierr = MPI_Comm_size(comm,&b->size);CHKERRQ(ierr);
2029 
2030   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
2031     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
2032   }
2033   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
2034     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
2035   }
2036   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
2037     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
2038   }
2039   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
2040   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
2041 
2042   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
2043     work[0] = m; work[1] = n;
2044     mbs = m/bs; nbs = n/bs;
2045     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
2046     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
2047     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
2048   }
2049   if (m == PETSC_DECIDE) {
2050     Mbs = M/bs;
2051     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
2052     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
2053     m   = mbs*bs;
2054   }
2055   if (n == PETSC_DECIDE) {
2056     Nbs = N/bs;
2057     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
2058     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
2059     n   = nbs*bs;
2060   }
2061   if (mbs*bs != m || nbs*bs != n) {
2062     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
2063   }
2064 
2065   b->m = m; B->m = m;
2066   b->n = n; B->n = n;
2067   b->N = N; B->N = N;
2068   b->M = M; B->M = M;
2069   b->bs  = bs;
2070   b->bs2 = bs*bs;
2071   b->mbs = mbs;
2072   b->nbs = nbs;
2073   b->Mbs = Mbs;
2074   b->Nbs = Nbs;
2075 
2076   /* the information in the maps duplicates the information computed below; eventually
2077      we should keep only the maps and remove the duplicated fields */
2078   ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
2079   ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);
2080 
2081   /* build local table of row and column ownerships */
2082   b->rowners = (int *) PetscMalloc(3*(b->size+2)*sizeof(int));CHKPTRQ(b->rowners);
2083   PLogObjectMemory(B,3*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2084   b->cowners    = b->rowners + b->size + 2;
2085   b->rowners_bs = b->cowners + b->size + 2;
2086   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2087   b->rowners[0]    = 0;
2088   for ( i=2; i<=b->size; i++ ) {
2089     b->rowners[i] += b->rowners[i-1];
2090   }
2091   for ( i=0; i<=b->size; i++ ) {
2092     b->rowners_bs[i] = b->rowners[i]*bs;
2093   }
2094   b->rstart    = b->rowners[b->rank];
2095   b->rend      = b->rowners[b->rank+1];
2096   b->rstart_bs = b->rstart * bs;
2097   b->rend_bs   = b->rend * bs;
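  /* rowners[] now holds prefix sums of the block-row counts: processor p owns
     block rows rowners[p] through rowners[p+1]-1, and rowners_bs[] expresses
     the same partition in point rows */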
2098 
2099   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2100   b->cowners[0] = 0;
2101   for ( i=2; i<=b->size; i++ ) {
2102     b->cowners[i] += b->cowners[i-1];
2103   }
2104   b->cstart    = b->cowners[b->rank];
2105   b->cend      = b->cowners[b->rank+1];
2106   b->cstart_bs = b->cstart * bs;
2107   b->cend_bs   = b->cend * bs;
2108 
2109 
2110   if (d_nz == PETSC_DEFAULT) d_nz = 5;
2111   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
2112   PLogObjectParent(B,b->A);
2113   if (o_nz == PETSC_DEFAULT) o_nz = 0;
2114   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
2115   PLogObjectParent(B,b->B);
2116 
2117   /* build cache for off array entries formed */
2118   ierr = MatStashCreate_Private(comm,1,&B->stash);CHKERRQ(ierr);
2119   ierr = MatStashCreate_Private(comm,bs,&B->bstash);CHKERRQ(ierr);
2120   b->donotstash  = 0;
2121   b->colmap      = 0;
2122   b->garray      = 0;
2123   b->roworiented = 1;
2124 
2125   /* stuff used in block assembly */
2126   b->barray       = 0;
2127 
2128   /* stuff used for matrix vector multiply */
2129   b->lvec         = 0;
2130   b->Mvctx        = 0;
2131 
2132   /* stuff for MatGetRow() */
2133   b->rowindices   = 0;
2134   b->rowvalues    = 0;
2135   b->getrowactive = PETSC_FALSE;
2136 
2137   /* hash table stuff */
2138   b->ht           = 0;
2139   b->hd           = 0;
2140   b->ht_size      = 0;
2141   b->ht_flag      = 0;
2142   b->ht_fact      = 0;
2143   b->ht_total_ct  = 0;
2144   b->ht_insert_ct = 0;
2145 
2146   *A = B;
2147   ierr = OptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg);CHKERRQ(ierr);
2148   if (flg) {
2149     double fact = 1.39;
2150     ierr = MatSetOption(B,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
2151     ierr = OptionsGetDouble(PETSC_NULL,"-mat_use_hash_table",&fact,&flg);CHKERRQ(ierr);
2152     if (fact <= 1.0) fact = 1.39;
2153     ierr = MatMPIBAIJSetHashTableFactor(B,fact);CHKERRQ(ierr);
2154     PLogInfo(0,"MatCreateMPIBAIJ:Hash table Factor used %5.2f\n",fact);
2155   }
2156   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",
2157                                      "MatStoreValues_MPIBAIJ",
2158                                      (void*)MatStoreValues_MPIBAIJ);CHKERRQ(ierr);
2159   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",
2160                                      "MatRetrieveValues_MPIBAIJ",
2161                                      (void*)MatRetrieveValues_MPIBAIJ);CHKERRQ(ierr);
2162   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetDiagonalBlock_C",
2163                                      "MatGetDiagonalBlock_MPIBAIJ",
2164                                      (void*)MatGetDiagonalBlock_MPIBAIJ);CHKERRQ(ierr);
2165   PetscFunctionReturn(0);
2166 }
2167 
2168 #undef __FUNC__
2169 #define __FUNC__ "MatDuplicate_MPIBAIJ"
2170 static int MatDuplicate_MPIBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
2171 {
2172   Mat         mat;
2173   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
2174   int         ierr, len=0, flg;
2175 
2176   PetscFunctionBegin;
2177   *newmat       = 0;
2178   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",matin->comm,MatDestroy,MatView);
2179   PLogObjectCreate(mat);
2180   mat->data         = (void *) (a = PetscNew(Mat_MPIBAIJ));CHKPTRQ(a);
2181   ierr              = PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
2182   mat->ops->destroy = MatDestroy_MPIBAIJ;
2183   mat->ops->view    = MatView_MPIBAIJ;
2184   mat->factor       = matin->factor;
2185   mat->assembled    = PETSC_TRUE;
2186   mat->insertmode   = NOT_SET_VALUES;
2187 
2188   a->m = mat->m   = oldmat->m;
2189   a->n = mat->n   = oldmat->n;
2190   a->M = mat->M   = oldmat->M;
2191   a->N = mat->N   = oldmat->N;
2192 
2193   a->bs  = oldmat->bs;
2194   a->bs2 = oldmat->bs2;
2195   a->mbs = oldmat->mbs;
2196   a->nbs = oldmat->nbs;
2197   a->Mbs = oldmat->Mbs;
2198   a->Nbs = oldmat->Nbs;
2199 
2200   a->rstart       = oldmat->rstart;
2201   a->rend         = oldmat->rend;
2202   a->cstart       = oldmat->cstart;
2203   a->cend         = oldmat->cend;
2204   a->size         = oldmat->size;
2205   a->rank         = oldmat->rank;
2206   a->donotstash   = oldmat->donotstash;
2207   a->roworiented  = oldmat->roworiented;
2208   a->rowindices   = 0;
2209   a->rowvalues    = 0;
2210   a->getrowactive = PETSC_FALSE;
2211   a->barray       = 0;
2212   a->rstart_bs    = oldmat->rstart_bs;
2213   a->rend_bs      = oldmat->rend_bs;
2214   a->cstart_bs    = oldmat->cstart_bs;
2215   a->cend_bs      = oldmat->cend_bs;
2216 
2217   /* hash table stuff */
2218   a->ht           = 0;
2219   a->hd           = 0;
2220   a->ht_size      = 0;
2221   a->ht_flag      = oldmat->ht_flag;
2222   a->ht_fact      = oldmat->ht_fact;
2223   a->ht_total_ct  = 0;
2224   a->ht_insert_ct = 0;
2225 
2226 
2227   a->rowners = (int *) PetscMalloc(3*(a->size+2)*sizeof(int));CHKPTRQ(a->rowners);
2228   PLogObjectMemory(mat,3*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2229   a->cowners    = a->rowners + a->size + 2;
2230   a->rowners_bs = a->cowners + a->size + 2;
2231   ierr = PetscMemcpy(a->rowners,oldmat->rowners,3*(a->size+2)*sizeof(int));CHKERRQ(ierr);
2232   ierr = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
2233   ierr = MatStashCreate_Private(matin->comm,oldmat->bs,&mat->bstash);CHKERRQ(ierr);
2234   if (oldmat->colmap) {
2235 #if defined (PETSC_USE_CTABLE)
2236   ierr = TableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
2237 #else
2238     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2239     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2240     ierr      = PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));CHKERRQ(ierr);
2241 #endif
2242   } else a->colmap = 0;
2243   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2244     a->garray = (int *) PetscMalloc(len*sizeof(int));CHKPTRQ(a->garray);
2245     PLogObjectMemory(mat,len*sizeof(int));
2246     ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr);
2247   } else a->garray = 0;
2248 
2249   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
2250   PLogObjectParent(mat,a->lvec);
2251   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
2252 
2253   PLogObjectParent(mat,a->Mvctx);
2254   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
2255   PLogObjectParent(mat,a->A);
2256   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
2257   PLogObjectParent(mat,a->B);
2258   ierr = OptionsHasName(PETSC_NULL,"-help",&flg);CHKERRQ(ierr);
2259   ierr = FListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
2260   if (flg) {
2261     ierr = MatPrintHelp(mat);CHKERRQ(ierr);
2262   }
2263   *newmat = mat;
2264   PetscFunctionReturn(0);
2265 }
2266 
2267 #include "sys.h"
2268 
2269 #undef __FUNC__
2270 #define __FUNC__ "MatLoad_MPIBAIJ"
2271 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
2272 {
2273   Mat          A;
2274   int          i, nz, ierr, j,rstart, rend, fd;
2275   Scalar       *vals,*buf;
2276   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2277   MPI_Status   status;
2278   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2279   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
2280   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2281   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2282   int          dcount,kmax,k,nzcount,tmp;
2283 
2284   PetscFunctionBegin;
2285   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2286 
2287   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2288   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
2289   if (!rank) {
2290     ierr = ViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
2291     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
2292     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2293     if (header[3] < 0) {
2294       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2295     }
2296   }
2297 
2298   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2299   M = header[1]; N = header[2];
2300 
2301   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2302 
2303   /*
2304      This code adds extra rows to make sure the number of rows is
2305      divisible by the blocksize
2306   */
2307   Mbs        = M/bs;
2308   extra_rows = bs - M + bs*(Mbs);
2309   if (extra_rows == bs) extra_rows = 0;
2310   else                  Mbs++;
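  /* Illustrative example: with M = 10 and bs = 3, Mbs = 10/3 = 3 and
     extra_rows = 3 - 10 + 3*3 = 2, so Mbs is bumped to 4 and the matrix is
     padded to 12 rows; each padding row later receives a single 1.0 entry on
     its diagonal */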
2311   if (extra_rows &&!rank) {
2312     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2313   }
2314 
2315   /* determine ownership of all rows */
2316   mbs = Mbs/size + ((Mbs % size) > rank);
2317   m   = mbs * bs;
2318   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int));CHKPTRQ(rowners);
2319   browners = rowners + size + 1;
2320   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2321   rowners[0] = 0;
2322   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2323   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2324   rstart = rowners[rank];
2325   rend   = rowners[rank+1];
2326 
2327   /* distribute row lengths to all processors */
2328   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) );CHKPTRQ(locrowlens);
2329   if (!rank) {
2330     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) );CHKPTRQ(rowlengths);
2331     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
2332     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2333     sndcounts = (int*) PetscMalloc( size*sizeof(int) );CHKPTRQ(sndcounts);
2334     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2335     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2336     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
2337   } else {
2338     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2339   }
2340 
2341   if (!rank) {
2342     /* calculate the number of nonzeros on each processor */
2343     procsnz = (int*) PetscMalloc( size*sizeof(int) );CHKPTRQ(procsnz);
2344     ierr    = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
2345     for ( i=0; i<size; i++ ) {
2346       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2347         procsnz[i] += rowlengths[j];
2348       }
2349     }
2350     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
2351 
2352     /* determine max buffer needed and allocate it */
2353     maxnz = 0;
2354     for ( i=0; i<size; i++ ) {
2355       maxnz = PetscMax(maxnz,procsnz[i]);
2356     }
2357     cols = (int *) PetscMalloc( maxnz*sizeof(int) );CHKPTRQ(cols);
2358 
2359     /* read in my part of the matrix column indices  */
2360     nz = procsnz[0];
2361     ibuf = (int *) PetscMalloc( nz*sizeof(int) );CHKPTRQ(ibuf);
2362     mycols = ibuf;
2363     if (size == 1)  nz -= extra_rows;
2364     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
2365     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2366 
2367     /* read in the parts for all the other processors (except the last) and ship them off */
2368     for ( i=1; i<size-1; i++ ) {
2369       nz   = procsnz[i];
2370       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2371       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2372     }
2373     /* read in the stuff for the last proc */
2374     if ( size != 1 ) {
2375       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2376       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2377       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2378       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2379     }
2380     ierr = PetscFree(cols);CHKERRQ(ierr);
2381   } else {
2382     /* determine buffer space needed for message */
2383     nz = 0;
2384     for ( i=0; i<m; i++ ) {
2385       nz += locrowlens[i];
2386     }
2387     ibuf   = (int*) PetscMalloc( nz*sizeof(int) );CHKPTRQ(ibuf);
2388     mycols = ibuf;
2389     /* receive message of column indices*/
2390     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2391     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2392     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2393   }
2394 
2395   /* loop over local block rows, determining the number of diagonal and off-diagonal block entries */
2396   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) );CHKPTRQ(dlens);
2397   odlens = dlens + (rend-rstart);
2398   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) );CHKPTRQ(mask);
2399   ierr   = PetscMemzero(mask,3*Mbs*sizeof(int));CHKERRQ(ierr);
2400   masked1 = mask    + Mbs;
2401   masked2 = masked1 + Mbs;
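  /* mask[] flags the block columns already seen in the current block row;
     masked1[]/masked2[] record which flags were set (diagonal vs off-diagonal)
     so only those entries need to be cleared afterwards, keeping the cost
     proportional to the number of nonzero block columns rather than to Mbs */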
2402   rowcount = 0; nzcount = 0;
2403   for ( i=0; i<mbs; i++ ) {
2404     dcount  = 0;
2405     odcount = 0;
2406     for ( j=0; j<bs; j++ ) {
2407       kmax = locrowlens[rowcount];
2408       for ( k=0; k<kmax; k++ ) {
2409         tmp = mycols[nzcount++]/bs;
2410         if (!mask[tmp]) {
2411           mask[tmp] = 1;
2412           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2413           else masked1[dcount++] = tmp;
2414         }
2415       }
2416       rowcount++;
2417     }
2418 
2419     dlens[i]  = dcount;
2420     odlens[i] = odcount;
2421 
2422     /* zero out the mask elements we set */
2423     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2424     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2425   }
2426 
2427   /* create our matrix */
2428   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);CHKERRQ(ierr);
2429   A = *newmat;
2430   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
2431 
2432   if (!rank) {
2433     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) );CHKPTRQ(buf);
2434     /* read in my part of the matrix numerical values  */
2435     nz = procsnz[0];
2436     vals = buf;
2437     mycols = ibuf;
2438     if (size == 1)  nz -= extra_rows;
2439     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2440     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2441 
2442     /* insert into matrix */
2443     jj      = rstart*bs;
2444     for ( i=0; i<m; i++ ) {
2445       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2446       mycols += locrowlens[i];
2447       vals   += locrowlens[i];
2448       jj++;
2449     }
2450     /* read in other processors (except the last one) and ship out */
2451     for ( i=1; i<size-1; i++ ) {
2452       nz   = procsnz[i];
2453       vals = buf;
2454       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2455       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2456     }
2457     /* the last proc */
2458     if ( size != 1 ){
2459       nz   = procsnz[i] - extra_rows;
2460       vals = buf;
2461       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2462       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2463       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2464     }
2465     ierr = PetscFree(procsnz);CHKERRQ(ierr);
2466   } else {
2467     /* receive numeric values */
2468     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) );CHKPTRQ(buf);
2469 
2470     /* receive message of values*/
2471     vals   = buf;
2472     mycols = ibuf;
2473     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2474     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2475     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2476 
2477     /* insert into matrix */
2478     jj      = rstart*bs;
2479     for ( i=0; i<m; i++ ) {
2480       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2481       mycols += locrowlens[i];
2482       vals   += locrowlens[i];
2483       jj++;
2484     }
2485   }
2486   ierr = PetscFree(locrowlens);CHKERRQ(ierr);
2487   ierr = PetscFree(buf);CHKERRQ(ierr);
2488   ierr = PetscFree(ibuf);CHKERRQ(ierr);
2489   ierr = PetscFree(rowners);CHKERRQ(ierr);
2490   ierr = PetscFree(dlens);CHKERRQ(ierr);
2491   ierr = PetscFree(mask);CHKERRQ(ierr);
2492   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2493   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2494   PetscFunctionReturn(0);
2495 }
2496 
2497 #undef __FUNC__
2498 #define __FUNC__ "MatMPIBAIJSetHashTableFactor"
2499 /*@
2500    MatMPIBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2501 
2502    Input Parameters:
2503 .  mat  - the matrix
2504 .  fact - factor
2505 
2506    Collective on Mat
2507 
2508    Level: advanced
2509 
2510   Notes:
2511    This can also be set by the command line option: -mat_use_hash_table fact
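
   A small usage sketch (illustrative; the factor value 1.6 is arbitrary):
.vb
      ierr = MatSetOption(A,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
      ierr = MatMPIBAIJSetHashTableFactor(A,1.6);CHKERRQ(ierr);
.ve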
2512 
2513 .keywords: matrix, hashtable, factor, HT
2514 
2515 .seealso: MatSetOption()
2516 @*/
2517 int MatMPIBAIJSetHashTableFactor(Mat mat,double fact)
2518 {
2519   Mat_MPIBAIJ *baij;
2520 
2521   PetscFunctionBegin;
2522   PetscValidHeaderSpecific(mat,MAT_COOKIE);
2523   if (mat->type != MATMPIBAIJ) {
2524       SETERRQ(PETSC_ERR_ARG_WRONG,1,"Incorrect matrix type. Use MPIBAIJ only.");
2525   }
2526   baij = (Mat_MPIBAIJ*) mat->data;
2527   baij->ht_fact = fact;
2528   PetscFunctionReturn(0);
2529 }
2530