xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision d07ff4555fceecf3da02886fa26b1d3c851458fc)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.153 1999/02/18 17:08:12 bsmith Exp bsmith $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"   /*I  "mat.h"  I*/
6 #include "src/vec/vecimpl.h"
7 
8 extern int MatSetUpMultiply_MPIBAIJ(Mat);
9 extern int DisAssemble_MPIBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
12 extern int MatGetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *);
13 
14 /*
15      Local utility routine that creates a mapping from the global column
16    number to the local number in the off-diagonal part of the local
17    storage of the matrix.  This is done in a non scalable way since the
18    length of colmap equals the global matrix length.
19 */
#undef __FUNC__
#define __FUNC__ "CreateColmap_MPIBAIJ_Private"
/*
   Builds baij->colmap, mapping a global block-column number to
   1 + the point-column offset of that block inside the off-diagonal
   local matrix baij->B; an entry of 0 means "block not present".
   With USE_CTABLE the map is a hash table keyed by (global block
   column + 1); otherwise it is a dense int array of length baij->Nbs
   (global number of block columns), which is why this is not scalable.
*/
static int CreateColmap_MPIBAIJ_Private(Mat mat)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
  int         nbs = B->nbs,i,bs=B->bs;   /* nbs = local block columns of B; bs = block size */
#if defined (USE_CTABLE)
  int         ierr;
#endif

  PetscFunctionBegin;
#if defined (USE_CTABLE)
  /* keys and values are shifted by +1 so that 0 can serve as "absent" */
  ierr = TableCreate(baij->nbs/5,&baij->colmap); CHKERRQ(ierr);
  for ( i=0; i<nbs; i++ ){
    ierr = TableAdd(baij->colmap,baij->garray[i]+1,i*bs+1);CHKERRQ(ierr);
  }
#else
  baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
  PLogObjectMemory(mat,baij->Nbs*sizeof(int));
  PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
  /* garray[i] is the global block column of local block column i of B */
  for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
#endif
  PetscFunctionReturn(0);
}
45 
/* Number of extra block entries added to a row each time it runs out of space */
#define CHUNKSIZE  10

/*
   Statement-macro that inserts (INSERT_VALUES) or adds (ADD_VALUES) one
   scalar at point position (row,col) of the local "diagonal" matrix whose
   Mat_SeqBAIJ data is in `a`.  It searches the sorted block row for block
   column col/bs, enlarging the whole matrix storage by CHUNKSIZE block
   entries when the row is full, then shifts later entries to keep the row
   sorted.  All working variables (a, aa, ai, aj, ailen, aimax, rp, ap,
   bap, brow, bcol, ridx, cidx, low, high, t, _i, ii, nrow, rmax, N, bs,
   bs2, A) must be declared by the caller -- see MatSetValues_MPIBAIJ.
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      /* binary search narrows the window to <= 3 entries, then scan linearly */ \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          /* block already present: update the (ridx,cidx) element in place */ \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots, opening a CHUNKSIZE gap after this row */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        /* NOTE(review): nz is bumped only on this reallocation path; presumably it is recomputed at assembly -- confirm */ \
        a->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
122 
/*
   Statement-macro identical in structure to MatSetValues_SeqBAIJ_A_Private,
   but operating on the "off-diagonal" matrix whose Mat_SeqBAIJ data is in
   `b` (arrays ba/bi/bj/bilen/bimax, logging object B).  Inserts or adds one
   scalar at point position (row,col), enlarging storage by CHUNKSIZE block
   entries when the row is full.  Working variables come from caller scope;
   see MatSetValues_MPIBAIJ.
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      /* binary search narrows the window to <= 3 entries, then scan linearly */ \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          /* block already present: update the (ridx,cidx) element in place */ \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots, opening a CHUNKSIZE gap after this row */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        /* NOTE(review): nz is bumped only on this reallocation path; presumably it is recomputed at assembly -- confirm */ \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
197 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   Point-wise MatSetValues for the parallel BAIJ format.  Locally owned
   rows are written directly into the diagonal (baij->A) or off-diagonal
   (baij->B) sequential blocks via the macros above; values destined for
   rows owned by other processes are stashed (unless donotstash) and
   communicated during assembly.  im/in are global point indices; v is
   read row-major when baij->roworiented, column-major otherwise.
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
    if (im[i] < 0) continue;   /* negative rows are silently skipped */
#if defined(USE_PETSC_BOPT_g)
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column falls in the diagonal portion owned by this process */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
        else if (in[j] < 0) continue;
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* off-diagonal portion: translate the global column through colmap */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
#if defined (USE_CTABLE)
            ierr = TableFind(baij->colmap, in[j]/bs + 1,&col); CHKERRQ(ierr);
            col  = col - 1 + in[j]%bs;
#else
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
#endif
            /* NOTE(review): the absent-block test below relies on col being
               negative; for in[j]%bs > 0 a missing block (colmap value 0)
               yields col >= 0 -- confirm this is handled upstream */
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              /* a brand-new off-process block: disassemble and retry globally */
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* row owned by another process: stash for communication at assembly time */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
	  for ( j=0; j<n; j++ ) {
	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
284 
285 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
286 #undef __FUNC__
287 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
288 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
289 {
290   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
291   Scalar      *value,*barray=baij->barray;
292   int         ierr,i,j,ii,jj,row,col,k,l;
293   int         roworiented = baij->roworiented,rstart=baij->rstart ;
294   int         rend=baij->rend,cstart=baij->cstart,stepval;
295   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
296 
297   if(!barray) {
298     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
299   }
300 
301   if (roworiented) {
302     stepval = (n-1)*bs;
303   } else {
304     stepval = (m-1)*bs;
305   }
306   for ( i=0; i<m; i++ ) {
307     if (im[i] < 0) continue;
308 #if defined(USE_PETSC_BOPT_g)
309     if (im[i] >= baij->Mbs) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large, row %d max %d",im[i],baij->Mbs);
310 #endif
311     if (im[i] >= rstart && im[i] < rend) {
312       row = im[i] - rstart;
313       for ( j=0; j<n; j++ ) {
314         /* If NumCol = 1 then a copy is not required */
315         if ((roworiented) && (n == 1)) {
316           barray = v + i*bs2;
317         } else if((!roworiented) && (m == 1)) {
318           barray = v + j*bs2;
319         } else { /* Here a copy is required */
320           if (roworiented) {
321             value = v + i*(stepval+bs)*bs + j*bs;
322           } else {
323             value = v + j*(stepval+bs)*bs + i*bs;
324           }
325           for ( ii=0; ii<bs; ii++,value+=stepval ) {
326             for (jj=0; jj<bs; jj++ ) {
327               *barray++  = *value++;
328             }
329           }
330           barray -=bs2;
331         }
332 
333         if (in[j] >= cstart && in[j] < cend){
334           col  = in[j] - cstart;
335           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
336         }
337         else if (in[j] < 0) continue;
338 #if defined(USE_PETSC_BOPT_g)
339         else if (in[j] >= baij->Nbs) {SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large, col %d max %d",in[j],baij->Nbs);}
340 #endif
341         else {
342           if (mat->was_assembled) {
343             if (!baij->colmap) {
344               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
345             }
346 
347 #if defined(USE_PETSC_BOPT_g)
348 #if defined (USE_CTABLE)
349             { int data;
350             ierr = TableFind(baij->colmap,in[j]+1,&data); CHKERRQ(ierr);
351             if((data - 1) % bs)
352 	      {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
353             }
354 #else
355             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
356 #endif
357 #endif
358 #if defined (USE_CTABLE)
359 	    ierr = TableFind(baij->colmap,in[j]+1,&col); CHKERRQ(ierr);
360             col  = (col - 1)/bs;
361 #else
362             col = (baij->colmap[in[j]] - 1)/bs;
363 #endif
364             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
365               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
366               col =  in[j];
367             }
368           }
369           else col = in[j];
370           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
371         }
372       }
373     } else {
374       if (!baij->donotstash) {
375         if (roworiented ) {
376           row   = im[i]*bs;
377           value = v + i*(stepval+bs)*bs;
378           for ( j=0; j<bs; j++,row++ ) {
379             for ( k=0; k<n; k++ ) {
380               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
381                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
382               }
383             }
384           }
385         } else {
386           for ( j=0; j<n; j++ ) {
387             value = v + j*(stepval+bs)*bs + i*bs;
388             col   = in[j]*bs;
389             for ( k=0; k<bs; k++,col++,value+=stepval) {
390               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
391                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
392               }
393             }
394           }
395         }
396       }
397     }
398   }
399   PetscFunctionReturn(0);
400 }
/* Multiplicative hashing: HASH_KEY is (approximately) the fractional part
   of the golden ratio, as in Knuth's multiplicative hash scheme.
   HASH(size,key,tmp) evaluates to (int)(size * frac(key*HASH_KEY));
   tmp must be a double lvalue and is clobbered.  The commented-out fmod()
   variants below are equivalent formulations kept for reference. */
#define HASH_KEY 0.6180339887
/* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ_HT"
/*
   Point-wise MatSetValues that locates the target block through the
   precomputed hash table (baij->ht keys, baij->hd block addresses) instead
   of searching the BAIJ structure.  Keys are 1 + (block row)*Nbs + block
   column; collisions are resolved by linear probing with wrap-around.
   Only existing nonzero blocks can be addressed -- a miss is an error.
   Off-process rows are stashed as in MatSetValues_MPIBAIJ.
*/
int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
  int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
  double      tmp;
  Scalar      ** HD = baij->hd,value;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;   /* probe statistics */
#endif

  PetscFunctionBegin;

  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
      row = im[i];
    if (row >= rstart_orig && row < rend_orig) {
      for ( j=0; j<n; j++ ) {
        col = in[j];
        if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
        /* Look up into the Hash Table */
        key = (row/bs)*Nbs+(col/bs)+1;
        h1  = HASH(size,key,tmp);


        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        insert_ct++;
        total_ct++;
        /* linear probe from h1 to end of table, then wrap to the front */
        if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        /* A HASH table entry is found, so insert the values at the correct address */
        if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
        else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
      }
    } else {
      /* row owned by another process: stash for communication at assembly */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
	  for ( j=0; j<n; j++ ) {
	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
484 
#undef __FUNC__
#define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
/*
   Block-wise MatSetValues through the hash table (see
   MatSetValues_MPIBAIJ_HT).  im/in are global BLOCK indices; keys are
   1 + (block row)*Nbs + block column, resolved by linear probing with
   wrap-around.  HD[idx] is the address of the bs2 values of the target
   block, stored column-major, so row-oriented input is transposed while
   copying.  Off-process block rows are stashed point-wise.
*/
int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,ii,jj,row,col,k,l;
  int         roworiented = baij->roworiented,rstart=baij->rstart ;
  int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
  int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
  double      tmp;
  Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;   /* probe statistics */
#endif

  PetscFunctionBegin;

  /* stride (in Scalars) from the end of one block row of v to the next */
  if (roworiented) {
    stepval = (n-1)*bs;
  } else {
    stepval = (m-1)*bs;
  }
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    row   = im[i];
    v_t   = v + i*bs2;
    if (row >= rstart && row < rend) {
      for ( j=0; j<n; j++ ) {
        col = in[j];

        /* Look up into the Hash Table */
        key = row*Nbs+col+1;
        h1  = HASH(size,key,tmp);

        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        total_ct++;
        insert_ct++;
        /* linear probe from h1 to end of table, then wrap to the front */
       if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        baij_a = HD[idx];
        if (roworiented) {
          /*value = v + i*(stepval+bs)*bs + j*bs;*/
          /* value = v + (i*(stepval+bs)+j)*bs; */
          value = v_t;
          v_t  += bs;
          /* transpose the row-major input block into column-major storage */
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        } else {
          value = v + j*(stepval+bs)*bs + i*bs;
          /* input already column-major: copy column by column */
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        }
      }
    } else {
      /* block row owned by another process: stash point-wise */
      if (!baij->donotstash) {
        if (roworiented ) {
          row   = im[i]*bs;
          value = v + i*(stepval+bs)*bs;
          for ( j=0; j<bs; j++,row++ ) {
            for ( k=0; k<n; k++ ) {
              for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        } else {
          for ( j=0; j<n; j++ ) {
            value = v + j*(stepval+bs)*bs + i*bs;
            col   = in[j]*bs;
            for ( k=0; k<bs; k++,col++,value+=stepval) {
              for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
614 
#undef __FUNC__
#define __FUNC__ "MatGetValues_MPIBAIJ"
/*
   Retrieves entries at global point positions (idxm[i],idxn[j]) into
   v[i*n+j] (row-major).  Only rows owned by this process may be queried.
   Columns in the diagonal range come from baij->A; other columns are
   translated through colmap into baij->B, and positions whose block is
   not present locally yield 0.0.
*/
int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
  int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col,data;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
    if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
    if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
      row = idxm[i] - bsrstart;
      for ( j=0; j<n; j++ ) {
        if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
        if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
        if (idxn[j] >= bscstart && idxn[j] < bscend){
          /* locally owned diagonal part */
          col = idxn[j] - bscstart;
          ierr = MatGetValues_SeqBAIJ(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
        } else {
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
          }
#if defined (USE_CTABLE)
          ierr = TableFind(baij->colmap,idxn[j]/bs+1,&data); CHKERRQ(ierr);
          data --;
#else
          data = baij->colmap[idxn[j]/bs]-1;
#endif
          /* verify via garray that the block really holds this global column */
          if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
          else {
            col  = data + idxn[j]%bs;
            ierr = MatGetValues_SeqBAIJ(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
          }
        }
      }
    } else {
      SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
    }
  }
 PetscFunctionReturn(0);
}
658 
659 #undef __FUNC__
660 #define __FUNC__ "MatNorm_MPIBAIJ"
661 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
662 {
663   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
664   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
665   int        ierr, i,bs2=baij->bs2;
666   double     sum = 0.0;
667   Scalar     *v;
668 
669   PetscFunctionBegin;
670   if (baij->size == 1) {
671     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
672   } else {
673     if (type == NORM_FROBENIUS) {
674       v = amat->a;
675       for (i=0; i<amat->nz*bs2; i++ ) {
676 #if defined(USE_PETSC_COMPLEX)
677         sum += PetscReal(PetscConj(*v)*(*v)); v++;
678 #else
679         sum += (*v)*(*v); v++;
680 #endif
681       }
682       v = bmat->a;
683       for (i=0; i<bmat->nz*bs2; i++ ) {
684 #if defined(USE_PETSC_COMPLEX)
685         sum += PetscReal(PetscConj(*v)*(*v)); v++;
686 #else
687         sum += (*v)*(*v); v++;
688 #endif
689       }
690       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
691       *norm = sqrt(*norm);
692     } else {
693       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
694     }
695   }
696   PetscFunctionReturn(0);
697 }
698 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   Starts assembly of a parallel BAIJ matrix: verifies that all processes
   used a consistent InsertMode, determines which process owns each stashed
   (row,col,value) triple, posts non-blocking receives sized for the
   largest possible message, and sends each triple packed as three Scalars
   (index, index, value) to its owner.  The outstanding requests and
   buffers are recorded in baij for MatAssemblyEnd to complete.
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  if (baij->donotstash) {
    /* nothing was stashed: record an empty communication pattern and return */
    baij->svalues    = 0; baij->rvalues    = 0;
    baij->nsends     = 0; baij->nrecvs     = 0;
    baij->send_waits = 0; baij->recv_waits = 0;
    baij->rmax       = 0;
    PetscFunctionReturn(0);
  }

  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    /* rowners holds block-row ownership ranges, so scale by bs for point rows */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues    = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack each stashed triple as (row, col, value); starts[] advances as we fill */
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* rebuild starts[], consumed by the packing loop above */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(baij->A,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* record the in-flight communication for MatAssemblyEnd */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
803 
804 /*
805   Creates the hash table, and sets the table
806   This table is created only once.
807   If new entried need to be added to the matrix
808   then the hash table has to be destroyed and
809   recreated.
810 */
811 #undef __FUNC__
812 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
813 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
814 {
815   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
816   Mat         A = baij->A, B=baij->B;
817   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
818   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
819   int         size,bs2=baij->bs2,rstart=baij->rstart;
820   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
821   int         *HT,key;
822   Scalar      **HD;
823   double      tmp;
824 #if defined(USE_PETSC_BOPT_g)
825   int         ct=0,max=0;
826 #endif
827 
828   PetscFunctionBegin;
829   baij->ht_size=(int)(factor*nz);
830   size = baij->ht_size;
831 
832   if (baij->ht) {
833     PetscFunctionReturn(0);
834   }
835 
836   /* Allocate Memory for Hash Table */
837   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
838   baij->ht = (int*)(baij->hd + size);
839   HD = baij->hd;
840   HT = baij->ht;
841 
842 
843   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
844 
845 
846   /* Loop Over A */
847   for ( i=0; i<a->mbs; i++ ) {
848     for ( j=ai[i]; j<ai[i+1]; j++ ) {
849       row = i+rstart;
850       col = aj[j]+cstart;
851 
852       key = row*Nbs + col + 1;
853       h1  = HASH(size,key,tmp);
854       for ( k=0; k<size; k++ ){
855         if (HT[(h1+k)%size] == 0.0) {
856           HT[(h1+k)%size] = key;
857           HD[(h1+k)%size] = a->a + j*bs2;
858           break;
859 #if defined(USE_PETSC_BOPT_g)
860         } else {
861           ct++;
862 #endif
863         }
864       }
865 #if defined(USE_PETSC_BOPT_g)
866       if (k> max) max = k;
867 #endif
868     }
869   }
870   /* Loop Over B */
871   for ( i=0; i<b->mbs; i++ ) {
872     for ( j=bi[i]; j<bi[i+1]; j++ ) {
873       row = i+rstart;
874       col = garray[bj[j]];
875       key = row*Nbs + col + 1;
876       h1  = HASH(size,key,tmp);
877       for ( k=0; k<size; k++ ){
878         if (HT[(h1+k)%size] == 0.0) {
879           HT[(h1+k)%size] = key;
880           HD[(h1+k)%size] = b->a + j*bs2;
881           break;
882 #if defined(USE_PETSC_BOPT_g)
883         } else {
884           ct++;
885 #endif
886         }
887       }
888 #if defined(USE_PETSC_BOPT_g)
889       if (k> max) max = k;
890 #endif
891     }
892   }
893 
894   /* Print Summary */
895 #if defined(USE_PETSC_BOPT_g)
896   for ( i=0,j=0; i<size; i++)
897     if (HT[i]) {j++;}
898   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
899            (j== 0)? 0.0:((double)(ct+j))/j,max);
900 #endif
901   PetscFunctionReturn(0);
902 }
903 
#undef __FUNC__
#define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   Completes assembly started in MatAssemblyBegin: drains the posted
   receives of off-processor (row,col,value) triples, inserts them into
   the local A (diagonal) or B (off-diagonal) parts, waits on the sends,
   then assembles A and B and sets up the parallel scatter machinery.
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    /* each stashed entry is a (row,col,value) triple of Scalars */
    n = n/3;
    for ( i=0; i<n; i++ ) {
      /* row/col were shipped as Scalars; convert back and shift row to local */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* entry falls in the diagonal block A; use local column index */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        /* off-diagonal entry goes into B */
        if (mat->was_assembled) {
          /* B stores compacted columns; need colmap to translate global->local */
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
#if defined (USE_CTABLE)
          ierr = TableFind(baij->colmap,col/bs+1,&col); CHKERRQ(ierr);
          col  = col - 1 + col%bs;
#else
          col = (baij->colmap[col/bs]) - 1 + col%bs;
#endif
          /* col < 0 means the block is new to B: must disassemble and
             fall back to the global column index (unless nonew forbids it) */
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  if (baij->recv_waits) PetscFree(baij->recv_waits);
  if (baij->rvalues)    PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  if (baij->send_waits) PetscFree(baij->send_waits);
  if (baij->svalues)    PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble. */
  /*
     if nonzero structure of submatrix B cannot change then we know that
     no processor disassembled thus we can skip this stuff
  */
  if (!((Mat_SeqBAIJ*) baij->B->data)->nonew)  {
    ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
    if (mat->was_assembled && !other_disassembled) {
      ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
    }
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

#if defined(USE_PETSC_BOPT_g)
  if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
             ((double)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  /* first final assembly with hash-table option on: build the table and
     switch the SetValues kernels to their hash-table variants */
  if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatCreateHashTable_MPIBAIJ_Private(mat,baij->ht_fact); CHKERRQ(ierr);
    mat->ops->setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops->setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }

  /* row-extraction scratch is invalid after assembly; discard it */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
1002 
1003 #undef __FUNC__
1004 #define __FUNC__ "MatView_MPIBAIJ_Binary"
1005 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
1006 {
1007   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1008   int          ierr;
1009 
1010   PetscFunctionBegin;
1011   if (baij->size == 1) {
1012     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1013   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNC__
1018 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworSocket"
1019 static int MatView_MPIBAIJ_ASCIIorDraworSocket(Mat mat,Viewer viewer)
1020 {
1021   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1022   int          ierr, format,bs = baij->bs, size = baij->size, rank = baij->rank;
1023   FILE         *fd;
1024   ViewerType   vtype;
1025 
1026   PetscFunctionBegin;
1027   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1028   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
1029     ierr = ViewerGetFormat(viewer,&format); CHKERRQ(ierr);
1030     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
1031       MatInfo info;
1032       MPI_Comm_rank(mat->comm,&rank);
1033       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
1034       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
1035       PetscSequentialPhaseBegin(mat->comm,1);
1036       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
1037               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
1038               baij->bs,(int)info.memory);
1039       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
1040       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1041       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
1042       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1043       fflush(fd);
1044       PetscSequentialPhaseEnd(mat->comm,1);
1045       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
1046       PetscFunctionReturn(0);
1047     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
1048       PetscPrintf(mat->comm,"  block size is %d\n",bs);
1049       PetscFunctionReturn(0);
1050     }
1051   }
1052 
1053   if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
1054     Draw       draw;
1055     PetscTruth isnull;
1056     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
1057     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
1058   }
1059 
1060   if (size == 1) {
1061     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1062   } else {
1063     /* assemble the entire matrix onto first processor. */
1064     Mat         A;
1065     Mat_SeqBAIJ *Aloc;
1066     int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
1067     int         mbs = baij->mbs;
1068     Scalar      *a;
1069 
1070     if (!rank) {
1071       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1072     } else {
1073       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1074     }
1075     PLogObjectParent(mat,A);
1076 
1077     /* copy over the A part */
1078     Aloc = (Mat_SeqBAIJ*) baij->A->data;
1079     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1080     rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1081 
1082     for ( i=0; i<mbs; i++ ) {
1083       rvals[0] = bs*(baij->rstart + i);
1084       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1085       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1086         col = (baij->cstart+aj[j])*bs;
1087         for (k=0; k<bs; k++ ) {
1088           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1089           col++; a += bs;
1090         }
1091       }
1092     }
1093     /* copy over the B part */
1094     Aloc = (Mat_SeqBAIJ*) baij->B->data;
1095     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1096     for ( i=0; i<mbs; i++ ) {
1097       rvals[0] = bs*(baij->rstart + i);
1098       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1099       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1100         col = baij->garray[aj[j]]*bs;
1101         for (k=0; k<bs; k++ ) {
1102           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1103           col++; a += bs;
1104         }
1105       }
1106     }
1107     PetscFree(rvals);
1108     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1109     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1110     /*
1111        Everyone has to call to draw the matrix since the graphics waits are
1112        synchronized across all processors that share the Draw object
1113     */
1114     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
1115       ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1116     }
1117     ierr = MatDestroy(A); CHKERRQ(ierr);
1118   }
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 
1123 
1124 #undef __FUNC__
1125 #define __FUNC__ "MatView_MPIBAIJ"
1126 int MatView_MPIBAIJ(Mat mat,Viewer viewer)
1127 {
1128   int         ierr;
1129   ViewerType  vtype;
1130 
1131   PetscFunctionBegin;
1132   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1133   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
1134       PetscTypeCompare(vtype,SOCKET_VIEWER)) {
1135     ierr = MatView_MPIBAIJ_ASCIIorDraworSocket(mat,viewer); CHKERRQ(ierr);
1136   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
1137     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1138   } else {
1139     SETERRQ(1,1,"Viewer type not supported by PETSc object");
1140   }
1141   PetscFunctionReturn(0);
1142 }
1143 
#undef __FUNC__
#define __FUNC__ "MatDestroy_MPIBAIJ"
/*
   Destroys the parallel matrix: releases the mappings/maps on the Mat
   header, the internal stash, the sequential A and B parts, and all
   auxiliary arrays, then destroys the header itself.
*/
int MatDestroy_MPIBAIJ(Mat mat)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  PetscFunctionBegin;
  /* reference counted: only the last reference actually destroys */
  if (--mat->refct > 0) PetscFunctionReturn(0);

  if (mat->mapping) {
    ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
  }
  if (mat->bmapping) {
    ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
  }
  if (mat->rmap) {
    ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
  }
  if (mat->cmap) {
    ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
  }
#if defined(USE_PETSC_LOG)
  PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
  PetscFree(baij->rowners);
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
#if defined (USE_CTABLE)
  if (baij->colmap) TableDelete(baij->colmap);
#else
  if (baij->colmap) PetscFree(baij->colmap);
#endif
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  if (baij->barray) PetscFree(baij->barray);
  /* freeing hd also frees ht: they share one allocation (see
     MatCreateHashTable_MPIBAIJ_Private) */
  if (baij->hd) PetscFree(baij->hd);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}
1190 
1191 #undef __FUNC__
1192 #define __FUNC__ "MatMult_MPIBAIJ"
1193 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1194 {
1195   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1196   int         ierr, nt;
1197 
1198   PetscFunctionBegin;
1199   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1200   if (nt != a->n) {
1201     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1202   }
1203   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
1204   if (nt != a->m) {
1205     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1206   }
1207   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1208   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
1209   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1210   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1211   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 #undef __FUNC__
1216 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1217 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1218 {
1219   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1220   int        ierr;
1221 
1222   PetscFunctionBegin;
1223   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1224   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1225   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1226   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1227   PetscFunctionReturn(0);
1228 }
1229 
1230 #undef __FUNC__
1231 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1232 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1233 {
1234   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1235   int         ierr;
1236 
1237   PetscFunctionBegin;
1238   /* do nondiagonal part */
1239   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1240   /* send it on its way */
1241   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1242   /* do local part */
1243   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1244   /* receive remote parts: note this assumes the values are not actually */
1245   /* inserted in yy until the next line, which is true for my implementation*/
1246   /* but is not perhaps always true. */
1247   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1248   PetscFunctionReturn(0);
1249 }
1250 
#undef __FUNC__
#define __FUNC__ "MatMultTransAdd_MPIBAIJ"
/*
   zz = A'*xx + yy.  Off-diagonal transpose product is scattered
   (reverse, additive) into zz while the local transpose-add runs.
*/
int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1271 
1272 /*
1273   This only works correctly for square matrices where the subblock A->A is the
1274    diagonal block
1275 */
1276 #undef __FUNC__
1277 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1278 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1279 {
1280   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1281   int         ierr;
1282 
1283   PetscFunctionBegin;
1284   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1285   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 #undef __FUNC__
1290 #define __FUNC__ "MatScale_MPIBAIJ"
1291 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1292 {
1293   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1294   int         ierr;
1295 
1296   PetscFunctionBegin;
1297   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1298   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1299   PetscFunctionReturn(0);
1300 }
1301 
1302 #undef __FUNC__
1303 #define __FUNC__ "MatGetSize_MPIBAIJ"
1304 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1305 {
1306   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1307 
1308   PetscFunctionBegin;
1309   if (m) *m = mat->M;
1310   if (n) *n = mat->N;
1311   PetscFunctionReturn(0);
1312 }
1313 
1314 #undef __FUNC__
1315 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1316 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1317 {
1318   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1319 
1320   PetscFunctionBegin;
1321   *m = mat->m; *n = mat->n;
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 #undef __FUNC__
1326 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1327 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1328 {
1329   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1330 
1331   PetscFunctionBegin;
1332   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1337 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1338 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   Extracts one locally-owned (point) row: fetches the row from both the
   diagonal (A) and off-diagonal (B) local parts and merges them into a
   single column-sorted list in the scratch arrays rowvalues/rowindices.
   idx and/or v may be NULL if the caller does not need them.
   Must be paired with MatRestoreRow_MPIBAIJ.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be extracted at a time per matrix */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds both the values and, after them, the indices */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* request only the pieces (values/indices) the caller asked for */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      /* imark = number of B entries whose global column precedes the
         diagonal block; order is B(left) A B(right) */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark already computed in the values pass above */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1420 
1421 #undef __FUNC__
1422 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1423 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1424 {
1425   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1426 
1427   PetscFunctionBegin;
1428   if (baij->getrowactive == PETSC_FALSE) {
1429     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1430   }
1431   baij->getrowactive = PETSC_FALSE;
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 #undef __FUNC__
1436 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1437 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1438 {
1439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1440 
1441   PetscFunctionBegin;
1442   *bs = baij->bs;
1443   PetscFunctionReturn(0);
1444 }
1445 
1446 #undef __FUNC__
1447 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1448 int MatZeroEntries_MPIBAIJ(Mat A)
1449 {
1450   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1451   int         ierr;
1452 
1453   PetscFunctionBegin;
1454   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1455   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 #undef __FUNC__
1460 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1461 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1462 {
1463   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1464   Mat         A = a->A, B = a->B;
1465   int         ierr;
1466   double      isend[5], irecv[5];
1467 
1468   PetscFunctionBegin;
1469   info->block_size     = (double)a->bs;
1470   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1471   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1472   isend[3] = info->memory;  isend[4] = info->mallocs;
1473   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1474   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1475   isend[3] += info->memory;  isend[4] += info->mallocs;
1476   if (flag == MAT_LOCAL) {
1477     info->nz_used      = isend[0];
1478     info->nz_allocated = isend[1];
1479     info->nz_unneeded  = isend[2];
1480     info->memory       = isend[3];
1481     info->mallocs      = isend[4];
1482   } else if (flag == MAT_GLOBAL_MAX) {
1483     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1484     info->nz_used      = irecv[0];
1485     info->nz_allocated = irecv[1];
1486     info->nz_unneeded  = irecv[2];
1487     info->memory       = irecv[3];
1488     info->mallocs      = irecv[4];
1489   } else if (flag == MAT_GLOBAL_SUM) {
1490     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1491     info->nz_used      = irecv[0];
1492     info->nz_allocated = irecv[1];
1493     info->nz_unneeded  = irecv[2];
1494     info->memory       = irecv[3];
1495     info->mallocs      = irecv[4];
1496   } else {
1497     SETERRQ1(1,1,"Unknown MatInfoType argument %d",flag);
1498   }
1499   info->rows_global       = (double)a->M;
1500   info->columns_global    = (double)a->N;
1501   info->rows_local        = (double)a->m;
1502   info->columns_local     = (double)a->N;
1503   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1504   info->fill_ratio_needed = 0;
1505   info->factor_mallocs    = 0;
1506   PetscFunctionReturn(0);
1507 }
1508 
1509 #undef __FUNC__
1510 #define __FUNC__ "MatSetOption_MPIBAIJ"
1511 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1512 {
1513   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1514   int         ierr;
1515 
1516   PetscFunctionBegin;
1517   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1518       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1519       op == MAT_COLUMNS_UNSORTED ||
1520       op == MAT_COLUMNS_SORTED ||
1521       op == MAT_NEW_NONZERO_ALLOCATION_ERR ||
1522       op == MAT_NEW_NONZERO_LOCATION_ERR) {
1523         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1524         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1525   } else if (op == MAT_ROW_ORIENTED) {
1526         a->roworiented = 1;
1527         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1528         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1529   } else if (op == MAT_ROWS_SORTED ||
1530              op == MAT_ROWS_UNSORTED ||
1531              op == MAT_SYMMETRIC ||
1532              op == MAT_STRUCTURALLY_SYMMETRIC ||
1533              op == MAT_YES_NEW_DIAGONALS ||
1534              op == MAT_USE_HASH_TABLE) {
1535     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1536   } else if (op == MAT_COLUMN_ORIENTED) {
1537     a->roworiented = 0;
1538     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1539     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1540   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1541     a->donotstash = 1;
1542   } else if (op == MAT_NO_NEW_DIAGONALS) {
1543     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1544   } else if (op == MAT_USE_HASH_TABLE) {
1545     a->ht_flag = 1;
1546   } else {
1547     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1548   }
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 #undef __FUNC__
1553 #define __FUNC__ "MatTranspose_MPIBAIJ("
1554 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1555 {
1556   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1557   Mat_SeqBAIJ *Aloc;
1558   Mat        B;
1559   int        ierr,M=baij->M,N=baij->N,*ai,*aj,i,*rvals,j,k,col;
1560   int        bs=baij->bs,mbs=baij->mbs;
1561   Scalar     *a;
1562 
1563   PetscFunctionBegin;
1564   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1565   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1566   CHKERRQ(ierr);
1567 
1568   /* copy over the A part */
1569   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1570   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1571   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1572 
1573   for ( i=0; i<mbs; i++ ) {
1574     rvals[0] = bs*(baij->rstart + i);
1575     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1576     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1577       col = (baij->cstart+aj[j])*bs;
1578       for (k=0; k<bs; k++ ) {
1579         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1580         col++; a += bs;
1581       }
1582     }
1583   }
1584   /* copy over the B part */
1585   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1586   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1587   for ( i=0; i<mbs; i++ ) {
1588     rvals[0] = bs*(baij->rstart + i);
1589     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1590     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1591       col = baij->garray[aj[j]]*bs;
1592       for (k=0; k<bs; k++ ) {
1593         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1594         col++; a += bs;
1595       }
1596     }
1597   }
1598   PetscFree(rvals);
1599   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1600   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1601 
1602   if (matout != PETSC_NULL) {
1603     *matout = B;
1604   } else {
1605     PetscOps *Abops;
1606     MatOps   Aops;
1607 
1608     /* This isn't really an in-place transpose .... but free data structures from baij */
1609     PetscFree(baij->rowners);
1610     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1611     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1612 #if defined (USE_CTABLE)
1613     if (baij->colmap) TableDelete(baij->colmap);
1614 #else
1615     if (baij->colmap) PetscFree(baij->colmap);
1616 #endif
1617     if (baij->garray) PetscFree(baij->garray);
1618     if (baij->lvec) VecDestroy(baij->lvec);
1619     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1620     PetscFree(baij);
1621 
1622     /*
1623        This is horrible, horrible code. We need to keep the
1624       A pointers for the bops and ops but copy everything
1625       else from C.
1626     */
1627     Abops = A->bops;
1628     Aops  = A->ops;
1629     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1630     A->bops = Abops;
1631     A->ops  = Aops;
1632 
1633     PetscHeaderDestroy(B);
1634   }
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 #undef __FUNC__
1639 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1640 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1641 {
1642   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1643   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1644   int ierr,s1,s2,s3;
1645 
1646   PetscFunctionBegin;
1647   if (ll)  {
1648     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1649     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1650     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1651     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1652     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1653   }
1654   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1655   PetscFunctionReturn(0);
1656 }
1657 
#undef __FUNC__
#define __FUNC__ "MatZeroRows_MPIBAIJ"
/*
   Zeros the rows of A listed in is.  Row indices are global, unblocked
   (point) indices and may refer to rows owned by other processors: each
   processor first ships the indices it was handed to the owning
   processors, then every owner zeros its own local rows.  If diag is
   non-null the diagonal entry of each zeroed row is set to *diag.
*/
int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  /* owners[] holds the block-row ownership ranges, so scale by bs to
     compare against the unblocked (point) row indices in rows[] */
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  /* first Allreduce: work[rank] = number of messages this processor will
     receive; second: the maximum message length over all senders */
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank];
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  PetscFree(work);

  /* post receives:   */
  /* receives are posted before the sends below so messages can arrive
     in any order; each incoming message fits in a stride of nmax ints */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack the row indices by destination processor; this pass mutates
     starts[], which is why it is rebuilt immediately afterwards */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  /* first global point row owned by this processor; used below to
     convert received global indices to local indices */
  base = owners[rank]*bs;

  /*  wait on receives */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);

  /*
        Zero the required rows. If the "diagonal block" of the matrix
     is square and the user wishes to set the diagonal we use seperate
     code so that MatSetValues() is not called for each diagonal allocating
     new memory, thus calling lots of mallocs and slowing things down.

       Contributed by: Mathew Knepley
  */
  /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  if (diag && (l->A->M == l->A->N)) {
    ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
  } else if (diag) {
    /* non-square diagonal block: the diagonal entries may fall in l->B,
       so they must be inserted through the parallel MatSetValues() path */
    ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
    if (((Mat_SeqBAIJ*)l->A->data)->nonew) {
      SETERRQ(PETSC_ERR_SUP,0,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
    }
    for ( i = 0; i < slen; i++ ) {
      row  = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
  }

  ierr = ISDestroy(istmp); CHKERRQ(ierr);
  PetscFree(lrows);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}
1806 
1807 extern int MatPrintHelp_SeqBAIJ(Mat);
1808 #undef __FUNC__
1809 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1810 int MatPrintHelp_MPIBAIJ(Mat A)
1811 {
1812   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1813   MPI_Comm    comm = A->comm;
1814   static int  called = 0;
1815   int         ierr;
1816 
1817   PetscFunctionBegin;
1818   if (!a->rank) {
1819     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1820   }
1821   if (called) {PetscFunctionReturn(0);} else called = 1;
1822   (*PetscHelpPrintf)(comm," Options for MATMPIBAIJ matrix format (the defaults):\n");
1823   (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");
1824   PetscFunctionReturn(0);
1825 }
1826 
1827 #undef __FUNC__
1828 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1829 int MatSetUnfactored_MPIBAIJ(Mat A)
1830 {
1831   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1832   int         ierr;
1833 
1834   PetscFunctionBegin;
1835   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1836   PetscFunctionReturn(0);
1837 }
1838 
static int MatDuplicate_MPIBAIJ(Mat,MatDuplicateOption,Mat *);

/* -------------------------------------------------------------------*/
/*
   Function table for the MPIBAIJ matrix format.  The entries are
   positional and must follow the slot order of struct _MatOps exactly;
   a 0 entry means the corresponding operation is not provided by this
   format (note that destroy and view are set separately on the object
   in MatCreateMPIBAIJ(), not through this table).
*/
static struct _MatOps MatOps_Values = {
  MatSetValues_MPIBAIJ,
  MatGetRow_MPIBAIJ,
  MatRestoreRow_MPIBAIJ,
  MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,
  MatMultTrans_MPIBAIJ,
  MatMultTransAdd_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  MatTranspose_MPIBAIJ,
  MatGetInfo_MPIBAIJ,
  0,
  MatGetDiagonal_MPIBAIJ,
  MatDiagonalScale_MPIBAIJ,
  MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,
  MatAssemblyEnd_MPIBAIJ,
  0,
  MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,
  MatZeroRows_MPIBAIJ,
  0,
  0,
  0,
  0,
  MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,
  MatGetOwnershipRange_MPIBAIJ,
  0,
  0,
  0,
  0,
  MatDuplicate_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,
  MatGetValues_MPIBAIJ,
  0,
  MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,
  0,
  0,
  0,
  MatGetBlockSize_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  MatSetUnfactored_MPIBAIJ,
  0,
  MatSetValuesBlocked_MPIBAIJ,
  0,
  0,
  0,
  MatGetMaps_Petsc};
1909 
1910 
1911 EXTERN_C_BEGIN
1912 #undef __FUNC__
1913 #define __FUNC__ "MatGetDiagonalBlock_MPIBAIJ"
1914 int MatGetDiagonalBlock_MPIBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1915 {
1916   PetscFunctionBegin;
1917   *a      = ((Mat_MPIBAIJ *)A->data)->A;
1918   *iscopy = PETSC_FALSE;
1919   PetscFunctionReturn(0);
1920 }
1921 EXTERN_C_END
1922 
#undef __FUNC__
#define __FUNC__ "MatCreateMPIBAIJ"
/*@C
   MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
   (block compressed row).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameters
   d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
   performance can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  bs   - size of block
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
           This value should be the same as the local size used in creating the
           x vector for the matrix-vector product y = Ax.
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz  - number of block nonzeros per block row in diagonal portion of local
           submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local submatrix (possibly different for each block
           row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
.  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
           submatrix (same for all local rows).
-  o_nnz - array containing the number of nonzeros in the various block rows of the
           off-diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.

   Output Parameter:
.  A - the matrix

   Options Database Keys:
.   -mat_no_unroll - uses code that does not unroll the loops in the
                     block calculations (much slower)
.   -mat_block_size - size of the blocks to use
.   -mat_mpi - use the parallel matrix data structures even on one processor
               (defaults to using SeqBAIJ format on one processor)

   Notes:
   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   If PETSC_DECIDE or  PETSC_DETERMINE is used for a particular argument on one processor
   then it must be used on all processors that share the object for that argument.

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both).  Set
   d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  Likewise, specify preallocated storage for the
   off-diagonal part of the local submatrix with o_nz or o_nnz (not both).

   Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
   the figure below we depict these three local rows and all columns (0-11).

.vb
           0 1 2 3 4 5 6 7 8 9 10 11
          -------------------
   row 3  |  o o o d d d o o o o o o
   row 4  |  o o o d d d o o o o o o
   row 5  |  o o o d d d o o o o o o
          -------------------
.ve

   Thus, any entries in the d locations are stored in the d (diagonal)
   submatrix, and any entries in the o locations are stored in the
   o (off-diagonal) submatrix.  Note that the d and the o submatrices are
   stored simply in the MATSEQBAIJ format for compressed row storage.

   Now d_nz should indicate the number of block nonzeros per row in the d matrix,
   and o_nz should indicate the number of block nonzeros per row in the o matrix.
   In general, for PDE problems in which most nonzeros are near the diagonal,
   one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
   or you will get TERRIBLE performance; see the users' manual chapter on
   matrices.

   Level: intermediate

.keywords: matrix, block, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatCreateMPIAIJ()
@*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size,flg;
  int          flag1 = 0,flag2 = 0;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  MPI_Comm_size(comm,&size);
  ierr = OptionsHasName(PETSC_NULL,"-mat_mpibaij",&flag1); CHKERRQ(ierr);
  ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
  /* on one processor default to the sequential BAIJ format unless the
     user explicitly requested the parallel data structures */
  if (!flag1 && !flag2 && size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));

  B->ops->destroy    = MatDestroy_MPIBAIJ;
  B->ops->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* validate the size arguments: per-row preallocation needs known local
     sizes, and at least one of each (local, global) pair must be given */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* when both local and global sizes are given, the local sizes win and
     the global sizes are recomputed from them below */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* global sizes are the sum of the local sizes over the communicator */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* local sizes are deduced by distributing the global block count as
     evenly as possible, low-rank processors getting the extra blocks */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* the information in the maps duplicates the information computed below, eventually
     we should remove the duplicate information that is not contained in the maps */
  ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
  ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);

  /* build local table of row and column ownerships */
  /* rowners/cowners are prefix sums of the per-processor block counts:
     processor p owns block rows [rowners[p], rowners[p+1]) */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  /* *_bs variants are in point (unblocked) indices */
  b->rstart_bs = b->rstart * bs;
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* the off-diagonal block B is created with the full global column
     width N; its columns are compacted later during assembly */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_flag      = 0;
  b->ht_fact      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  *A = B;
  ierr = OptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg); CHKERRQ(ierr);
  if (flg) {
    double fact = 1.39;
    ierr = MatSetOption(B,MAT_USE_HASH_TABLE); CHKERRQ(ierr);
    ierr = OptionsGetDouble(PETSC_NULL,"-mat_use_hash_table",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    ierr = MatMPIBAIJSetHashTableFactor(B,fact); CHKERRQ(ierr);
    PLogInfo(0,"MatCreateMPIBAIJ:Hash table Factor used %5.2f\n",fact);
  }
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetDiagonalBlock_C",
                                     "MatGetDiagonalBlock_MPIBAIJ",
                                     (void*)MatGetDiagonalBlock_MPIBAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2179 
2180 #undef __FUNC__
2181 #define __FUNC__ "MatDuplicate_MPIBAIJ"
2182 static int MatDuplicate_MPIBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
2183 {
2184   Mat         mat;
2185   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
2186   int         ierr, len=0, flg;
2187 
2188   PetscFunctionBegin;
2189   *newmat       = 0;
2190   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",matin->comm,MatDestroy,MatView);
2191   PLogObjectCreate(mat);
2192   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
2193   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
2194   mat->ops->destroy    = MatDestroy_MPIBAIJ;
2195   mat->ops->view       = MatView_MPIBAIJ;
2196   mat->factor          = matin->factor;
2197   mat->assembled       = PETSC_TRUE;
2198   mat->insertmode      = NOT_SET_VALUES;
2199 
2200   a->m = mat->m   = oldmat->m;
2201   a->n = mat->n   = oldmat->n;
2202   a->M = mat->M   = oldmat->M;
2203   a->N = mat->N   = oldmat->N;
2204 
2205   a->bs  = oldmat->bs;
2206   a->bs2 = oldmat->bs2;
2207   a->mbs = oldmat->mbs;
2208   a->nbs = oldmat->nbs;
2209   a->Mbs = oldmat->Mbs;
2210   a->Nbs = oldmat->Nbs;
2211 
2212   a->rstart       = oldmat->rstart;
2213   a->rend         = oldmat->rend;
2214   a->cstart       = oldmat->cstart;
2215   a->cend         = oldmat->cend;
2216   a->size         = oldmat->size;
2217   a->rank         = oldmat->rank;
2218   a->donotstash   = oldmat->donotstash;
2219   a->roworiented  = oldmat->roworiented;
2220   a->rowindices   = 0;
2221   a->rowvalues    = 0;
2222   a->getrowactive = PETSC_FALSE;
2223   a->barray       = 0;
2224   a->rstart_bs    = oldmat->rstart_bs;
2225   a->rend_bs      = oldmat->rend_bs;
2226   a->cstart_bs    = oldmat->cstart_bs;
2227   a->cend_bs      = oldmat->cend_bs;
2228 
2229   /* hash table stuff */
2230   a->ht           = 0;
2231   a->hd           = 0;
2232   a->ht_size      = 0;
2233   a->ht_flag      = oldmat->ht_flag;
2234   a->ht_fact      = oldmat->ht_fact;
2235   a->ht_total_ct  = 0;
2236   a->ht_insert_ct = 0;
2237 
2238 
2239   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2240   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2241   a->cowners = a->rowners + a->size + 2;
2242   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2243   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2244   if (oldmat->colmap) {
2245 #if defined (USE_CTABLE)
2246   ierr = TableCreateCopy(oldmat->colmap,&a->colmap); CHKERRQ(ierr);
2247 #else
2248     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2249     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2250     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2251 #endif
2252   } else a->colmap = 0;
2253   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2254     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2255     PLogObjectMemory(mat,len*sizeof(int));
2256     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2257   } else a->garray = 0;
2258 
2259   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2260   PLogObjectParent(mat,a->lvec);
2261   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2262 
2263   PLogObjectParent(mat,a->Mvctx);
2264   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
2265   PLogObjectParent(mat,a->A);
2266   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
2267   PLogObjectParent(mat,a->B);
2268   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2269   ierr = FListDuplicate(mat->qlist,&matin->qlist);CHKERRQ(ierr);
2270   if (flg) {
2271     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2272   }
2273   *newmat = mat;
2274   PetscFunctionReturn(0);
2275 }
2276 
2277 #include "sys.h"
2278 
2279 #undef __FUNC__
2280 #define __FUNC__ "MatLoad_MPIBAIJ"
2281 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
2282 {
2283   Mat          A;
2284   int          i, nz, ierr, j,rstart, rend, fd;
2285   Scalar       *vals,*buf;
2286   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2287   MPI_Status   status;
2288   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2289   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
2290   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2291   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2292   int          dcount,kmax,k,nzcount,tmp;
2293 
2294   PetscFunctionBegin;
2295   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2296 
2297   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2298   if (!rank) {
2299     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2300     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2301     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2302     if (header[3] < 0) {
2303       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2304     }
2305   }
2306 
2307   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2308   M = header[1]; N = header[2];
2309 
2310   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2311 
2312   /*
2313      This code adds extra rows to make sure the number of rows is
2314      divisible by the blocksize
2315   */
2316   Mbs        = M/bs;
2317   extra_rows = bs - M + bs*(Mbs);
2318   if (extra_rows == bs) extra_rows = 0;
2319   else                  Mbs++;
2320   if (extra_rows &&!rank) {
2321     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2322   }
2323 
2324   /* determine ownership of all rows */
2325   mbs = Mbs/size + ((Mbs % size) > rank);
2326   m   = mbs * bs;
2327   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2328   browners = rowners + size + 1;
2329   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2330   rowners[0] = 0;
2331   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2332   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2333   rstart = rowners[rank];
2334   rend   = rowners[rank+1];
2335 
2336   /* distribute row lengths to all processors */
2337   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2338   if (!rank) {
2339     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2340     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2341     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2342     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2343     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2344     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2345     PetscFree(sndcounts);
2346   } else {
2347     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2348   }
2349 
2350   if (!rank) {
2351     /* calculate the number of nonzeros on each processor */
2352     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2353     PetscMemzero(procsnz,size*sizeof(int));
2354     for ( i=0; i<size; i++ ) {
2355       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2356         procsnz[i] += rowlengths[j];
2357       }
2358     }
2359     PetscFree(rowlengths);
2360 
2361     /* determine max buffer needed and allocate it */
2362     maxnz = 0;
2363     for ( i=0; i<size; i++ ) {
2364       maxnz = PetscMax(maxnz,procsnz[i]);
2365     }
2366     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2367 
2368     /* read in my part of the matrix column indices  */
2369     nz = procsnz[0];
2370     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2371     mycols = ibuf;
2372     if (size == 1)  nz -= extra_rows;
2373     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2374     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2375 
2376     /* read in every ones (except the last) and ship off */
2377     for ( i=1; i<size-1; i++ ) {
2378       nz   = procsnz[i];
2379       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2380       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2381     }
2382     /* read in the stuff for the last proc */
2383     if ( size != 1 ) {
2384       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2385       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2386       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2387       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2388     }
2389     PetscFree(cols);
2390   } else {
2391     /* determine buffer space needed for message */
2392     nz = 0;
2393     for ( i=0; i<m; i++ ) {
2394       nz += locrowlens[i];
2395     }
2396     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2397     mycols = ibuf;
2398     /* receive message of column indices*/
2399     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2400     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2401     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2402   }
2403 
2404   /* loop over local rows, determining number of off diagonal entries */
2405   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2406   odlens = dlens + (rend-rstart);
2407   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2408   PetscMemzero(mask,3*Mbs*sizeof(int));
2409   masked1 = mask    + Mbs;
2410   masked2 = masked1 + Mbs;
2411   rowcount = 0; nzcount = 0;
2412   for ( i=0; i<mbs; i++ ) {
2413     dcount  = 0;
2414     odcount = 0;
2415     for ( j=0; j<bs; j++ ) {
2416       kmax = locrowlens[rowcount];
2417       for ( k=0; k<kmax; k++ ) {
2418         tmp = mycols[nzcount++]/bs;
2419         if (!mask[tmp]) {
2420           mask[tmp] = 1;
2421           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2422           else masked1[dcount++] = tmp;
2423         }
2424       }
2425       rowcount++;
2426     }
2427 
2428     dlens[i]  = dcount;
2429     odlens[i] = odcount;
2430 
2431     /* zero out the mask elements we set */
2432     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2433     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2434   }
2435 
2436   /* create our matrix */
2437   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2438          CHKERRQ(ierr);
2439   A = *newmat;
2440   MatSetOption(A,MAT_COLUMNS_SORTED);
2441 
2442   if (!rank) {
2443     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2444     /* read in my part of the matrix numerical values  */
2445     nz = procsnz[0];
2446     vals = buf;
2447     mycols = ibuf;
2448     if (size == 1)  nz -= extra_rows;
2449     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2450     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2451 
2452     /* insert into matrix */
2453     jj      = rstart*bs;
2454     for ( i=0; i<m; i++ ) {
2455       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2456       mycols += locrowlens[i];
2457       vals   += locrowlens[i];
2458       jj++;
2459     }
2460     /* read in other processors (except the last one) and ship out */
2461     for ( i=1; i<size-1; i++ ) {
2462       nz   = procsnz[i];
2463       vals = buf;
2464       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2465       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2466     }
2467     /* the last proc */
2468     if ( size != 1 ){
2469       nz   = procsnz[i] - extra_rows;
2470       vals = buf;
2471       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2472       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2473       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2474     }
2475     PetscFree(procsnz);
2476   } else {
2477     /* receive numeric values */
2478     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2479 
2480     /* receive message of values*/
2481     vals   = buf;
2482     mycols = ibuf;
2483     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2484     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2485     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2486 
2487     /* insert into matrix */
2488     jj      = rstart*bs;
2489     for ( i=0; i<m; i++ ) {
2490       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2491       mycols += locrowlens[i];
2492       vals   += locrowlens[i];
2493       jj++;
2494     }
2495   }
2496   PetscFree(locrowlens);
2497   PetscFree(buf);
2498   PetscFree(ibuf);
2499   PetscFree(rowners);
2500   PetscFree(dlens);
2501   PetscFree(mask);
2502   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2503   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2504   PetscFunctionReturn(0);
2505 }
2506 
2507 
2508 
2509 #undef __FUNC__
2510 #define __FUNC__ "MatMPIBAIJSetHashTableFactor"
2511 /*@
2512    MatMPIBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2513 
2514    Input Parameters:
2515 .  mat  - the matrix
2516 .  fact - factor
2517 
2518    Collective on Mat
2519 
2520   Notes:
2521    This can also be set by the command line option: -mat_use_hash_table fact
2522 
2523 .keywords: matrix, hashtable, factor, HT
2524 
2525 .seealso: MatSetOption()
2526 @*/
2527 int MatMPIBAIJSetHashTableFactor(Mat mat,double fact)
2528 {
2529   Mat_MPIBAIJ *baij;
2530 
2531   PetscFunctionBegin;
2532   PetscValidHeaderSpecific(mat,MAT_COOKIE);
2533   if (mat->type != MATMPIBAIJ) {
2534       SETERRQ(PETSC_ERR_ARG_WRONG,1,"Incorrect matrix type. Use MPIBAIJ only.");
2535   }
2536   baij = (Mat_MPIBAIJ*) mat->data;
2537   baij->ht_fact = fact;
2538   PetscFunctionReturn(0);
2539 }
2540