xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision d41123aaae7b2cb6f39fac17d2021b3d4fd395d0)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.152 1999/02/17 21:37:14 balay Exp bsmith $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"   /*I  "mat.h"  I*/
6 #include "src/vec/vecimpl.h"
7 
8 extern int MatSetUpMultiply_MPIBAIJ(Mat);
9 extern int DisAssemble_MPIBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
12 extern int MatGetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *);
13 
14 /*
15      Local utility routine that creates a mapping from the global column
16    number to the local number in the off-diagonal part of the local
17    storage of the matrix.  This is done in a non-scalable way since the
18    length of colmap equals the global matrix length.
19 */
20 #undef __FUNC__
21 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
22 static int CreateColmap_MPIBAIJ_Private(Mat mat)
23 {
24   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
25   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
26   int         nbs = B->nbs,i,bs=B->bs;
27 #if defined (USE_CTABLE)
28   int         ierr;
29 #endif
30 
31   PetscFunctionBegin;
32 #if defined (USE_CTABLE)
33   ierr = TableCreate(baij->nbs/5,&baij->colmap); CHKERRQ(ierr);
34   for ( i=0; i<nbs; i++ ){
35     ierr = TableAdd(baij->colmap,baij->garray[i]+1,i*bs+1);CHKERRQ(ierr);
36   }
37 #else
38   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
39   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
40   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
41   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
42 #endif
43   PetscFunctionReturn(0);
44 }
45 
/* number of extra block entries allocated each time a row runs out of room */
#define CHUNKSIZE  10

/*
   Inserts (or, for ADD_VALUES, adds) one scalar into the "A" (diagonal)
   SeqBAIJ part of the parallel matrix.  row/col are LOCAL point indices
   (not block indices); the macro locates the block row/column, binary-
   searches the row for the block column, and either updates the existing
   block in place or shifts later entries up to make room.  When the row
   is full, the entire matrix storage (i/j/a arrays) is enlarged by
   CHUNKSIZE block entries in a single malloc, unless a->nonew forbids
   new nonzeros (1 = silently drop, -1/-2 = error).

   Expects these locals in the caller: rp,ap,bap,aimax,ai,ailen,aj,aa,
   brow,bcol,ridx,cidx,low,high,t,_i,ii,N,nrow,rmax,bs,bs2, plus the
   Mat_SeqBAIJ pointer `a` and Mat `A` (for logging).

   NOTE(review): a->nz is incremented only on the reallocation path; a new
   block inserted into existing free space does not bump a->nz -- verify
   this is the intended bookkeeping for this revision.
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
122 
/*
   Off-diagonal counterpart of MatSetValues_SeqBAIJ_A_Private: inserts
   (or adds) one scalar into the "B" (off-diagonal) SeqBAIJ part.  The
   algorithm is identical -- binary search within the block row, in-place
   update or shift-and-insert, CHUNKSIZE enlargement of the whole storage
   when the row is full -- but operates on b/bi/bj/ba/bimax/bilen and
   jumps to b_noinsert.  Requires the same caller locals as the A macro,
   plus the Mat_SeqBAIJ pointer `b` and Mat `B`.

   NOTE(review): as in the A macro, b->nz is bumped only when storage is
   reallocated, not on every insertion into existing free space -- verify.
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
197 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   MatSetValues_MPIBAIJ - Inserts or adds scalar values into the parallel
   BAIJ matrix, one (row,col) entry at a time.  im/in are global POINT
   indices (not block indices); v holds m*n values, row-oriented or
   column-oriented according to baij->roworiented.

   Locally owned rows go straight into the "A" (diagonal) or "B"
   (off-diagonal) sequential parts via the insertion macros above;
   entries for rows owned by other processes are stashed for the
   communication phase of assembly (unless donotstash is set).
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
    if (im[i] < 0) continue;   /* negative row indices are silently ignored */
#if defined(USE_PETSC_BOPT_g)
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      /* row is owned by this process */
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column falls in the diagonal (A) part */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
        else if (in[j] < 0) continue;
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* off-diagonal (B) part: translate the global column to B's
             local numbering (colmap values are stored shifted by one) */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
#if defined (USE_CTABLE)
            ierr = TableFind(baij->colmap, in[j]/bs + 1,&col); CHKERRQ(ierr);
            col  = col - 1 + in[j]%bs;
#else
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
#endif
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              /* previously-unseen column: revert B to global numbering */
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* row owned by another process: stash for assembly-time exchange */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
          for ( j=0; j<n; j++ ) {
            ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
284 
285 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
286 #undef __FUNC__
287 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
288 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
289 {
290   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
291   Scalar      *value,*barray=baij->barray;
292   int         ierr,i,j,ii,jj,row,col,k,l;
293   int         roworiented = baij->roworiented,rstart=baij->rstart ;
294   int         rend=baij->rend,cstart=baij->cstart,stepval;
295   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
296 
297   if(!barray) {
298     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
299   }
300 
301   if (roworiented) {
302     stepval = (n-1)*bs;
303   } else {
304     stepval = (m-1)*bs;
305   }
306   for ( i=0; i<m; i++ ) {
307     if (im[i] < 0) continue;
308 #if defined(USE_PETSC_BOPT_g)
309     if (im[i] >= baij->Mbs) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large, row %d max %d",im[i],baij->Mbs);
310 #endif
311     if (im[i] >= rstart && im[i] < rend) {
312       row = im[i] - rstart;
313       for ( j=0; j<n; j++ ) {
314         /* If NumCol = 1 then a copy is not required */
315         if ((roworiented) && (n == 1)) {
316           barray = v + i*bs2;
317         } else if((!roworiented) && (m == 1)) {
318           barray = v + j*bs2;
319         } else { /* Here a copy is required */
320           if (roworiented) {
321             value = v + i*(stepval+bs)*bs + j*bs;
322           } else {
323             value = v + j*(stepval+bs)*bs + i*bs;
324           }
325           for ( ii=0; ii<bs; ii++,value+=stepval ) {
326             for (jj=0; jj<bs; jj++ ) {
327               *barray++  = *value++;
328             }
329           }
330           barray -=bs2;
331         }
332 
333         if (in[j] >= cstart && in[j] < cend){
334           col  = in[j] - cstart;
335           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
336         }
337         else if (in[j] < 0) continue;
338 #if defined(USE_PETSC_BOPT_g)
339         else if (in[j] >= baij->Nbs) {SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large, col %d max %d",in[j],baij->Nbs);}
340 #endif
341         else {
342           if (mat->was_assembled) {
343             if (!baij->colmap) {
344               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
345             }
346 
347 #if defined(USE_PETSC_BOPT_g)
348 #if defined (USE_CTABLE)
349             { int data;
350             ierr = TableFind(baij->colmap,in[j]+1,&data); CHKERRQ(ierr);
351             if((data - 1) % bs)
352 	      {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
353             }
354 #else
355             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
356 #endif
357 #endif
358 #if defined (USE_CTABLE)
359 	    ierr = TableFind(baij->colmap,in[j]+1,&col); CHKERRQ(ierr);
360             col  = (col - 1)/bs;
361 #else
362             col = (baij->colmap[in[j]] - 1)/bs;
363 #endif
364             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
365               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
366               col =  in[j];
367             }
368           }
369           else col = in[j];
370           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
371         }
372       }
373     } else {
374       if (!baij->donotstash) {
375         if (roworiented ) {
376           row   = im[i]*bs;
377           value = v + i*(stepval+bs)*bs;
378           for ( j=0; j<bs; j++,row++ ) {
379             for ( k=0; k<n; k++ ) {
380               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
381                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
382               }
383             }
384           }
385         } else {
386           for ( j=0; j<n; j++ ) {
387             value = v + j*(stepval+bs)*bs + i*bs;
388             col   = in[j]*bs;
389             for ( k=0; k<bs; k++,col++,value+=stepval) {
390               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
391                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
392               }
393             }
394           }
395         }
396       }
397     }
398   }
399   PetscFunctionReturn(0);
400 }
/* golden-ratio constant for multiplicative hashing */
#define HASH_KEY 0.6180339887
/* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
/* multiplicative hash of key into [0,size); tmp is a caller-supplied double */
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ_HT"
/*
   MatSetValues_MPIBAIJ_HT - Hash-table variant of MatSetValues for
   MPIBAIJ.  The nonzero structure is fixed: each existing block has an
   entry in baij->ht keyed by (block row)*Nbs + (block col) + 1, with the
   block's storage address in baij->hd.  Probing is linear with wraparound;
   an entry that is not found is an error (no new nonzeros here).
   Off-process rows are stashed as in MatSetValues_MPIBAIJ.  In debug
   builds, probe counters are accumulated to monitor table quality.
*/
int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
  int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
  double      tmp;
  Scalar      ** HD = baij->hd,value;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
#endif

  PetscFunctionBegin;

  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
      row = im[i];
    if (row >= rstart_orig && row < rend_orig) {
      for ( j=0; j<n; j++ ) {
        col = in[j];
        if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
        /* Look up into the Hash Table */
        key = (row/bs)*Nbs+(col/bs)+1;
        h1  = HASH(size,key,tmp);


        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        insert_ct++;
        total_ct++;
        if (HT[idx] != key) {
          /* linear probe to the end of the table, then wrap to the front */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          /* linear probe to the end of the table, then wrap to the front */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        /* A HASH table entry is found, so insert the values at the correct address */
        if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
        else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
      }
    } else {
      /* row owned by another process: stash for assembly-time exchange */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
          for ( j=0; j<n; j++ ) {
            ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
484 
#undef __FUNC__
#define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
/*
   MatSetValuesBlocked_MPIBAIJ_HT - Hash-table variant of
   MatSetValuesBlocked for MPIBAIJ.  im/in are global BLOCK indices; each
   bs x bs block of v is written directly into the block's storage found
   through the hash table (key = block_row*Nbs + block_col + 1), so no
   scratch copy is needed.  Column-major block storage is assumed: the
   inner loops write stride-bs for row-oriented input and contiguously
   for column-oriented input.  Off-process block rows are stashed
   scalar-by-scalar.
*/
int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,ii,jj,row,col,k,l;
  int         roworiented = baij->roworiented,rstart=baij->rstart ;
  int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
  int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
  double      tmp;
  Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
#if defined(USE_PETSC_BOPT_g)
  int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
#endif

  PetscFunctionBegin;

  /* distance (in scalars) between consecutive block rows of v */
  if (roworiented) {
    stepval = (n-1)*bs;
  } else {
    stepval = (m-1)*bs;
  }
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    row   = im[i];
    v_t   = v + i*bs2;
    if (row >= rstart && row < rend) {
      for ( j=0; j<n; j++ ) {
        col = in[j];

        /* Look up into the Hash Table */
        key = row*Nbs+col+1;
        h1  = HASH(size,key,tmp);

        idx = h1;
#if defined(USE_PETSC_BOPT_g)
        total_ct++;
        insert_ct++;
       if (HT[idx] != key) {
          /* linear probe to the end of the table, then wrap to the front */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#else
        if (HT[idx] != key) {
          /* linear probe to the end of the table, then wrap to the front */
          for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
          if (idx == size) {
            for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
            if (idx == h1) {
              SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
            }
          }
        }
#endif
        baij_a = HD[idx];
        if (roworiented) {
          /*value = v + i*(stepval+bs)*bs + j*bs;*/
          /* value = v + (i*(stepval+bs)+j)*bs; */
          value = v_t;
          v_t  += bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval) {
              for ( jj=ii; jj<bs2; jj+=bs ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        } else {
          value = v + j*(stepval+bs)*bs + i*bs;
          if (addv == ADD_VALUES) {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  += *value++;
              }
            }
          } else {
            for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
              for ( jj=0; jj<bs; jj++ ) {
                baij_a[jj]  = *value++;
              }
            }
          }
        }
      }
    } else {
      /* block row owned by another process: stash scalar-by-scalar */
      if (!baij->donotstash) {
        if (roworiented ) {
          row   = im[i]*bs;
          value = v + i*(stepval+bs)*bs;
          for ( j=0; j<bs; j++,row++ ) {
            for ( k=0; k<n; k++ ) {
              for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        } else {
          for ( j=0; j<n; j++ ) {
            value = v + j*(stepval+bs)*bs + i*bs;
            col   = in[j]*bs;
            for ( k=0; k<bs; k++,col++,value+=stepval) {
              for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        }
      }
    }
  }
#if defined(USE_PETSC_BOPT_g)
  baij->ht_total_ct = total_ct;
  baij->ht_insert_ct = insert_ct;
#endif
  PetscFunctionReturn(0);
}
614 
#undef __FUNC__
#define __FUNC__ "MatGetValues_MPIBAIJ"
/*
   MatGetValues_MPIBAIJ - Retrieves an m x n logically dense subblock of
   values (point indices) into v.  Only rows owned by this process are
   supported; requesting a remote row is an error.  Columns in the
   diagonal range are read from A; others are mapped through colmap and
   read from B, with entries not present in B returned as 0.0.
*/
int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
  int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col,data;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
    if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
    if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
      row = idxm[i] - bsrstart;
      for ( j=0; j<n; j++ ) {
        if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
        if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
        if (idxn[j] >= bscstart && idxn[j] < bscend){
          /* diagonal part */
          col = idxn[j] - bscstart;
          ierr = MatGetValues_SeqBAIJ(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
        } else {
          /* off-diagonal part: map the global column through colmap
             (values stored shifted by one; 0 means not present) */
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
          }
#if defined (USE_CTABLE)
          ierr = TableFind(baij->colmap,idxn[j]/bs+1,&data); CHKERRQ(ierr);
          data --;
#else
          data = baij->colmap[idxn[j]/bs]-1;
#endif
          /* column not present in B => the entry is structurally zero */
          if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
          else {
            col  = data + idxn[j]%bs;
            ierr = MatGetValues_SeqBAIJ(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
          }
        }
      }
    } else {
      SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
    }
  }
 PetscFunctionReturn(0);
}
658 
659 #undef __FUNC__
660 #define __FUNC__ "MatNorm_MPIBAIJ"
661 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
662 {
663   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
664   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
665   int        ierr, i,bs2=baij->bs2;
666   double     sum = 0.0;
667   Scalar     *v;
668 
669   PetscFunctionBegin;
670   if (baij->size == 1) {
671     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
672   } else {
673     if (type == NORM_FROBENIUS) {
674       v = amat->a;
675       for (i=0; i<amat->nz*bs2; i++ ) {
676 #if defined(USE_PETSC_COMPLEX)
677         sum += PetscReal(PetscConj(*v)*(*v)); v++;
678 #else
679         sum += (*v)*(*v); v++;
680 #endif
681       }
682       v = bmat->a;
683       for (i=0; i<bmat->nz*bs2; i++ ) {
684 #if defined(USE_PETSC_COMPLEX)
685         sum += PetscReal(PetscConj(*v)*(*v)); v++;
686 #else
687         sum += (*v)*(*v); v++;
688 #endif
689       }
690       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
691       *norm = sqrt(*norm);
692     } else {
693       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
694     }
695   }
696   PetscFunctionReturn(0);
697 }
698 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   MatAssemblyBegin_MPIBAIJ - Starts assembling the parallel matrix:
   exchanges the stashed off-process values.  Each stashed (row,col,value)
   triple is packed as three Scalars and sent (MPI_Isend) to the owning
   process; matching MPI_Irecv's are posted here and completed in
   MatAssemblyEnd.  The send/receive buffers and requests are stored in
   the Mat_MPIBAIJ structure for the end phase.
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  /* nothing was stashed; record empty communication state and return */
  if (baij->donotstash) {
    baij->svalues    = 0; baij->rvalues    = 0;
    baij->nsends     = 0; baij->nrecvs     = 0;
    baij->send_waits = 0; baij->recv_waits = 0;
    baij->rmax       = 0;
    PetscFunctionReturn(0);
  }

  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    /* owners[] holds block-row ownership ranges, hence the *bs scaling */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is, we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues    = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack each stashed entry as (row,col,value); starts[] advances as we go */
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* rebuild starts[] (it was consumed by the packing loop above) */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(baij->A,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* save communication state for MatAssemblyEnd */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
803 
804 /*
805   Creates the hash table, and sets the table
806   This table is created only once.
807   If new entried need to be added to the matrix
808   then the hash table has to be destroyed and
809   recreated.
810 */
811 #undef __FUNC__
812 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
813 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
814 {
815   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
816   Mat         A = baij->A, B=baij->B;
817   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
818   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
819   int         size,bs2=baij->bs2,rstart=baij->rstart;
820   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
821   int         *HT,key;
822   Scalar      **HD;
823   double      tmp;
824 #if defined(USE_PETSC_BOPT_g)
825   int         ct=0,max=0;
826 #endif
827 
828   PetscFunctionBegin;
829   baij->ht_size=(int)(factor*nz);
830   size = baij->ht_size;
831 
832   if (baij->ht) {
833     PetscFunctionReturn(0);
834   }
835 
836   /* Allocate Memory for Hash Table */
837   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
838   baij->ht = (int*)(baij->hd + size);
839   HD = baij->hd;
840   HT = baij->ht;
841 
842 
843   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
844 
845 
846   /* Loop Over A */
847   for ( i=0; i<a->mbs; i++ ) {
848     for ( j=ai[i]; j<ai[i+1]; j++ ) {
849       row = i+rstart;
850       col = aj[j]+cstart;
851 
852       key = row*Nbs + col + 1;
853       h1  = HASH(size,key,tmp);
854       for ( k=0; k<size; k++ ){
855         if (HT[(h1+k)%size] == 0.0) {
856           HT[(h1+k)%size] = key;
857           HD[(h1+k)%size] = a->a + j*bs2;
858           break;
859 #if defined(USE_PETSC_BOPT_g)
860         } else {
861           ct++;
862 #endif
863         }
864       }
865 #if defined(USE_PETSC_BOPT_g)
866       if (k> max) max = k;
867 #endif
868     }
869   }
870   /* Loop Over B */
871   for ( i=0; i<b->mbs; i++ ) {
872     for ( j=bi[i]; j<bi[i+1]; j++ ) {
873       row = i+rstart;
874       col = garray[bj[j]];
875       key = row*Nbs + col + 1;
876       h1  = HASH(size,key,tmp);
877       for ( k=0; k<size; k++ ){
878         if (HT[(h1+k)%size] == 0.0) {
879           HT[(h1+k)%size] = key;
880           HD[(h1+k)%size] = b->a + j*bs2;
881           break;
882 #if defined(USE_PETSC_BOPT_g)
883         } else {
884           ct++;
885 #endif
886         }
887       }
888 #if defined(USE_PETSC_BOPT_g)
889       if (k> max) max = k;
890 #endif
891     }
892   }
893 
894   /* Print Summary */
895 #if defined(USE_PETSC_BOPT_g)
896   for ( i=0,j=0; i<size; i++)
897     if (HT[i]) {j++;}
898   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
899            (j== 0)? 0.0:((double)(ct+j))/j,max);
900 #endif
901   PetscFunctionReturn(0);
902 }
903 
904 #undef __FUNC__
905 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
906 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
907 {
908   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
909   MPI_Status  *send_status,recv_status;
910   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
911   int         bs=baij->bs,row,col,other_disassembled;
912   Scalar      *values,val;
913   InsertMode  addv = mat->insertmode;
914 
915   PetscFunctionBegin;
916   /*  wait on receives */
917   while (count) {
918     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
919     /* unpack receives into our local space */
920     values = baij->rvalues + 3*imdex*baij->rmax;
921     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
922     n = n/3;
923     for ( i=0; i<n; i++ ) {
924       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
925       col = (int) PetscReal(values[3*i+1]);
926       val = values[3*i+2];
927       if (col >= baij->cstart*bs && col < baij->cend*bs) {
928         col -= baij->cstart*bs;
929         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
930       } else {
931         if (mat->was_assembled) {
932           if (!baij->colmap) {
933             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
934           }
935 #if defined (USE_CTABLE)
936 	  ierr = TableFind(baij->colmap,col/bs+1,&col); CHKERRQ(ierr);
937           col  = col - 1 + col%bs;
938 #else
939           col = (baij->colmap[col/bs]) - 1 + col%bs;
940 #endif
941           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
942             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
943             col = (int) PetscReal(values[3*i+1]);
944           }
945         }
946         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
947       }
948     }
949     count--;
950   }
951   if (baij->recv_waits) PetscFree(baij->recv_waits);
952   if (baij->rvalues)    PetscFree(baij->rvalues);
953 
954   /* wait on sends */
955   if (baij->nsends) {
956     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
957     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
958     PetscFree(send_status);
959   }
960   if (baij->send_waits) PetscFree(baij->send_waits);
961   if (baij->svalues)    PetscFree(baij->svalues);
962 
963   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
964   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
965 
966   /* determine if any processor has disassembled, if so we must
967      also disassemble ourselfs, in order that we may reassemble. */
968   /*
969      if nonzero structure of submatrix B cannot change then we know that
970      no processor disassembled thus we can skip this stuff
971   */
972   if (!((Mat_SeqBAIJ*) baij->B->data)->nonew)  {
973     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
974     if (mat->was_assembled && !other_disassembled) {
975       ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
976     }
977   }
978 
979   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
980     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
981   }
982   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
983   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
984 
985 #if defined(USE_PETSC_BOPT_g)
986   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
987     PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
988              ((double)baij->ht_total_ct)/baij->ht_insert_ct);
989     baij->ht_total_ct  = 0;
990     baij->ht_insert_ct = 0;
991   }
992 #endif
993   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
994     ierr = MatCreateHashTable_MPIBAIJ_Private(mat,baij->ht_fact); CHKERRQ(ierr);
995     mat->ops->setvalues        = MatSetValues_MPIBAIJ_HT;
996     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
997   }
998 
999   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 #undef __FUNC__
1004 #define __FUNC__ "MatView_MPIBAIJ_Binary"
1005 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
1006 {
1007   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1008   int          ierr;
1009 
1010   PetscFunctionBegin;
1011   if (baij->size == 1) {
1012     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1013   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNC__
1018 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworSocket"
1019 static int MatView_MPIBAIJ_ASCIIorDraworSocket(Mat mat,Viewer viewer)
1020 {
1021   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1022   int          ierr, format,bs = baij->bs, size = baij->size, rank = baij->rank;
1023   FILE         *fd;
1024   ViewerType   vtype;
1025 
1026   PetscFunctionBegin;
1027   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1028   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
1029     ierr = ViewerGetFormat(viewer,&format); CHKERRQ(ierr);
1030     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
1031       MatInfo info;
1032       MPI_Comm_rank(mat->comm,&rank);
1033       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
1034       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
1035       PetscSequentialPhaseBegin(mat->comm,1);
1036       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
1037               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
1038               baij->bs,(int)info.memory);
1039       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
1040       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1041       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
1042       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1043       fflush(fd);
1044       PetscSequentialPhaseEnd(mat->comm,1);
1045       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
1046       PetscFunctionReturn(0);
1047     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
1048       PetscPrintf(mat->comm,"  block size is %d\n",bs);
1049       PetscFunctionReturn(0);
1050     }
1051   }
1052 
1053   if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
1054     Draw       draw;
1055     PetscTruth isnull;
1056     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
1057     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
1058   }
1059 
1060   if (size == 1) {
1061     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1062   } else {
1063     /* assemble the entire matrix onto first processor. */
1064     Mat         A;
1065     Mat_SeqBAIJ *Aloc;
1066     int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
1067     int         mbs = baij->mbs;
1068     Scalar      *a;
1069 
1070     if (!rank) {
1071       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1072     } else {
1073       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1074     }
1075     PLogObjectParent(mat,A);
1076 
1077     /* copy over the A part */
1078     Aloc = (Mat_SeqBAIJ*) baij->A->data;
1079     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1080     rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1081 
1082     for ( i=0; i<mbs; i++ ) {
1083       rvals[0] = bs*(baij->rstart + i);
1084       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1085       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1086         col = (baij->cstart+aj[j])*bs;
1087         for (k=0; k<bs; k++ ) {
1088           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1089           col++; a += bs;
1090         }
1091       }
1092     }
1093     /* copy over the B part */
1094     Aloc = (Mat_SeqBAIJ*) baij->B->data;
1095     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1096     for ( i=0; i<mbs; i++ ) {
1097       rvals[0] = bs*(baij->rstart + i);
1098       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1099       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1100         col = baij->garray[aj[j]]*bs;
1101         for (k=0; k<bs; k++ ) {
1102           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1103           col++; a += bs;
1104         }
1105       }
1106     }
1107     PetscFree(rvals);
1108     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1109     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1110     /*
1111        Everyone has to call to draw the matrix since the graphics waits are
1112        synchronized across all processors that share the Draw object
1113     */
1114     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
1115       ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1116     }
1117     ierr = MatDestroy(A); CHKERRQ(ierr);
1118   }
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 
1123 
1124 #undef __FUNC__
1125 #define __FUNC__ "MatView_MPIBAIJ"
1126 int MatView_MPIBAIJ(Mat mat,Viewer viewer)
1127 {
1128   int         ierr;
1129   ViewerType  vtype;
1130 
1131   PetscFunctionBegin;
1132   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1133   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
1134       PetscTypeCompare(vtype,SOCKET_VIEWER)) {
1135     ierr = MatView_MPIBAIJ_ASCIIorDraworSocket(mat,viewer); CHKERRQ(ierr);
1136   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
1137     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1138   } else {
1139     SETERRQ(1,1,"Viewer type not supported by PETSc object");
1140   }
1141   PetscFunctionReturn(0);
1142 }
1143 
#undef __FUNC__
#define __FUNC__ "MatDestroy_MPIBAIJ"
/*
   MatDestroy_MPIBAIJ - Releases all storage owned by the parallel matrix:
   mappings, maps, the stash, the two sequential blocks, the column map,
   scatter context, and the various work arrays.  A no-op (reference-count
   decrement only) while other references remain.
*/
int MatDestroy_MPIBAIJ(Mat mat)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  PetscFunctionBegin;
  /* only the last reference actually destroys the object */
  if (--mat->refct > 0) PetscFunctionReturn(0);

  if (mat->mapping) {
    ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
  }
  if (mat->bmapping) {
    ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
  }
  if (mat->rmap) {
    ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
  }
  if (mat->cmap) {
    ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
  }
#if defined(USE_PETSC_LOG)
  PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
  PetscFree(baij->rowners);
  /* destroy the diagonal and off-diagonal sequential blocks */
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
#if defined (USE_CTABLE)
  if (baij->colmap) TableDelete(baij->colmap);
#else
  if (baij->colmap) PetscFree(baij->colmap);
#endif
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  if (baij->barray) PetscFree(baij->barray);
  /* hd and ht share one allocation (see MatCreateHashTable_MPIBAIJ_Private),
     so freeing hd releases both */
  if (baij->hd) PetscFree(baij->hd);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}
1190 
1191 #undef __FUNC__
1192 #define __FUNC__ "MatMult_MPIBAIJ"
1193 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1194 {
1195   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1196   int         ierr, nt;
1197 
1198   PetscFunctionBegin;
1199   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1200   if (nt != a->n) {
1201     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1202   }
1203   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
1204   if (nt != a->m) {
1205     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1206   }
1207   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1208   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
1209   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1210   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1211   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 #undef __FUNC__
1216 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1217 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1218 {
1219   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1220   int        ierr;
1221 
1222   PetscFunctionBegin;
1223   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1224   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1225   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1226   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1227   PetscFunctionReturn(0);
1228 }
1229 
1230 #undef __FUNC__
1231 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1232 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1233 {
1234   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1235   int         ierr;
1236 
1237   PetscFunctionBegin;
1238   /* do nondiagonal part */
1239   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1240   /* send it on its way */
1241   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1242   /* do local part */
1243   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1244   /* receive remote parts: note this assumes the values are not actually */
1245   /* inserted in yy until the next line, which is true for my implementation*/
1246   /* but is not perhaps always true. */
1247   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNC__
1252 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1253 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1254 {
1255   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1256   int         ierr;
1257 
1258   PetscFunctionBegin;
1259   /* do nondiagonal part */
1260   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1261   /* send it on its way */
1262   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1263   /* do local part */
1264   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1265   /* receive remote parts: note this assumes the values are not actually */
1266   /* inserted in yy until the next line, which is true for my implementation*/
1267   /* but is not perhaps always true. */
1268   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1269   PetscFunctionReturn(0);
1270 }
1271 
1272 /*
1273   This only works correctly for square matrices where the subblock A->A is the
1274    diagonal block
1275 */
1276 #undef __FUNC__
1277 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1278 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1279 {
1280   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1281   int         ierr;
1282 
1283   PetscFunctionBegin;
1284   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1285   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 #undef __FUNC__
1290 #define __FUNC__ "MatScale_MPIBAIJ"
1291 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1292 {
1293   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1294   int         ierr;
1295 
1296   PetscFunctionBegin;
1297   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1298   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1299   PetscFunctionReturn(0);
1300 }
1301 
1302 #undef __FUNC__
1303 #define __FUNC__ "MatGetSize_MPIBAIJ"
1304 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1305 {
1306   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1307 
1308   PetscFunctionBegin;
1309   if (m) *m = mat->M;
1310   if (n) *n = mat->N;
1311   PetscFunctionReturn(0);
1312 }
1313 
1314 #undef __FUNC__
1315 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1316 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1317 {
1318   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1319 
1320   PetscFunctionBegin;
1321   *m = mat->m; *n = mat->n;
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 #undef __FUNC__
1326 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1327 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1328 {
1329   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1330 
1331   PetscFunctionBegin;
1332   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1337 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1338 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   MatGetRow_MPIBAIJ - Returns one locally-owned row (global point
   numbering) by fetching it from the A and B blocks and merging the two
   pieces so the returned column indices are globally numbered and
   increasing.  Must be paired with MatRestoreRow_MPIBAIJ.

   Only rows in [rstart*bs, rend*bs) may be requested.  The idx/v output
   arrays point into per-matrix work space (rowvalues/rowindices) that is
   reused by the next call.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be "gotten" at a time */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds both the values and, after them, the indices */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* request columns and/or values from A and B only if the caller wants them */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      /* imark marks where B's columns switch from left of the diagonal
         block to right of it; A's columns are spliced in between */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          for ( i=0; i<imark; i++ ) {
            /* translate B's block column through garray, keep point offset */
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          /* imark not yet computed (v was null): find it now */
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1420 
1421 #undef __FUNC__
1422 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1423 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1424 {
1425   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1426 
1427   PetscFunctionBegin;
1428   if (baij->getrowactive == PETSC_FALSE) {
1429     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1430   }
1431   baij->getrowactive = PETSC_FALSE;
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 #undef __FUNC__
1436 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1437 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1438 {
1439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1440 
1441   PetscFunctionBegin;
1442   *bs = baij->bs;
1443   PetscFunctionReturn(0);
1444 }
1445 
1446 #undef __FUNC__
1447 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1448 int MatZeroEntries_MPIBAIJ(Mat A)
1449 {
1450   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1451   int         ierr;
1452 
1453   PetscFunctionBegin;
1454   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1455   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1456   PetscFunctionReturn(0);
1457 }
1458 
#undef __FUNC__
#define __FUNC__ "MatGetInfo_MPIBAIJ"
/*
   MatGetInfo_MPIBAIJ - Gathers matrix statistics: sums the local info of
   the A and B blocks, then (for the global flags) reduces the five
   accumulated quantities across the communicator with MAX or SUM.
*/
int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
  Mat         A = a->A, B = a->B;
  int         ierr;
  double      isend[5], irecv[5];   /* nz_used, nz_allocated, nz_unneeded, memory, mallocs */

  PetscFunctionBegin;
  info->block_size     = (double)a->bs;
  /* accumulate the local statistics of both sequential blocks */
  ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
  isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
  isend[3] = info->memory;  isend[4] = info->mallocs;
  ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
  isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
  isend[3] += info->memory;  isend[4] += info->mallocs;
  if (flag == MAT_LOCAL) {
    info->nz_used      = isend[0];
    info->nz_allocated = isend[1];
    info->nz_unneeded  = isend[2];
    info->memory       = isend[3];
    info->mallocs      = isend[4];
  } else if (flag == MAT_GLOBAL_MAX) {
    ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else if (flag == MAT_GLOBAL_SUM) {
    ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else {
    SETERRQ1(1,1,"Unknown MatInfoType argument %d",flag);
  }
  info->rows_global       = (double)a->M;
  info->columns_global    = (double)a->N;
  info->rows_local        = (double)a->m;
  /* NOTE(review): uses global N where rows_local uses local m -- looks like
     it should be a->n; confirm before changing */
  info->columns_local     = (double)a->N;
  info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
  info->fill_ratio_needed = 0;
  info->factor_mallocs    = 0;
  PetscFunctionReturn(0);
}
1508 
1509 #undef __FUNC__
1510 #define __FUNC__ "MatSetOption_MPIBAIJ"
1511 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1512 {
1513   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1514 
1515   PetscFunctionBegin;
1516   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1517       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1518       op == MAT_COLUMNS_UNSORTED ||
1519       op == MAT_COLUMNS_SORTED ||
1520       op == MAT_NEW_NONZERO_ALLOCATION_ERR ||
1521       op == MAT_NEW_NONZERO_LOCATION_ERR) {
1522         MatSetOption(a->A,op);
1523         MatSetOption(a->B,op);
1524   } else if (op == MAT_ROW_ORIENTED) {
1525         a->roworiented = 1;
1526         MatSetOption(a->A,op);
1527         MatSetOption(a->B,op);
1528   } else if (op == MAT_ROWS_SORTED ||
1529              op == MAT_ROWS_UNSORTED ||
1530              op == MAT_SYMMETRIC ||
1531              op == MAT_STRUCTURALLY_SYMMETRIC ||
1532              op == MAT_YES_NEW_DIAGONALS ||
1533              op == MAT_USE_HASH_TABLE)
1534     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1535   else if (op == MAT_COLUMN_ORIENTED) {
1536     a->roworiented = 0;
1537     MatSetOption(a->A,op);
1538     MatSetOption(a->B,op);
1539   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1540     a->donotstash = 1;
1541   } else if (op == MAT_NO_NEW_DIAGONALS) {
1542     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1543   } else if (op == MAT_USE_HASH_TABLE) {
1544     a->ht_flag = 1;
1545   } else {
1546     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1547   }
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 #undef __FUNC__
1552 #define __FUNC__ "MatTranspose_MPIBAIJ("
1553 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1554 {
1555   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1556   Mat_SeqBAIJ *Aloc;
1557   Mat        B;
1558   int        ierr,M=baij->M,N=baij->N,*ai,*aj,i,*rvals,j,k,col;
1559   int        bs=baij->bs,mbs=baij->mbs;
1560   Scalar     *a;
1561 
1562   PetscFunctionBegin;
1563   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1564   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1565   CHKERRQ(ierr);
1566 
1567   /* copy over the A part */
1568   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1569   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1570   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1571 
1572   for ( i=0; i<mbs; i++ ) {
1573     rvals[0] = bs*(baij->rstart + i);
1574     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1575     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1576       col = (baij->cstart+aj[j])*bs;
1577       for (k=0; k<bs; k++ ) {
1578         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1579         col++; a += bs;
1580       }
1581     }
1582   }
1583   /* copy over the B part */
1584   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1585   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1586   for ( i=0; i<mbs; i++ ) {
1587     rvals[0] = bs*(baij->rstart + i);
1588     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1589     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1590       col = baij->garray[aj[j]]*bs;
1591       for (k=0; k<bs; k++ ) {
1592         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1593         col++; a += bs;
1594       }
1595     }
1596   }
1597   PetscFree(rvals);
1598   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1599   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1600 
1601   if (matout != PETSC_NULL) {
1602     *matout = B;
1603   } else {
1604     PetscOps *Abops;
1605     MatOps   Aops;
1606 
1607     /* This isn't really an in-place transpose .... but free data structures from baij */
1608     PetscFree(baij->rowners);
1609     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1610     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1611 #if defined (USE_CTABLE)
1612     if (baij->colmap) TableDelete(baij->colmap);
1613 #else
1614     if (baij->colmap) PetscFree(baij->colmap);
1615 #endif
1616     if (baij->garray) PetscFree(baij->garray);
1617     if (baij->lvec) VecDestroy(baij->lvec);
1618     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1619     PetscFree(baij);
1620 
1621     /*
1622        This is horrible, horrible code. We need to keep the
1623       A pointers for the bops and ops but copy everything
1624       else from C.
1625     */
1626     Abops = A->bops;
1627     Aops  = A->ops;
1628     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1629     A->bops = Abops;
1630     A->ops  = Aops;
1631 
1632     PetscHeaderDestroy(B);
1633   }
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNC__
1638 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1639 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1640 {
1641   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1642   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1643   int ierr,s1,s2,s3;
1644 
1645   PetscFunctionBegin;
1646   if (ll)  {
1647     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1648     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1649     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1650     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1651     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1652   }
1653   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1654   PetscFunctionReturn(0);
1655 }
1656 
#undef __FUNC__
#define __FUNC__ "MatZeroRows_MPIBAIJ"
/*
   Zeros the global rows of A listed in is; if diag is non-null the diagonal
   entry of each zeroed row is set to *diag.  Any process may list any global
   row, so the row indices are first routed (with MPI point-to-point messages)
   to the processes that own them, and each process then zeros its own rows.
*/
int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  /* nprocs[j] = number of my rows owned by process j; procs[j] = 1 if any;
     owner[i] = owning process of the ith listed row */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* owners[] holds block-row ownership ranges, hence the scaling by bs */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  /* column sums of the procs matrix give each process its receive count;
     column max of nprocs gives a safe upper bound on any message length */
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank];
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* bucket the row indices by destination process */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* starts[] was advanced by the bucketing loop above; recompute it */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  /* first (point-wise) row owned by this process; used to localize indices */
  base = owners[rank]*bs;

  /*  wait on receives */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  /* gather all received global row indices, converted to local numbering */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);

  /*
        Zero the required rows. If the "diagonal block" of the matrix
     is square and the user wishes to set the diagonal we use separate
     code so that MatSetValues() is not called for each diagonal allocating
     new memory, thus calling lots of mallocs and slowing things down.

       Contributed by: Mathew Knepley
  */
  /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  if (diag && (l->A->M == l->A->N)) {
    ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
  } else if (diag) {
    /* rectangular diagonal block: zero first, then insert diagonal entries
       one at a time (requires that new nonzero locations are allowed) */
    ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
    if (((Mat_SeqBAIJ*)l->A->data)->nonew) {
      SETERRQ(PETSC_ERR_SUP,0,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
    }
    for ( i = 0; i < slen; i++ ) {
      row  = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
  }

  ierr = ISDestroy(istmp); CHKERRQ(ierr);
  PetscFree(lrows);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}
1805 
1806 extern int MatPrintHelp_SeqBAIJ(Mat);
1807 #undef __FUNC__
1808 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1809 int MatPrintHelp_MPIBAIJ(Mat A)
1810 {
1811   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1812   MPI_Comm    comm = A->comm;
1813   static int  called = 0;
1814   int         ierr;
1815 
1816   PetscFunctionBegin;
1817   if (!a->rank) {
1818     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1819   }
1820   if (called) {PetscFunctionReturn(0);} else called = 1;
1821   (*PetscHelpPrintf)(comm," Options for MATMPIBAIJ matrix format (the defaults):\n");
1822   (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");
1823   PetscFunctionReturn(0);
1824 }
1825 
1826 #undef __FUNC__
1827 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1828 int MatSetUnfactored_MPIBAIJ(Mat A)
1829 {
1830   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1831   int         ierr;
1832 
1833   PetscFunctionBegin;
1834   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1835   PetscFunctionReturn(0);
1836 }
1837 
static int MatDuplicate_MPIBAIJ(Mat,MatDuplicateOption,Mat *);

/* -------------------------------------------------------------------*/
/*
   Virtual function table for the MATMPIBAIJ matrix type.  The entries
   correspond, in order, to the operation slots of struct _MatOps; a 0
   entry marks an operation that is not implemented for this format.
   NOTE(review): the slot ordering is fixed by the _MatOps declaration
   elsewhere — keep this list in sync with it when adding operations.
*/
static struct _MatOps MatOps_Values = {
  MatSetValues_MPIBAIJ,
  MatGetRow_MPIBAIJ,
  MatRestoreRow_MPIBAIJ,
  MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,
  MatMultTrans_MPIBAIJ,
  MatMultTransAdd_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  MatTranspose_MPIBAIJ,
  MatGetInfo_MPIBAIJ,
  0,
  MatGetDiagonal_MPIBAIJ,
  MatDiagonalScale_MPIBAIJ,
  MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,
  MatAssemblyEnd_MPIBAIJ,
  0,
  MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,
  MatZeroRows_MPIBAIJ,
  0,
  0,
  0,
  0,
  MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,
  MatGetOwnershipRange_MPIBAIJ,
  0,
  0,
  0,
  0,
  MatDuplicate_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,
  MatGetValues_MPIBAIJ,
  0,
  MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,
  0,
  0,
  0,
  MatGetBlockSize_MPIBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  MatSetUnfactored_MPIBAIJ,
  0,
  MatSetValuesBlocked_MPIBAIJ,
  0,
  0,
  0,
  MatGetMaps_Petsc};
1908 
1909 
1910 EXTERN_C_BEGIN
1911 #undef __FUNC__
1912 #define __FUNC__ "MatGetDiagonalBlock_MPIBAIJ"
1913 int MatGetDiagonalBlock_MPIBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1914 {
1915   PetscFunctionBegin;
1916   *a      = ((Mat_MPIBAIJ *)A->data)->A;
1917   *iscopy = PETSC_FALSE;
1918   PetscFunctionReturn(0);
1919 }
1920 EXTERN_C_END
1921 
#undef __FUNC__
#define __FUNC__ "MatCreateMPIBAIJ"
/*@C
   MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
   (block compressed row).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameters
   d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
   performance can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  bs   - size of block
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
           This value should be the same as the local size used in creating the
           x vector for the matrix-vector product y = Ax.
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz  - number of block nonzeros per block row in diagonal portion of local
           submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local (possibly different for each block
           row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
.  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
           submatrix (same for all local rows).
-  o_nnz - array containing the number of nonzeros in the various block rows of the
           off-diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.

   Output Parameter:
.  A - the matrix

   Options Database Keys:
.   -mat_no_unroll - uses code that does not unroll the loops in the
                     block calculations (much slower)
.   -mat_block_size - size of the blocks to use
.   -mat_mpi - use the parallel matrix data structures even on one processor
               (defaults to using SeqBAIJ format on one processor)

   Notes:
   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   If PETSC_DECIDE or  PETSC_DETERMINE is used for a particular argument on one processor
   than it must be used on all processors that share the object for that argument.

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both).  Set
   d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  Likewise, specify preallocated storage for the
   off-diagonal part of the local submatrix with o_nz or o_nnz (not both).

   Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
   the figure below we depict these three local rows and all columns (0-11).

.vb
           0 1 2 3 4 5 6 7 8 9 10 11
          -------------------
   row 3  |  o o o d d d o o o o o o
   row 4  |  o o o d d d o o o o o o
   row 5  |  o o o d d d o o o o o o
          -------------------
.ve

   Thus, any entries in the d locations are stored in the d (diagonal)
   submatrix, and any entries in the o locations are stored in the
   o (off-diagonal) submatrix.  Note that the d and the o submatrices are
   stored simply in the MATSEQBAIJ format for compressed row storage.

   Now d_nz should indicate the number of block nonzeros per row in the d matrix,
   and o_nz should indicate the number of block nonzeros per row in the o matrix.
   In general, for PDE problems in which most nonzeros are near the diagonal,
   one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
   or you will get TERRIBLE performance; see the users' manual chapter on
   matrices.

   Level: intermediate

.keywords: matrix, block, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatCreateMPIAIJ()
@*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size,flg;
  int          flag1 = 0,flag2 = 0;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  /* on a single process use the sequential format unless overridden by
     -mat_mpibaij or -mat_mpi */
  MPI_Comm_size(comm,&size);
  ierr = OptionsHasName(PETSC_NULL,"-mat_mpibaij",&flag1); CHKERRQ(ierr);
  ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
  if (!flag1 && !flag2 && size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));

  B->ops->destroy    = MatDestroy_MPIBAIJ;
  B->ops->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* validate the combinations of local/global sizes supplied */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* when both local and global sizes are given, the local sizes win and the
     global sizes are recomputed below */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* global sizes from local sizes: sum the local sizes over the communicator */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* local sizes from global sizes: distribute the block rows/cols as evenly
     as possible, giving the first (Mbs % size) processes one extra block */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* the information in the maps duplicates the information computed below, eventually
     we should remove the duplicate information that is not contained in the maps */
  ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
  ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);

  /* build local table of row and column ownerships */
  /* rowners/cowners hold the prefix sums of the per-process block counts,
     i.e. process p owns block rows rowners[p]..rowners[p+1]-1 */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  b->rstart_bs = b->rstart * bs;
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* A holds the m x n diagonal portion, B the m x N off-diagonal portion
     (B is compacted to the off-process columns during assembly) */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_flag      = 0;
  b->ht_fact      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  *A = B;
  ierr = OptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg); CHKERRQ(ierr);
  if (flg) {
    double fact = 1.39;
    ierr = MatSetOption(B,MAT_USE_HASH_TABLE); CHKERRQ(ierr);
    ierr = OptionsGetDouble(PETSC_NULL,"-mat_use_hash_table",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    ierr = MatMPIBAIJSetHashTableFactor(B,fact); CHKERRQ(ierr);
    PLogInfo(0,"MatCreateMPIBAIJ:Hash table Factor used %5.2f\n",fact);
  }
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetDiagonalBlock_C",
                                     "MatGetDiagonalBlock_MPIBAIJ",
                                     (void*)MatGetDiagonalBlock_MPIBAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2178 
2179 #undef __FUNC__
2180 #define __FUNC__ "MatDuplicate_MPIBAIJ"
2181 static int MatDuplicate_MPIBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
2182 {
2183   Mat         mat;
2184   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
2185   int         ierr, len=0, flg;
2186 
2187   PetscFunctionBegin;
2188   *newmat       = 0;
2189   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",matin->comm,MatDestroy,MatView);
2190   PLogObjectCreate(mat);
2191   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
2192   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
2193   mat->ops->destroy    = MatDestroy_MPIBAIJ;
2194   mat->ops->view       = MatView_MPIBAIJ;
2195   mat->factor          = matin->factor;
2196   mat->assembled       = PETSC_TRUE;
2197   mat->insertmode      = NOT_SET_VALUES;
2198 
2199   a->m = mat->m   = oldmat->m;
2200   a->n = mat->n   = oldmat->n;
2201   a->M = mat->M   = oldmat->M;
2202   a->N = mat->N   = oldmat->N;
2203 
2204   a->bs  = oldmat->bs;
2205   a->bs2 = oldmat->bs2;
2206   a->mbs = oldmat->mbs;
2207   a->nbs = oldmat->nbs;
2208   a->Mbs = oldmat->Mbs;
2209   a->Nbs = oldmat->Nbs;
2210 
2211   a->rstart       = oldmat->rstart;
2212   a->rend         = oldmat->rend;
2213   a->cstart       = oldmat->cstart;
2214   a->cend         = oldmat->cend;
2215   a->size         = oldmat->size;
2216   a->rank         = oldmat->rank;
2217   a->donotstash   = oldmat->donotstash;
2218   a->roworiented  = oldmat->roworiented;
2219   a->rowindices   = 0;
2220   a->rowvalues    = 0;
2221   a->getrowactive = PETSC_FALSE;
2222   a->barray       = 0;
2223   a->rstart_bs    = oldmat->rstart_bs;
2224   a->rend_bs      = oldmat->rend_bs;
2225   a->cstart_bs    = oldmat->cstart_bs;
2226   a->cend_bs      = oldmat->cend_bs;
2227 
2228   /* hash table stuff */
2229   a->ht           = 0;
2230   a->hd           = 0;
2231   a->ht_size      = 0;
2232   a->ht_flag      = oldmat->ht_flag;
2233   a->ht_fact      = oldmat->ht_fact;
2234   a->ht_total_ct  = 0;
2235   a->ht_insert_ct = 0;
2236 
2237 
2238   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2239   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2240   a->cowners = a->rowners + a->size + 2;
2241   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2242   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2243   if (oldmat->colmap) {
2244 #if defined (USE_CTABLE)
2245   ierr = TableCreateCopy(oldmat->colmap,&a->colmap); CHKERRQ(ierr);
2246 #else
2247     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2248     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2249     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2250 #endif
2251   } else a->colmap = 0;
2252   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2253     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2254     PLogObjectMemory(mat,len*sizeof(int));
2255     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2256   } else a->garray = 0;
2257 
2258   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2259   PLogObjectParent(mat,a->lvec);
2260   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2261 
2262   PLogObjectParent(mat,a->Mvctx);
2263   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
2264   PLogObjectParent(mat,a->A);
2265   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
2266   PLogObjectParent(mat,a->B);
2267   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2268   ierr = FListDuplicate(mat->qlist,&matin->qlist);CHKERRQ(ierr);
2269   if (flg) {
2270     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2271   }
2272   *newmat = mat;
2273   PetscFunctionReturn(0);
2274 }
2275 
2276 #include "sys.h"
2277 
2278 #undef __FUNC__
2279 #define __FUNC__ "MatLoad_MPIBAIJ"
2280 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
2281 {
2282   Mat          A;
2283   int          i, nz, ierr, j,rstart, rend, fd;
2284   Scalar       *vals,*buf;
2285   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2286   MPI_Status   status;
2287   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2288   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
2289   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2290   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2291   int          dcount,kmax,k,nzcount,tmp;
2292 
2293   PetscFunctionBegin;
2294   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2295 
2296   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2297   if (!rank) {
2298     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2299     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2300     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2301     if (header[3] < 0) {
2302       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2303     }
2304   }
2305 
2306   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2307   M = header[1]; N = header[2];
2308 
2309   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2310 
2311   /*
2312      This code adds extra rows to make sure the number of rows is
2313      divisible by the blocksize
2314   */
2315   Mbs        = M/bs;
2316   extra_rows = bs - M + bs*(Mbs);
2317   if (extra_rows == bs) extra_rows = 0;
2318   else                  Mbs++;
2319   if (extra_rows &&!rank) {
2320     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2321   }
2322 
2323   /* determine ownership of all rows */
2324   mbs = Mbs/size + ((Mbs % size) > rank);
2325   m   = mbs * bs;
2326   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2327   browners = rowners + size + 1;
2328   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2329   rowners[0] = 0;
2330   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2331   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2332   rstart = rowners[rank];
2333   rend   = rowners[rank+1];
2334 
2335   /* distribute row lengths to all processors */
2336   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2337   if (!rank) {
2338     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2339     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2340     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2341     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2342     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2343     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2344     PetscFree(sndcounts);
2345   } else {
2346     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2347   }
2348 
2349   if (!rank) {
2350     /* calculate the number of nonzeros on each processor */
2351     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2352     PetscMemzero(procsnz,size*sizeof(int));
2353     for ( i=0; i<size; i++ ) {
2354       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2355         procsnz[i] += rowlengths[j];
2356       }
2357     }
2358     PetscFree(rowlengths);
2359 
2360     /* determine max buffer needed and allocate it */
2361     maxnz = 0;
2362     for ( i=0; i<size; i++ ) {
2363       maxnz = PetscMax(maxnz,procsnz[i]);
2364     }
2365     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2366 
2367     /* read in my part of the matrix column indices  */
2368     nz = procsnz[0];
2369     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2370     mycols = ibuf;
2371     if (size == 1)  nz -= extra_rows;
2372     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2373     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2374 
2375     /* read in every ones (except the last) and ship off */
2376     for ( i=1; i<size-1; i++ ) {
2377       nz   = procsnz[i];
2378       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2379       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2380     }
2381     /* read in the stuff for the last proc */
2382     if ( size != 1 ) {
2383       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2384       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2385       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2386       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2387     }
2388     PetscFree(cols);
2389   } else {
2390     /* determine buffer space needed for message */
2391     nz = 0;
2392     for ( i=0; i<m; i++ ) {
2393       nz += locrowlens[i];
2394     }
2395     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2396     mycols = ibuf;
2397     /* receive message of column indices*/
2398     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2399     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2400     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2401   }
2402 
2403   /* loop over local rows, determining number of off diagonal entries */
2404   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2405   odlens = dlens + (rend-rstart);
2406   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2407   PetscMemzero(mask,3*Mbs*sizeof(int));
2408   masked1 = mask    + Mbs;
2409   masked2 = masked1 + Mbs;
2410   rowcount = 0; nzcount = 0;
2411   for ( i=0; i<mbs; i++ ) {
2412     dcount  = 0;
2413     odcount = 0;
2414     for ( j=0; j<bs; j++ ) {
2415       kmax = locrowlens[rowcount];
2416       for ( k=0; k<kmax; k++ ) {
2417         tmp = mycols[nzcount++]/bs;
2418         if (!mask[tmp]) {
2419           mask[tmp] = 1;
2420           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2421           else masked1[dcount++] = tmp;
2422         }
2423       }
2424       rowcount++;
2425     }
2426 
2427     dlens[i]  = dcount;
2428     odlens[i] = odcount;
2429 
2430     /* zero out the mask elements we set */
2431     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2432     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2433   }
2434 
2435   /* create our matrix */
2436   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2437          CHKERRQ(ierr);
2438   A = *newmat;
2439   MatSetOption(A,MAT_COLUMNS_SORTED);
2440 
2441   if (!rank) {
2442     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2443     /* read in my part of the matrix numerical values  */
2444     nz = procsnz[0];
2445     vals = buf;
2446     mycols = ibuf;
2447     if (size == 1)  nz -= extra_rows;
2448     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2449     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2450 
2451     /* insert into matrix */
2452     jj      = rstart*bs;
2453     for ( i=0; i<m; i++ ) {
2454       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2455       mycols += locrowlens[i];
2456       vals   += locrowlens[i];
2457       jj++;
2458     }
2459     /* read in other processors (except the last one) and ship out */
2460     for ( i=1; i<size-1; i++ ) {
2461       nz   = procsnz[i];
2462       vals = buf;
2463       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2464       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2465     }
2466     /* the last proc */
2467     if ( size != 1 ){
2468       nz   = procsnz[i] - extra_rows;
2469       vals = buf;
2470       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2471       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2472       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2473     }
2474     PetscFree(procsnz);
2475   } else {
2476     /* receive numeric values */
2477     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2478 
2479     /* receive message of values*/
2480     vals   = buf;
2481     mycols = ibuf;
2482     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2483     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2484     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2485 
2486     /* insert into matrix */
2487     jj      = rstart*bs;
2488     for ( i=0; i<m; i++ ) {
2489       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2490       mycols += locrowlens[i];
2491       vals   += locrowlens[i];
2492       jj++;
2493     }
2494   }
2495   PetscFree(locrowlens);
2496   PetscFree(buf);
2497   PetscFree(ibuf);
2498   PetscFree(rowners);
2499   PetscFree(dlens);
2500   PetscFree(mask);
2501   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2502   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2503   PetscFunctionReturn(0);
2504 }
2505 
2506 
2507 
2508 #undef __FUNC__
2509 #define __FUNC__ "MatMPIBAIJSetHashTableFactor"
2510 /*@
2511    MatMPIBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2512 
2513    Input Parameters:
2514 .  mat  - the matrix
2515 .  fact - factor
2516 
2517    Collective on Mat
2518 
2519   Notes:
2520    This can also be set by the command line option: -mat_use_hash_table fact
2521 
2522 .keywords: matrix, hashtable, factor, HT
2523 
2524 .seealso: MatSetOption()
2525 @*/
2526 int MatMPIBAIJSetHashTableFactor(Mat mat,double fact)
2527 {
2528   Mat_MPIBAIJ *baij;
2529 
2530   PetscFunctionBegin;
2531   PetscValidHeaderSpecific(mat,MAT_COOKIE);
2532   if (mat->type != MATMPIBAIJ) {
2533       SETERRQ(PETSC_ERR_ARG_WRONG,1,"Incorrect matrix type. Use MPIBAIJ only.");
2534   }
2535   baij = (Mat_MPIBAIJ*) mat->data;
2536   baij->ht_fact = fact;
2537   PetscFunctionReturn(0);
2538 }
2539