xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 7eb4e439e85a62817f0f2ee27461d73cf6c9615d)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.141 1999/01/08 18:15:39 balay Exp balay $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"   /*I  "mat.h"  I*/
6 #include "src/vec/vecimpl.h"
7 
8 extern int MatSetUpMultiply_MPIBAIJ(Mat);
9 extern int DisAssemble_MPIBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
12 extern int MatGetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *);
13 
14 /*
15      Local utility routine that creates a mapping from the global column
16    number to the local number in the off-diagonal part of the local
17    storage of the matrix.  This is done in a non-scalable way, since the
18    length of colmap equals the number of global block columns.
19 */
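/*
   A small worked example of the mapping (hypothetical values, for
   illustration only): with bs = 2 and garray = {3,7} the dense colmap built
   below holds colmap[3] = 0*bs+1 = 1 and colmap[7] = 1*bs+1 = 3, and 0
   everywhere else, so an entry of 0 marks a global block column with no
   local counterpart.  Callers recover the local (scalar) column of a global
   column gcol with

       col = colmap[gcol/bs] - 1 + gcol%bs;

   which is the pattern used in MatSetValues_MPIBAIJ() and
   MatGetValues_MPIBAIJ() below.
*/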
20 #undef __FUNC__
21 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
22 static int CreateColmap_MPIBAIJ_Private(Mat mat)
23 {
24   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
25   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
26   int         nbs = B->nbs,i,bs=B->bs,ierr;
27 
28   PetscFunctionBegin;
29 #if defined (USE_CTABLE)
30   ierr = TableCreate( &baij->colmap, baij->nbs/5 ); CHKERRQ(ierr);
31   for ( i=0; i<nbs; i++ ){
32     ierr = TableAdd( baij->colmap, baij->garray[i] + 1, i*bs+1 );CHKERRQ(ierr);
33   }
34 #else
35   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
36   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
37   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
38   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
39 #endif
40   PetscFunctionReturn(0);
41 }
42 
43 #define CHUNKSIZE  10
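/* CHUNKSIZE is the number of extra block entries (each bs2 Scalars plus one
   block column index) allocated for a row whenever it runs out of space in
   the MatSetValues_SeqBAIJ_*_Private macros below */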
44 
45 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
46 { \
47  \
48     brow = row/bs;  \
49     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
50     rmax = aimax[brow]; nrow = ailen[brow]; \
51       bcol = col/bs; \
52       ridx = row % bs; cidx = col % bs; \
53       low = 0; high = nrow; \
54       while (high-low > 3) { \
55         t = (low+high)/2; \
56         if (rp[t] > bcol) high = t; \
57         else              low  = t; \
58       } \
59       for ( _i=low; _i<high; _i++ ) { \
60         if (rp[_i] > bcol) break; \
61         if (rp[_i] == bcol) { \
62           bap  = ap +  bs2*_i + bs*cidx + ridx; \
63           if (addv == ADD_VALUES) *bap += value;  \
64           else                    *bap  = value;  \
65           goto a_noinsert; \
66         } \
67       } \
68       if (a->nonew == 1) goto a_noinsert; \
69       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
70       if (nrow >= rmax) { \
71         /* there is no extra room in row, therefore enlarge */ \
72         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
73         Scalar *new_a; \
74  \
75         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
76  \
77         /* malloc new storage space */ \
78         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
79         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
80         new_j   = (int *) (new_a + bs2*new_nz); \
81         new_i   = new_j + new_nz; \
82  \
83         /* copy over old data into new slots */ \
84         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
85         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
86         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
87         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
88         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
89                                                            len*sizeof(int)); \
90         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
91         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
92         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
93                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
94         /* free up old matrix storage */ \
95         PetscFree(a->a);  \
96         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
97         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
98         a->singlemalloc = 1; \
99  \
100         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
101         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
102         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
103         a->maxnz += bs2*CHUNKSIZE; \
104         a->reallocs++; \
105         a->nz++; \
106       } \
107       N = nrow++ - 1;  \
108       /* shift up all the later entries in this row */ \
109       for ( ii=N; ii>=_i; ii-- ) { \
110         rp[ii+1] = rp[ii]; \
111         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
112       } \
113       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
114       rp[_i]                      = bcol;  \
115       ap[bs2*_i + bs*cidx + ridx] = value;  \
116       a_noinsert:; \
117     ailen[brow] = nrow; \
118 }
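/*
   A sketch of what the macro above (and its B-matrix twin below) does, with
   hypothetical sizes for illustration: block row brow is scanned for block
   column bcol by a short bisection followed by a linear sweep; the scalar
   entry (ridx,cidx) of stored block _i lives column-major within the block at

       ap[bs2*_i + bs*cidx + ridx]   e.g. bs = 3: block 3, row 2, col 1 -> ap[9*3 + 3*1 + 2] = ap[32]

   If the row is full (nrow >= rmax) the i/j/a arrays are reallocated with
   CHUNKSIZE extra blocks for this row, and the row starts of all later rows
   are shifted by CHUNKSIZE.
*/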
119 
120 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
121 { \
122  \
123     brow = row/bs;  \
124     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
125     rmax = bimax[brow]; nrow = bilen[brow]; \
126       bcol = col/bs; \
127       ridx = row % bs; cidx = col % bs; \
128       low = 0; high = nrow; \
129       while (high-low > 3) { \
130         t = (low+high)/2; \
131         if (rp[t] > bcol) high = t; \
132         else              low  = t; \
133       } \
134       for ( _i=low; _i<high; _i++ ) { \
135         if (rp[_i] > bcol) break; \
136         if (rp[_i] == bcol) { \
137           bap  = ap +  bs2*_i + bs*cidx + ridx; \
138           if (addv == ADD_VALUES) *bap += value;  \
139           else                    *bap  = value;  \
140           goto b_noinsert; \
141         } \
142       } \
143       if (b->nonew == 1) goto b_noinsert; \
144       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
145       if (nrow >= rmax) { \
146         /* there is no extra room in row, therefore enlarge */ \
147         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
148         Scalar *new_a; \
149  \
150         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
151  \
152         /* malloc new storage space */ \
153         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
154         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
155         new_j   = (int *) (new_a + bs2*new_nz); \
156         new_i   = new_j + new_nz; \
157  \
158         /* copy over old data into new slots */ \
159         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
160         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
161         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
162         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
163         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
164                                                            len*sizeof(int)); \
165         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
166         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
167         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
168                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
169         /* free up old matrix storage */ \
170         PetscFree(b->a);  \
171         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
172         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
173         b->singlemalloc = 1; \
174  \
175         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
176         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
177         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
178         b->maxnz += bs2*CHUNKSIZE; \
179         b->reallocs++; \
180         b->nz++; \
181       } \
182       N = nrow++ - 1;  \
183       /* shift up all the later entries in this row */ \
184       for ( ii=N; ii>=_i; ii-- ) { \
185         rp[ii+1] = rp[ii]; \
186         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
187       } \
188       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
189       rp[_i]                      = bcol;  \
190       ap[bs2*_i + bs*cidx + ridx] = value;  \
191       b_noinsert:; \
192     bilen[brow] = nrow; \
193 }
194 
195 #undef __FUNC__
196 #define __FUNC__ "MatSetValues_MPIBAIJ"
197 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
198 {
199   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
200   Scalar      value;
201   int         ierr,i,j,row,col;
202   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
203   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
204   int         cend_orig=baij->cend_bs,bs=baij->bs;
205 
206   /* Some variables required by the MatSetValues_SeqBAIJ_*_Private macros */
207   Mat         A = baij->A;
208   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
209   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
210   Scalar      *aa=a->a;
211 
212   Mat         B = baij->B;
213   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
214   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
215   Scalar      *ba=b->a;
216 
217   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
218   int         low,high,t,ridx,cidx,bs2=a->bs2;
219   Scalar      *ap,*bap;
220 
221   PetscFunctionBegin;
222   for ( i=0; i<m; i++ ) {
223 #if defined(USE_PETSC_BOPT_g)
224     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
225     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
226 #endif
227     if (im[i] >= rstart_orig && im[i] < rend_orig) {
228       row = im[i] - rstart_orig;
229       for ( j=0; j<n; j++ ) {
230         if (in[j] >= cstart_orig && in[j] < cend_orig){
231           col = in[j] - cstart_orig;
232           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
233           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
234           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
235         }
236 #if defined(USE_PETSC_BOPT_g)
237         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
238         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
239 #endif
240         else {
241           if (mat->was_assembled) {
242             if (!baij->colmap) {
243               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
244             }
245 #if defined (USE_CTABLE)
246 	    col = TableFind( baij->colmap, in[j]/bs + 1 ) - 1 + in[j]%bs;
247 #else
248             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
249 #endif
250             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
251               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
252               col =  in[j];
253               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
254               B = baij->B;
255               b = (Mat_SeqBAIJ *) (B)->data;
256               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
257               ba=b->a;
258             }
259           } else col = in[j];
260           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
261           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
262           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
263         }
264       }
265     } else {
266       if (roworiented && !baij->donotstash) {
267         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
268       } else {
269         if (!baij->donotstash) {
270           row = im[i];
271 	  for ( j=0; j<n; j++ ) {
272 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
273           }
274         }
275       }
276     }
277   }
278   PetscFunctionReturn(0);
279 }
280 
281 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
282 #undef __FUNC__
283 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
284 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
285 {
286   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
287   Scalar      *value,*barray=baij->barray;
288   int         ierr,i,j,ii,jj,row,col,k,l;
289   int         roworiented = baij->roworiented,rstart=baij->rstart ;
290   int         rend=baij->rend,cstart=baij->cstart,stepval;
291   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
292 
      PetscFunctionBegin;
293   if (!barray) {
294     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
295   }
296 
297   if (roworiented) {
298     stepval = (n-1)*bs;
299   } else {
300     stepval = (m-1)*bs;
301   }
302   for ( i=0; i<m; i++ ) {
303 #if defined(USE_PETSC_BOPT_g)
304     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
305     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
306 #endif
307     if (im[i] >= rstart && im[i] < rend) {
308       row = im[i] - rstart;
309       for ( j=0; j<n; j++ ) {
310         /* if there is a single block column (row oriented) or a single block row (column oriented), no copy is required */
311         if ((roworiented) && (n == 1)) {
312           barray = v + i*bs2;
313         } else if((!roworiented) && (m == 1)) {
314           barray = v + j*bs2;
315         } else { /* Here a copy is required */
316           if (roworiented) {
317             value = v + i*(stepval+bs)*bs + j*bs;
318           } else {
319             value = v + j*(stepval+bs)*bs + i*bs;
320           }
321           for ( ii=0; ii<bs; ii++,value+=stepval ) {
322             for (jj=0; jj<bs; jj++ ) {
323               *barray++  = *value++;
324             }
325           }
326           barray -=bs2;
327         }
328 
329         if (in[j] >= cstart && in[j] < cend){
330           col  = in[j] - cstart;
331           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
332         }
333 #if defined(USE_PETSC_BOPT_g)
334         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
335         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
336 #endif
337         else {
338           if (mat->was_assembled) {
339             if (!baij->colmap) {
340               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
341             }
342 
343 #if defined(USE_PETSC_BOPT_g)
344 #if defined (USE_CTABLE)
345             if( (TableFind( baij->colmap, in[j] + 1 ) - 1) % bs)
346 	      {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
347 #else
348             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
349 #endif
350 #endif
351 #if defined (USE_CTABLE)
352 	    col = (TableFind( baij->colmap, in[j] + 1 ) - 1)/bs;
353 #else
354             col = (baij->colmap[in[j]] - 1)/bs;
355 #endif
356             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
357               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
358               col =  in[j];
359             }
360           }
361           else col = in[j];
362           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
363         }
364       }
365     } else {
366       if (!baij->donotstash) {
367         if (roworiented ) {
368           row   = im[i]*bs;
369           value = v + i*(stepval+bs)*bs;
370           for ( j=0; j<bs; j++,row++ ) {
371             for ( k=0; k<n; k++ ) {
372               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
373                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
374               }
375             }
376           }
377         } else {
378           for ( j=0; j<n; j++ ) {
379             value = v + j*(stepval+bs)*bs + i*bs;
380             col   = in[j]*bs;
381             for ( k=0; k<bs; k++,col++,value+=stepval) {
382               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
383                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
384               }
385             }
386           }
387         }
388       }
389     }
390   }
391   PetscFunctionReturn(0);
392 }
393 #define HASH_KEY 0.6180339887
394 /* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
395 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
396 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
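/*
   HASH() is multiplicative hashing: the key is scaled by the golden-ratio
   constant and the fractional part selects a slot.  A worked example with
   hypothetical numbers: size = 100, key = 5 gives tmp = 5*0.6180339887 =
   3.0902..., so HASH(100,5,tmp) = (int)(100*0.0902...) = 9.  Keys are built
   from global block indices, e.g. key = (row/bs)*Nbs + (col/bs) + 1 below;
   the +1 keeps every valid key nonzero, since 0 marks an empty table slot.
*/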
397 #undef __FUNC__
398 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
399 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
400 {
401   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
402   int         ierr,i,j,row,col;
403   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
404   int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
405   int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
406   double      tmp;
407   Scalar      ** HD = baij->hd,value;
408 #if defined(USE_PETSC_BOPT_g)
409   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
410 #endif
411 
412   PetscFunctionBegin;
413 
414   for ( i=0; i<m; i++ ) {
415 #if defined(USE_PETSC_BOPT_g)
416     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
417     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
418 #endif
419     row = im[i];
420     if (row >= rstart_orig && row < rend_orig) {
421       for ( j=0; j<n; j++ ) {
422         col = in[j];
423         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
424         /* Look up the block in the hash table */
425         key = (row/bs)*Nbs+(col/bs)+1;
426         h1  = HASH(size,key,tmp);
427 
428 
429         idx = h1;
430 #if defined(USE_PETSC_BOPT_g)
431         insert_ct++;
432         total_ct++;
433         if (HT[idx] != key) {
434           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
435           if (idx == size) {
436             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
437             if (idx == h1) {
438               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
439             }
440           }
441         }
442 #else
443         if (HT[idx] != key) {
444           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
445           if (idx == size) {
446             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
447             if (idx == h1) {
448               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
449             }
450           }
451         }
452 #endif
453         /* A HASH table entry is found, so insert the values at the correct address */
454         if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
455         else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
456       }
457     } else {
458       if (roworiented && !baij->donotstash) {
459         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
460       } else {
461         if (!baij->donotstash) {
462           row = im[i];
463 	  for ( j=0; j<n; j++ ) {
464 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
465           }
466         }
467       }
468     }
469   }
470 #if defined(USE_PETSC_BOPT_g)
471   baij->ht_total_ct = total_ct;
472   baij->ht_insert_ct = insert_ct;
473 #endif
474   PetscFunctionReturn(0);
475 }
476 
477 #undef __FUNC__
478 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
479 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
480 {
481   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
482   int         ierr,i,j,ii,jj,row,col,k,l;
483   int         roworiented = baij->roworiented,rstart=baij->rstart ;
484   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
485   int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
486   double      tmp;
487   Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
488 #if defined(USE_PETSC_BOPT_g)
489   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
490 #endif
491 
492   PetscFunctionBegin;
493 
494   if (roworiented) {
495     stepval = (n-1)*bs;
496   } else {
497     stepval = (m-1)*bs;
498   }
499   for ( i=0; i<m; i++ ) {
500 #if defined(USE_PETSC_BOPT_g)
501     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
502     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
503 #endif
504     row   = im[i];
505     v_t   = v + i*bs2;
506     if (row >= rstart && row < rend) {
507       for ( j=0; j<n; j++ ) {
508         col = in[j];
509 
510         /* Look up the block in the hash table */
511         key = row*Nbs+col+1;
512         h1  = HASH(size,key,tmp);
513 
514         idx = h1;
515 #if defined(USE_PETSC_BOPT_g)
516         total_ct++;
517         insert_ct++;
518         if (HT[idx] != key) {
519           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
520           if (idx == size) {
521             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
522             if (idx == h1) {
523               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
524             }
525           }
526         }
527 #else
528         if (HT[idx] != key) {
529           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
530           if (idx == size) {
531             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
532             if (idx == h1) {
533               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
534             }
535           }
536         }
537 #endif
538         baij_a = HD[idx];
539         if (roworiented) {
540           /*value = v + i*(stepval+bs)*bs + j*bs;*/
541           /* value = v + (i*(stepval+bs)+j)*bs; */
542           value = v_t;
543           v_t  += bs;
544           if (addv == ADD_VALUES) {
545             for ( ii=0; ii<bs; ii++,value+=stepval) {
546               for ( jj=ii; jj<bs2; jj+=bs ) {
547                 baij_a[jj]  += *value++;
548               }
549             }
550           } else {
551             for ( ii=0; ii<bs; ii++,value+=stepval) {
552               for ( jj=ii; jj<bs2; jj+=bs ) {
553                 baij_a[jj]  = *value++;
554               }
555             }
556           }
557         } else {
558           value = v + j*(stepval+bs)*bs + i*bs;
559           if (addv == ADD_VALUES) {
560             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
561               for ( jj=0; jj<bs; jj++ ) {
562                 baij_a[jj]  += *value++;
563               }
564             }
565           } else {
566             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
567               for ( jj=0; jj<bs; jj++ ) {
568                 baij_a[jj]  = *value++;
569               }
570             }
571           }
572         }
573       }
574     } else {
575       if (!baij->donotstash) {
576         if (roworiented ) {
577           row   = im[i]*bs;
578           value = v + i*(stepval+bs)*bs;
579           for ( j=0; j<bs; j++,row++ ) {
580             for ( k=0; k<n; k++ ) {
581               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
582                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
583               }
584             }
585           }
586         } else {
587           for ( j=0; j<n; j++ ) {
588             value = v + j*(stepval+bs)*bs + i*bs;
589             col   = in[j]*bs;
590             for ( k=0; k<bs; k++,col++,value+=stepval) {
591               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
592                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
593               }
594             }
595           }
596         }
597       }
598     }
599   }
600 #if defined(USE_PETSC_BOPT_g)
601   baij->ht_total_ct = total_ct;
602   baij->ht_insert_ct = insert_ct;
603 #endif
604   PetscFunctionReturn(0);
605 }
606 
607 #undef __FUNC__
608 #define __FUNC__ "MatGetValues_MPIBAIJ"
609 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
610 {
611   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
612   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
613   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col,data;
614 
615   PetscFunctionBegin;
616   for ( i=0; i<m; i++ ) {
617     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
618     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
619     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
620       row = idxm[i] - bsrstart;
621       for ( j=0; j<n; j++ ) {
622         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
623         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
624         if (idxn[j] >= bscstart && idxn[j] < bscend){
625           col = idxn[j] - bscstart;
626           ierr = MatGetValues_SeqBAIJ(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
627         } else {
628           if (!baij->colmap) {
629             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
630           }
631 #if defined (USE_CTABLE)
632           data = TableFind( baij->colmap, idxn[j]/bs + 1 ) - 1;
633 #else
634           data = baij->colmap[idxn[j]/bs]-1;
635 #endif
636           if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
637           else {
638             col  = data + idxn[j]%bs;
639             ierr = MatGetValues_SeqBAIJ(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
640           }
641         }
642       }
643     } else {
644       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
645     }
646   }
647  PetscFunctionReturn(0);
648 }
649 
650 #undef __FUNC__
651 #define __FUNC__ "MatNorm_MPIBAIJ"
652 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
653 {
654   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
655   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
656   int        ierr, i,bs2=baij->bs2;
657   double     sum = 0.0;
658   Scalar     *v;
659 
660   PetscFunctionBegin;
661   if (baij->size == 1) {
662     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
663   } else {
664     if (type == NORM_FROBENIUS) {
665       v = amat->a;
666       for (i=0; i<amat->nz*bs2; i++ ) {
667 #if defined(USE_PETSC_COMPLEX)
668         sum += PetscReal(PetscConj(*v)*(*v)); v++;
669 #else
670         sum += (*v)*(*v); v++;
671 #endif
672       }
673       v = bmat->a;
674       for (i=0; i<bmat->nz*bs2; i++ ) {
675 #if defined(USE_PETSC_COMPLEX)
676         sum += PetscReal(PetscConj(*v)*(*v)); v++;
677 #else
678         sum += (*v)*(*v); v++;
679 #endif
680       }
681       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
682       *norm = sqrt(*norm);
683     } else {
684       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
685     }
686   }
687   PetscFunctionReturn(0);
688 }
689 
690 #undef __FUNC__
691 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
692 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
693 {
694   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
695   MPI_Comm    comm = mat->comm;
696   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
697   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
698   MPI_Request *send_waits,*recv_waits;
699   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
700   InsertMode  addv;
701   Scalar      *rvalues,*svalues;
702 
703   PetscFunctionBegin;
704   if (baij->donotstash) {
705     baij->svalues    = 0; baij->rvalues    = 0;
706     baij->nsends     = 0; baij->nrecvs     = 0;
707     baij->send_waits = 0; baij->recv_waits = 0;
708     baij->rmax       = 0;
709     PetscFunctionReturn(0);
710   }
711 
712   /* make sure all processors are either in INSERTMODE or ADDMODE */
713   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
714   if (addv == (ADD_VALUES|INSERT_VALUES)) {
715     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
716   }
717   mat->insertmode = addv; /* in case this processor had no cache */
718 
719   /*  first count number of contributors to each processor */
720   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
721   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
722   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
723   for ( i=0; i<baij->stash.n; i++ ) {
724     idx = baij->stash.idx[i];
725     for ( j=0; j<size; j++ ) {
726       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
727         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
728       }
729     }
730   }
731   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
732 
733   /* inform other processors of number of messages and max length*/
734   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
735   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
736   nreceives = work[rank];
737   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
738   nmax      = work[rank];
739   PetscFree(work);
740 
741   /* post receives:
742        1) each message will consist of ordered triples
743      (global row index, global column index, value); the indices are stored
744      as Scalars to simplify the message passing.
745        2) since we don't know how long each individual message is, we
746      allocate the largest needed buffer for each receive. Potentially
747      this is a lot of wasted space.
748
749
750        This could be done better.
751   */
752   rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
753   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
754   for ( i=0; i<nreceives; i++ ) {
755     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
756   }
757 
758   /* do sends:
759       1) starts[i] gives the starting index in svalues for stuff going to
760          the ith processor
761   */
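  /*
     A sketch of the starts[] bookkeeping (hypothetical counts, for
     illustration only): with size = 3 and nprocs = {2,0,3} the prefix sums
     give starts = {0,2,2}, so the two stashed entries owned by processor 0
     are packed at svalues[0..5] and the three owned by processor 2 at
     svalues[6..14] (3 Scalars per entry).  The packing loop advances
     starts[owner] as it fills svalues, which is why starts[] is rebuilt
     before the MPI_Isend() calls below.
  */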
762   svalues    = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
763   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
764   starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
765   starts[0] = 0;
766   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
767   for ( i=0; i<baij->stash.n; i++ ) {
768     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
769     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
770     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
771   }
772   PetscFree(owner);
773   starts[0] = 0;
774   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
775   count = 0;
776   for ( i=0; i<size; i++ ) {
777     if (procs[i]) {
778       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
779     }
780   }
781   PetscFree(starts); PetscFree(nprocs);
782 
783   /* Free cache space */
784   PLogInfo(baij->A,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
785   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
786 
787   baij->svalues    = svalues;    baij->rvalues    = rvalues;
788   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
789   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
790   baij->rmax       = nmax;
791 
792   PetscFunctionReturn(0);
793 }
794 
795 /*
796   Creates the hash table and fills it in.
797   This table is created only once.
798   If new entries need to be added to the matrix
799   then the hash table has to be destroyed and
800   recreated.
801 */
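/*
   A sketch of the scheme used below (a reading of the code, with hypothetical
   numbers): every stored block is keyed by its global block row and column as
   key = row*Nbs + col + 1; h1 = HASH(size,key,tmp) picks a starting slot and
   the block is placed by linear probing at the first k with
   HT[(h1+k)%size] == 0, with HD[(h1+k)%size] pointed at the bs2 Scalars of
   the block inside a->a or b->a.  Because 0 means "empty", keys are offset
   by +1.  ht_size = factor*nz, so a factor > 1 leaves free slots and keeps
   the probe sequences short.
*/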
802 #undef __FUNC__
803 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
804 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
805 {
806   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
807   Mat         A = baij->A, B=baij->B;
808   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
809   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
810   int         size,bs2=baij->bs2,rstart=baij->rstart;
811   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
812   int         *HT,key;
813   Scalar      **HD;
814   double      tmp;
815 #if defined(USE_PETSC_BOPT_g)
816   int         ct=0,max=0;
817 #endif
818 
819   PetscFunctionBegin;
820   baij->ht_size=(int)(factor*nz);
821   size = baij->ht_size;
822 
823   if (baij->ht) {
824     PetscFunctionReturn(0);
825   }
826 
827   /* Allocate Memory for Hash Table */
828   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
829   baij->ht = (int*)(baij->hd + size);
830   HD = baij->hd;
831   HT = baij->ht;
832 
833 
834   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
835 
836 
837   /* Loop Over A */
838   for ( i=0; i<a->mbs; i++ ) {
839     for ( j=ai[i]; j<ai[i+1]; j++ ) {
840       row = i+rstart;
841       col = aj[j]+cstart;
842 
843       key = row*Nbs + col + 1;
844       h1  = HASH(size,key,tmp);
845       for ( k=0; k<size; k++ ){
846         if (HT[(h1+k)%size] == 0) {
847           HT[(h1+k)%size] = key;
848           HD[(h1+k)%size] = a->a + j*bs2;
849           break;
850 #if defined(USE_PETSC_BOPT_g)
851         } else {
852           ct++;
853 #endif
854         }
855       }
856 #if defined(USE_PETSC_BOPT_g)
857       if (k> max) max = k;
858 #endif
859     }
860   }
861   /* Loop Over B */
862   for ( i=0; i<b->mbs; i++ ) {
863     for ( j=bi[i]; j<bi[i+1]; j++ ) {
864       row = i+rstart;
865       col = garray[bj[j]];
866       key = row*Nbs + col + 1;
867       h1  = HASH(size,key,tmp);
868       for ( k=0; k<size; k++ ){
869         if (HT[(h1+k)%size] == 0) {
870           HT[(h1+k)%size] = key;
871           HD[(h1+k)%size] = b->a + j*bs2;
872           break;
873 #if defined(USE_PETSC_BOPT_g)
874         } else {
875           ct++;
876 #endif
877         }
878       }
879 #if defined(USE_PETSC_BOPT_g)
880       if (k> max) max = k;
881 #endif
882     }
883   }
884 
885   /* Print Summary */
886 #if defined(USE_PETSC_BOPT_g)
887   for ( i=0,j=0; i<size; i++)
888     if (HT[i]) {j++;}
889   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
890            (j== 0)? 0.0:((double)(ct+j))/j,max);
891 #endif
892   PetscFunctionReturn(0);
893 }
894 
895 #undef __FUNC__
896 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
897 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
898 {
899   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
900   MPI_Status  *send_status,recv_status;
901   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
902   int         bs=baij->bs,row,col,other_disassembled;
903   Scalar      *values,val;
904   InsertMode  addv = mat->insertmode;
905 
906   PetscFunctionBegin;
907   /*  wait on receives */
908   while (count) {
909     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
910     /* unpack receives into our local space */
911     values = baij->rvalues + 3*imdex*baij->rmax;
912     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
913     n = n/3;
914     for ( i=0; i<n; i++ ) {
915       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
916       col = (int) PetscReal(values[3*i+1]);
917       val = values[3*i+2];
918       if (col >= baij->cstart*bs && col < baij->cend*bs) {
919         col -= baij->cstart*bs;
920         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
921       } else {
922         if (mat->was_assembled) {
923           if (!baij->colmap) {
924             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
925           }
926 #if defined (USE_CTABLE)
927 	  col = TableFind( baij->colmap, col/bs + 1 ) - 1 + col%bs;
928 #else
929           col = (baij->colmap[col/bs]) - 1 + col%bs;
930 #endif
931           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
932             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
933             col = (int) PetscReal(values[3*i+1]);
934           }
935         }
936         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
937       }
938     }
939     count--;
940   }
941   if (baij->recv_waits) PetscFree(baij->recv_waits);
942   if (baij->rvalues)    PetscFree(baij->rvalues);
943 
944   /* wait on sends */
945   if (baij->nsends) {
946     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
947     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
948     PetscFree(send_status);
949   }
950   if (baij->send_waits) PetscFree(baij->send_waits);
951   if (baij->svalues)    PetscFree(baij->svalues);
952 
953   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
954   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
955 
956   /* determine if any processor has disassembled; if so, we must
957      also disassemble ourselves so that we can reassemble. */
958   /*
959      if the nonzero structure of submatrix B cannot change, then we know that
960      no processor disassembled, and thus we can skip this step
961   */
962   if (!((Mat_SeqBAIJ*) baij->B->data)->nonew)  {
963     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
964     if (mat->was_assembled && !other_disassembled) {
965       ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
966     }
967   }
968 
969   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
970     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
971   }
972   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
973   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
974 
975 #if defined(USE_PETSC_BOPT_g)
976   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
977     PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
978              ((double)baij->ht_total_ct)/baij->ht_insert_ct);
979     baij->ht_total_ct  = 0;
980     baij->ht_insert_ct = 0;
981   }
982 #endif
983   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
984     ierr = MatCreateHashTable_MPIBAIJ_Private(mat,baij->ht_fact); CHKERRQ(ierr);
985     mat->ops->setvalues        = MatSetValues_MPIBAIJ_HT;
986     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
987   }
988 
989   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
990   PetscFunctionReturn(0);
991 }
992 
993 #undef __FUNC__
994 #define __FUNC__ "MatView_MPIBAIJ_Binary"
995 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
996 {
997   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
998   int          ierr;
999 
1000   PetscFunctionBegin;
1001   if (baij->size == 1) {
1002     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1003   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 #undef __FUNC__
1008 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
1009 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
1010 {
1011   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1012   int          ierr, format,bs = baij->bs, size = baij->size, rank = baij->rank;
1013   FILE         *fd;
1014   ViewerType   vtype;
1015 
1016   PetscFunctionBegin;
1017   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1018   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
1019     ierr = ViewerGetFormat(viewer,&format);
1020     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
1021       MatInfo info;
1022       MPI_Comm_rank(mat->comm,&rank);
1023       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
1024       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
1025       PetscSequentialPhaseBegin(mat->comm,1);
1026       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
1027               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
1028               baij->bs,(int)info.memory);
1029       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
1030       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1031       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
1032       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1033       fflush(fd);
1034       PetscSequentialPhaseEnd(mat->comm,1);
1035       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
1036       PetscFunctionReturn(0);
1037     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
1038       PetscPrintf(mat->comm,"  block size is %d\n",bs);
1039       PetscFunctionReturn(0);
1040     }
1041   }
1042 
1043   if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
1044     Draw       draw;
1045     PetscTruth isnull;
1046     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
1047     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
1048   }
1049 
1050   if (size == 1) {
1051     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1052   } else {
1053     /* assemble the entire matrix onto the first processor */
1054     Mat         A;
1055     Mat_SeqBAIJ *Aloc;
1056     int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
1057     int         mbs = baij->mbs;
1058     Scalar      *a;
1059 
1060     if (!rank) {
1061       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1062     } else {
1063       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1064     }
1065     PLogObjectParent(mat,A);
1066 
1067     /* copy over the A part */
1068     Aloc = (Mat_SeqBAIJ*) baij->A->data;
1069     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1070     rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1071 
1072     for ( i=0; i<mbs; i++ ) {
1073       rvals[0] = bs*(baij->rstart + i);
1074       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1075       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1076         col = (baij->cstart+aj[j])*bs;
1077         for (k=0; k<bs; k++ ) {
1078           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1079           col++; a += bs;
1080         }
1081       }
1082     }
1083     /* copy over the B part */
1084     Aloc = (Mat_SeqBAIJ*) baij->B->data;
1085     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1086     for ( i=0; i<mbs; i++ ) {
1087       rvals[0] = bs*(baij->rstart + i);
1088       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1089       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1090         col = baij->garray[aj[j]]*bs;
1091         for (k=0; k<bs; k++ ) {
1092           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1093           col++; a += bs;
1094         }
1095       }
1096     }
1097     PetscFree(rvals);
1098     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1099     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1100     /*
1101        Everyone has to participate in the MatView() call when drawing, since the
1102        graphics waits are synchronized across all processors that share the Draw object
1103     */
1104     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
1105       ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1106     }
1107     ierr = MatDestroy(A); CHKERRQ(ierr);
1108   }
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 
1113 
1114 #undef __FUNC__
1115 #define __FUNC__ "MatView_MPIBAIJ"
1116 int MatView_MPIBAIJ(Mat mat,Viewer viewer)
1117 {
1118   int         ierr;
1119   ViewerType  vtype;
1120 
1121   PetscFunctionBegin;
1122   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1123   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
1124       PetscTypeCompare(vtype,MATLAB_VIEWER)) {
1125     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1126   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
1127     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1128   } else {
1129     SETERRQ(1,1,"Viewer type not supported by PETSc object");
1130   }
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 #undef __FUNC__
1135 #define __FUNC__ "MatDestroy_MPIBAIJ"
1136 int MatDestroy_MPIBAIJ(Mat mat)
1137 {
1138   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1139   int         ierr;
1140 
1141   PetscFunctionBegin;
1142   if (--mat->refct > 0) PetscFunctionReturn(0);
1143 
1144   if (mat->mapping) {
1145     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
1146   }
1147   if (mat->bmapping) {
1148     ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
1149   }
1150   if (mat->rmap) {
1151     ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
1152   }
1153   if (mat->cmap) {
1154     ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
1155   }
1156 #if defined(USE_PETSC_LOG)
1157   PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",baij->M,baij->N);
1158 #endif
1159 
1160   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1161   PetscFree(baij->rowners);
1162   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1163   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1164 #if defined (USE_CTABLE)
1165   if (baij->colmap) TableDelete(baij->colmap);
1166 #else
1167   if (baij->colmap) PetscFree(baij->colmap);
1168 #endif
1169   if (baij->garray) PetscFree(baij->garray);
1170   if (baij->lvec)   VecDestroy(baij->lvec);
1171   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1172   if (baij->rowvalues) PetscFree(baij->rowvalues);
1173   if (baij->barray) PetscFree(baij->barray);
1174   if (baij->hd) PetscFree(baij->hd);
1175   PetscFree(baij);
1176   PLogObjectDestroy(mat);
1177   PetscHeaderDestroy(mat);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNC__
1182 #define __FUNC__ "MatMult_MPIBAIJ"
1183 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1184 {
1185   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1186   int         ierr, nt;
1187 
1188   PetscFunctionBegin;
1189   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1190   if (nt != a->n) {
1191     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1192   }
1193   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
1194   if (nt != a->m) {
1195     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and yy");
1196   }
1197   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1198   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
1199   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1200   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1201   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 #undef __FUNC__
1206 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1207 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1208 {
1209   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1210   int        ierr;
1211 
1212   PetscFunctionBegin;
1213   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1214   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1215   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1216   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 #undef __FUNC__
1221 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1222 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1223 {
1224   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1225   int         ierr;
1226 
1227   PetscFunctionBegin;
1228   /* do nondiagonal part */
1229   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1230   /* send it on its way */
1231   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1232   /* do local part */
1233   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1234   /* receive remote parts: note this assumes the values are not actually */
1235   /* inserted in yy until the next line, which is true for my implementation */
1236   /* but is perhaps not always true. */
1237   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 #undef __FUNC__
1242 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1243 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1244 {
1245   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1246   int         ierr;
1247 
1248   PetscFunctionBegin;
1249   /* do nondiagonal part */
1250   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1251   /* send it on its way */
1252   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1253   /* do local part */
1254   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1255   /* receive remote parts: note this assumes the values are not actually */
1256   /* inserted in zz until the next line, which is true for my implementation */
1257   /* but is perhaps not always true. */
1258   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /*
1263   This only works correctly for square matrices where the subblock A->A is the
1264    diagonal block
1265 */
1266 #undef __FUNC__
1267 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1268 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1269 {
1270   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1271   int         ierr;
1272 
1273   PetscFunctionBegin;
1274   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1275   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNC__
1280 #define __FUNC__ "MatScale_MPIBAIJ"
1281 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1282 {
1283   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1284   int         ierr;
1285 
1286   PetscFunctionBegin;
1287   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1288   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 #undef __FUNC__
1293 #define __FUNC__ "MatGetSize_MPIBAIJ"
1294 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1295 {
1296   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1297 
1298   PetscFunctionBegin;
1299   if (m) *m = mat->M;
1300   if (n) *n = mat->N;
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNC__
1305 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1306 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1307 {
1308   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1309 
1310   PetscFunctionBegin;
1311   *m = mat->m; *n = mat->n;
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 #undef __FUNC__
1316 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1317 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1318 {
1319   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1320 
1321   PetscFunctionBegin;
1322   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1327 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1328 
1329 #undef __FUNC__
1330 #define __FUNC__ "MatGetRow_MPIBAIJ"
1331 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1332 {
1333   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1334   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1335   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1336   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1337   int        *cmap, *idx_p,cstart = mat->cstart;
1338 
1339   PetscFunctionBegin;
1340   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1341   mat->getrowactive = PETSC_TRUE;
1342 
1343   if (!mat->rowvalues && (idx || v)) {
1344     /*
1345         allocate enough space to hold information from the longest row.
1346     */
1347     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1348     int     max = 1,mbs = mat->mbs,tmp;
1349     for ( i=0; i<mbs; i++ ) {
1350       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1351       if (max < tmp) { max = tmp; }
1352     }
1353     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1354     CHKPTRQ(mat->rowvalues);
1355     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1356   }
1357 
1358   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
1359   lrow = row - brstart;
1360 
1361   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1362   if (!v)   {pvA = 0; pvB = 0;}
1363   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1364   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1365   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1366   nztot = nzA + nzB;
1367 
1368   cmap  = mat->garray;
1369   if (v  || idx) {
1370     if (nztot) {
1371       /* Sort by increasing column numbers, assuming A and B already sorted */
1372       int imark = -1;
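      /* imark counts the B (off-diagonal) entries whose global block column
         lies below cstart; they are copied ahead of the A (diagonal-block)
         entries and the remaining B entries follow, so the merged row stays
         sorted by global column */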
1373       if (v) {
1374         *v = v_p = mat->rowvalues;
1375         for ( i=0; i<nzB; i++ ) {
1376           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1377           else break;
1378         }
1379         imark = i;
1380         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1381         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1382       }
1383       if (idx) {
1384         *idx = idx_p = mat->rowindices;
1385         if (imark > -1) {
1386           for ( i=0; i<imark; i++ ) {
1387             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1388           }
1389         } else {
1390           for ( i=0; i<nzB; i++ ) {
1391             if (cmap[cworkB[i]/bs] < cstart)
1392               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1393             else break;
1394           }
1395           imark = i;
1396         }
1397         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1398         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1399       }
1400     } else {
1401       if (idx) *idx = 0;
1402       if (v)   *v   = 0;
1403     }
1404   }
1405   *nz = nztot;
1406   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1407   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 #undef __FUNC__
1412 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1413 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1414 {
1415   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1416 
1417   PetscFunctionBegin;
1418   if (baij->getrowactive == PETSC_FALSE) {
1419     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1420   }
1421   baij->getrowactive = PETSC_FALSE;
1422   PetscFunctionReturn(0);
1423 }
1424 
1425 #undef __FUNC__
1426 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1427 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1428 {
1429   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1430 
1431   PetscFunctionBegin;
1432   *bs = baij->bs;
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 #undef __FUNC__
1437 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1438 int MatZeroEntries_MPIBAIJ(Mat A)
1439 {
1440   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1441   int         ierr;
1442 
1443   PetscFunctionBegin;
1444   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1445   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1446   PetscFunctionReturn(0);
1447 }
1448 
1449 #undef __FUNC__
1450 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1451 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1452 {
1453   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1454   Mat         A = a->A, B = a->B;
1455   int         ierr;
1456   double      isend[5], irecv[5];
1457 
1458   PetscFunctionBegin;
1459   info->block_size     = (double)a->bs;
1460   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1461   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1462   isend[3] = info->memory;  isend[4] = info->mallocs;
1463   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1464   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1465   isend[3] += info->memory;  isend[4] += info->mallocs;
1466   if (flag == MAT_LOCAL) {
1467     info->nz_used      = isend[0];
1468     info->nz_allocated = isend[1];
1469     info->nz_unneeded  = isend[2];
1470     info->memory       = isend[3];
1471     info->mallocs      = isend[4];
1472   } else if (flag == MAT_GLOBAL_MAX) {
1473     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1474     info->nz_used      = irecv[0];
1475     info->nz_allocated = irecv[1];
1476     info->nz_unneeded  = irecv[2];
1477     info->memory       = irecv[3];
1478     info->mallocs      = irecv[4];
1479   } else if (flag == MAT_GLOBAL_SUM) {
1480     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1481     info->nz_used      = irecv[0];
1482     info->nz_allocated = irecv[1];
1483     info->nz_unneeded  = irecv[2];
1484     info->memory       = irecv[3];
1485     info->mallocs      = irecv[4];
1486   }
1487   info->rows_global       = (double)a->M;
1488   info->columns_global    = (double)a->N;
1489   info->rows_local        = (double)a->m;
1490   info->columns_local     = (double)a->N;
1491   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1492   info->fill_ratio_needed = 0;
1493   info->factor_mallocs    = 0;
1494   PetscFunctionReturn(0);
1495 }
1496 
1497 #undef __FUNC__
1498 #define __FUNC__ "MatSetOption_MPIBAIJ"
1499 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1500 {
1501   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1502 
1503   PetscFunctionBegin;
1504   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1505       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1506       op == MAT_COLUMNS_UNSORTED ||
1507       op == MAT_COLUMNS_SORTED ||
1508       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1509       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1510         MatSetOption(a->A,op);
1511         MatSetOption(a->B,op);
1512   } else if (op == MAT_ROW_ORIENTED) {
1513         a->roworiented = 1;
1514         MatSetOption(a->A,op);
1515         MatSetOption(a->B,op);
1516   } else if (op == MAT_ROWS_SORTED ||
1517              op == MAT_ROWS_UNSORTED ||
1518              op == MAT_SYMMETRIC ||
1519              op == MAT_STRUCTURALLY_SYMMETRIC ||
1520              op == MAT_YES_NEW_DIAGONALS)
1522     PLogInfo(A,"MatSetOption_MPIBAIJ:Option ignored\n");
1523   else if (op == MAT_COLUMN_ORIENTED) {
1524     a->roworiented = 0;
1525     MatSetOption(a->A,op);
1526     MatSetOption(a->B,op);
1527   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1528     a->donotstash = 1;
1529   } else if (op == MAT_NO_NEW_DIAGONALS) {
1530     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1531   } else if (op == MAT_USE_HASH_TABLE) {
1532     a->ht_flag = 1;
1533   } else {
1534     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1535   }
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 #undef __FUNC__
1540 #define __FUNC__ "MatTranspose_MPIBAIJ"
1541 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1542 {
1543   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1544   Mat_SeqBAIJ *Aloc;
1545   Mat        B;
1546   int        ierr,M=baij->M,N=baij->N,*ai,*aj,i,*rvals,j,k,col;
1547   int        bs=baij->bs,mbs=baij->mbs;
1548   Scalar     *a;
1549 
1550   PetscFunctionBegin;
1551   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1552   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1553   CHKERRQ(ierr);
1554 
1555   /* copy over the A part */
1556   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1557   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1558   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1559 
1560   for ( i=0; i<mbs; i++ ) {
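         /* rvals[] holds the bs global point-row numbers of block row i on this process;
            they become the column indices of the transposed entries inserted below */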
1561     rvals[0] = bs*(baij->rstart + i);
1562     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1563     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1564       col = (baij->cstart+aj[j])*bs;
1565       for (k=0; k<bs; k++ ) {
1566         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1567         col++; a += bs;
1568       }
1569     }
1570   }
1571   /* copy over the B part */
1572   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1573   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1574   for ( i=0; i<mbs; i++ ) {
1575     rvals[0] = bs*(baij->rstart + i);
1576     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1577     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1578       col = baij->garray[aj[j]]*bs;
1579       for (k=0; k<bs; k++ ) {
1580         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1581         col++; a += bs;
1582       }
1583     }
1584   }
1585   PetscFree(rvals);
1586   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1587   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1588 
1589   if (matout != PETSC_NULL) {
1590     *matout = B;
1591   } else {
1592     PetscOps *Abops;
1593     MatOps   Aops;
1594 
1595     /* This isn't really an in-place transpose, but we do free the data structures of baij */
1596     PetscFree(baij->rowners);
1597     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1598     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1599 #if defined (USE_CTABLE)
1600     if (baij->colmap) TableDelete(baij->colmap);
1601 #else
1602     if (baij->colmap) PetscFree(baij->colmap);
1603 #endif
1604     if (baij->garray) PetscFree(baij->garray);
1605     if (baij->lvec) VecDestroy(baij->lvec);
1606     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1607     PetscFree(baij);
1608 
1609     /*
1610        This is horrible, horrible code. We need to keep the
1611       A pointers for the bops and ops but copy everything
1612       else from B.
1613     */
1614     Abops = A->bops;
1615     Aops  = A->ops;
1616     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1617     A->bops = Abops;
1618     A->ops  = Aops;
1619 
1620     PetscHeaderDestroy(B);
1621   }
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 #undef __FUNC__
1626 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1627 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1628 {
1629   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1630   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1631   int ierr,s1,s2,s3;
1632 
1633   PetscFunctionBegin;
1634   if (ll)  {
1635     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1636     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1637     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1638     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1639     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1640   }
1641   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1642   PetscFunctionReturn(0);
1643 }
1644 
1645 #undef __FUNC__
1646 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1647 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1648 {
1649   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1650   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1651   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1652   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1653   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1654   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1655   MPI_Comm       comm = A->comm;
1656   MPI_Request    *send_waits,*recv_waits;
1657   MPI_Status     recv_status,*send_status;
1658   IS             istmp;
1659   PetscTruth     localdiag;
1660 
1661   PetscFunctionBegin;
1662   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1663   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1664 
1665   /*  first count number of contributors to each processor */
1666   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1667   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1668   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1669   for ( i=0; i<N; i++ ) {
1670     idx = rows[i];
1671     found = 0;
1672     for ( j=0; j<size; j++ ) {
1673       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1674         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1675       }
1676     }
1677     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1678   }
1679   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1680 
1681   /* inform other processors of number of messages and max length*/
1682   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
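       /* the MPI_SUM reduction below tells each process how many processes will send row
          indices to it (nrecvs); the MPI_MAX reduction bounds the length of the longest
          such incoming message (nmax), which is used to size the receive buffers */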
1683   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1684   nrecvs = work[rank];
1685   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1686   nmax   = work[rank];
1687   PetscFree(work);
1688 
1689   /* post receives:   */
1690   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1691   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1692   for ( i=0; i<nrecvs; i++ ) {
1693     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1694   }
1695 
1696   /* do sends:
1697      1) starts[i] gives the starting index in svalues for stuff going to
1698      the ith processor
1699   */
1700   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1701   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1702   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1703   starts[0] = 0;
1704   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1705   for ( i=0; i<N; i++ ) {
1706     svalues[starts[owner[i]]++] = rows[i];
1707   }
1708   ISRestoreIndices(is,&rows);
1709 
1710   starts[0] = 0;
1711   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1712   count = 0;
1713   for ( i=0; i<size; i++ ) {
1714     if (procs[i]) {
1715       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1716     }
1717   }
1718   PetscFree(starts);
1719 
1720   base = owners[rank]*bs;
1721 
1722   /*  wait on receives */
1723   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1724   source = lens + nrecvs;
1725   count  = nrecvs; slen = 0;
1726   while (count) {
1727     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1728     /* unpack receives into our local space */
1729     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1730     source[imdex]  = recv_status.MPI_SOURCE;
1731     lens[imdex]  = n;
1732     slen += n;
1733     count--;
1734   }
1735   PetscFree(recv_waits);
1736 
1737   /* gather the received row indices into a single local list */
1738   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1739   count = 0;
1740   for ( i=0; i<nrecvs; i++ ) {
1741     values = rvalues + i*nmax;
1742     for ( j=0; j<lens[i]; j++ ) {
1743       lrows[count++] = values[j] - base;
1744     }
1745   }
1746   PetscFree(rvalues); PetscFree(lens);
1747   PetscFree(owner); PetscFree(nprocs);
1748 
1749   /* actually zap the local rows */
1750   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1751   PLogObjectParent(A,istmp);
1752 
1753   /*
1754         Zero the required rows. If the "diagonal block" of the matrix
1755      is square and the user wishes to set the diagonal we use separate
1756      code so that MatSetValues() is not called for each diagonal entry, which
1757      would allocate new memory on each call, causing many mallocs and slowing things down.
1758 
1759        Contributed by: Mathew Knepley
1760   */
1761   localdiag = PETSC_FALSE;
1762   if (diag && (l->A->M == l->A->N)) {
1763     localdiag = PETSC_TRUE;
1764     ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
1765   } else {
1766     ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1767   }
1768   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1769   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1770 
1771   if (diag && (localdiag == PETSC_FALSE)) {
1772     for ( i = 0; i < slen; i++ ) {
1773       row = lrows[i] + rstart_bs;
1774       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1775     }
1776     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1777     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1778   }
1779   PetscFree(lrows);
1780 
1781   /* wait on sends */
1782   if (nsends) {
1783     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1784     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1785     PetscFree(send_status);
1786   }
1787   PetscFree(send_waits); PetscFree(svalues);
1788 
1789   PetscFunctionReturn(0);
1790 }
1791 
1792 extern int MatPrintHelp_SeqBAIJ(Mat);
1793 #undef __FUNC__
1794 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1795 int MatPrintHelp_MPIBAIJ(Mat A)
1796 {
1797   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1798   MPI_Comm    comm = A->comm;
1799   static int  called = 0;
1800   int         ierr;
1801 
1802   PetscFunctionBegin;
1803   if (!a->rank) {
1804     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1805   }
1806   if (called) {PetscFunctionReturn(0);} else called = 1;
1807   (*PetscHelpPrintf)(comm," Options for MATMPIBAIJ matrix format (the defaults):\n");
1808   (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");
1809   PetscFunctionReturn(0);
1810 }
1811 
1812 #undef __FUNC__
1813 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1814 int MatSetUnfactored_MPIBAIJ(Mat A)
1815 {
1816   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1817   int         ierr;
1818 
1819   PetscFunctionBegin;
1820   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1821   PetscFunctionReturn(0);
1822 }
1823 
1824 static int MatDuplicate_MPIBAIJ(Mat,MatDuplicateOption,Mat *);
1825 
1826 /* -------------------------------------------------------------------*/
1827 static struct _MatOps MatOps_Values = {
1828   MatSetValues_MPIBAIJ,
1829   MatGetRow_MPIBAIJ,
1830   MatRestoreRow_MPIBAIJ,
1831   MatMult_MPIBAIJ,
1832   MatMultAdd_MPIBAIJ,
1833   MatMultTrans_MPIBAIJ,
1834   MatMultTransAdd_MPIBAIJ,
1835   0,
1836   0,
1837   0,
1838   0,
1839   0,
1840   0,
1841   0,
1842   MatTranspose_MPIBAIJ,
1843   MatGetInfo_MPIBAIJ,
1844   0,
1845   MatGetDiagonal_MPIBAIJ,
1846   MatDiagonalScale_MPIBAIJ,
1847   MatNorm_MPIBAIJ,
1848   MatAssemblyBegin_MPIBAIJ,
1849   MatAssemblyEnd_MPIBAIJ,
1850   0,
1851   MatSetOption_MPIBAIJ,
1852   MatZeroEntries_MPIBAIJ,
1853   MatZeroRows_MPIBAIJ,
1854   0,
1855   0,
1856   0,
1857   0,
1858   MatGetSize_MPIBAIJ,
1859   MatGetLocalSize_MPIBAIJ,
1860   MatGetOwnershipRange_MPIBAIJ,
1861   0,
1862   0,
1863   0,
1864   0,
1865   MatDuplicate_MPIBAIJ,
1866   0,
1867   0,
1868   0,
1869   0,
1870   0,
1871   MatGetSubMatrices_MPIBAIJ,
1872   MatIncreaseOverlap_MPIBAIJ,
1873   MatGetValues_MPIBAIJ,
1874   0,
1875   MatPrintHelp_MPIBAIJ,
1876   MatScale_MPIBAIJ,
1877   0,
1878   0,
1879   0,
1880   MatGetBlockSize_MPIBAIJ,
1881   0,
1882   0,
1883   0,
1884   0,
1885   0,
1886   0,
1887   MatSetUnfactored_MPIBAIJ,
1888   0,
1889   MatSetValuesBlocked_MPIBAIJ,
1890   0,
1891   0,
1892   0,
1893   MatGetMaps_Petsc};
1894 
1895 #include "pc.h"
1896 EXTERN_C_BEGIN
1897 extern int PCSetUp_BJacobi_BAIJ(PC);
1898 EXTERN_C_END
1899 
1900 #undef __FUNC__
1901 #define __FUNC__ "MatCreateMPIBAIJ"
1902 /*@C
1903    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1904    (block compressed row).  For good matrix assembly performance
1905    the user should preallocate the matrix storage by setting the parameters
1906    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1907    performance can be increased by more than a factor of 50.
1908 
1909    Collective on MPI_Comm
1910 
1911    Input Parameters:
1912 +  comm - MPI communicator
1913 .  bs   - size of block
1914 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1915            This value should be the same as the local size used in creating the
1916            y vector for the matrix-vector product y = Ax.
1917 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1918            This value should be the same as the local size used in creating the
1919            x vector for the matrix-vector product y = Ax.
1920 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1921 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1922 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1923            submatrix  (same for all local rows)
1924 .  d_nnz - array containing the number of block nonzeros in the various block rows
1925            in the diagonal portion of the local submatrix (possibly different for each block
1926            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1927 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1928            submatrix (same for all local block rows).
1929 -  o_nnz - array containing the number of block nonzeros in the various block rows of the
1930            off-diagonal portion of the local submatrix (possibly different for
1931            each block row) or PETSC_NULL.
1932 
1933    Output Parameter:
1934 .  A - the matrix
1935 
1936    Options Database Keys:
1937 .   -mat_no_unroll - uses code that does not unroll the loops in the
1938                      block calculations (much slower)
1939 .   -mat_block_size - size of the blocks to use
1940 .   -mat_mpi - use the parallel matrix data structures even on one processor
1941                (defaults to using SeqBAIJ format on one processor)
1942 
1943    Notes:
1944    The user MUST specify either the local or global matrix dimensions
1945    (possibly both).
1946 
1947    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor,
1948    then it must be used on all processors that share the object for that argument.
1949 
1950    Storage Information:
1951    For a square global matrix we define each processor's diagonal portion
1952    to be its local rows and the corresponding columns (a square submatrix);
1953    each processor's off-diagonal portion encompasses the remainder of the
1954    local matrix (a rectangular submatrix).
1955 
1956    The user can specify preallocated storage for the diagonal part of
1957    the local submatrix with either d_nz or d_nnz (not both).  Set
1958    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1959    memory allocation.  Likewise, specify preallocated storage for the
1960    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1961 
1962    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1963    the figure below we depict these three local rows and all columns (0-11).
1964 
1965 .vb
1966            0 1 2 3 4 5 6 7 8 9 10 11
1967           -------------------
1968    row 3  |  o o o d d d o o o o o o
1969    row 4  |  o o o d d d o o o o o o
1970    row 5  |  o o o d d d o o o o o o
1971           -------------------
1972 .ve
1973 
1974    Thus, any entries in the d locations are stored in the d (diagonal)
1975    submatrix, and any entries in the o locations are stored in the
1976    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1977    stored simply in the MATSEQBAIJ format for compressed row storage.
1978 
1979    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1980    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1981    In general, for PDE problems in which most nonzeros are near the diagonal,
1982    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1983    or you will get TERRIBLE performance; see the users' manual chapter on
1984    matrices.
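
     For illustration only (the preallocation values below are arbitrary and the surrounding
     program, including PetscInitialize(), is assumed), a square matrix with block size 2 and
     10 block rows per process could be created and assembled as follows
.vb
      Mat A;
      int ierr, bs = 2, m = 10*bs;
      ierr = MatCreateMPIBAIJ(PETSC_COMM_WORLD,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,
                              3,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);
      /* ... set entries with MatSetValues() or MatSetValuesBlocked() ... */
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
.ve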
1985 
1986 .keywords: matrix, block, aij, compressed row, sparse, parallel
1987 
1988 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatCreateMPIAIJ()
1989 @*/
1990 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1991                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1992 {
1993   Mat          B;
1994   Mat_MPIBAIJ  *b;
1995   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size,flg;
1996   int          flag1 = 0,flag2 = 0;
1997 
1998   PetscFunctionBegin;
1999   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
2000 
2001   MPI_Comm_size(comm,&size);
2002   ierr = OptionsHasName(PETSC_NULL,"-mat_mpibaij",&flag1); CHKERRQ(ierr);
2003   ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
2004   if (!flag1 && !flag2 && size == 1) {
2005     if (M == PETSC_DECIDE) M = m;
2006     if (N == PETSC_DECIDE) N = n;
2007     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
2008     PetscFunctionReturn(0);
2009   }
2010 
2011   *A = 0;
2012   PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",comm,MatDestroy,MatView);
2013   PLogObjectCreate(B);
2014   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
2015   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
2016   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));
2017 
2018   B->ops->destroy    = MatDestroy_MPIBAIJ;
2019   B->ops->view       = MatView_MPIBAIJ;
2020   B->mapping    = 0;
2021   B->factor     = 0;
2022   B->assembled  = PETSC_FALSE;
2023 
2024   B->insertmode = NOT_SET_VALUES;
2025   MPI_Comm_rank(comm,&b->rank);
2026   MPI_Comm_size(comm,&b->size);
2027 
2028   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
2029     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
2030   }
2031   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
2032     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
2033   }
2034   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
2035     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
2036   }
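       /* if both the local and global sizes were specified, drop the global sizes;
          the local sizes take precedence and the global sizes are recomputed below */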
2037   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
2038   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
2039 
2040   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
2041     work[0] = m; work[1] = n;
2042     mbs = m/bs; nbs = n/bs;
2043     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
2044     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
2045     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
2046   }
2047   if (m == PETSC_DECIDE) {
2048     Mbs = M/bs;
2049     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
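         /* distribute the block rows as evenly as possible; the first Mbs % size
            processes each receive one extra block row */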
2050     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
2051     m   = mbs*bs;
2052   }
2053   if (n == PETSC_DECIDE) {
2054     Nbs = N/bs;
2055     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
2056     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
2057     n   = nbs*bs;
2058   }
2059   if (mbs*bs != m || nbs*bs != n) {
2060     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
2061   }
2062 
2063   b->m = m; B->m = m;
2064   b->n = n; B->n = n;
2065   b->N = N; B->N = N;
2066   b->M = M; B->M = M;
2067   b->bs  = bs;
2068   b->bs2 = bs*bs;
2069   b->mbs = mbs;
2070   b->nbs = nbs;
2071   b->Mbs = Mbs;
2072   b->Nbs = Nbs;
2073 
2074   /* the information in the maps duplicates the information computed below, eventually
2075      we should remove the duplicate information that is not contained in the maps */
2076   ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
2077   ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);
2078 
2079   /* build local table of row and column ownerships */
2080   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
2081   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2082   b->cowners = b->rowners + b->size + 2;
2083   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2084   b->rowners[0] = 0;
2085   for ( i=2; i<=b->size; i++ ) {
2086     b->rowners[i] += b->rowners[i-1];
2087   }
2088   b->rstart    = b->rowners[b->rank];
2089   b->rend      = b->rowners[b->rank+1];
2090   b->rstart_bs = b->rstart * bs;
2091   b->rend_bs   = b->rend * bs;
2092 
2093   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2094   b->cowners[0] = 0;
2095   for ( i=2; i<=b->size; i++ ) {
2096     b->cowners[i] += b->cowners[i-1];
2097   }
2098   b->cstart    = b->cowners[b->rank];
2099   b->cend      = b->cowners[b->rank+1];
2100   b->cstart_bs = b->cstart * bs;
2101   b->cend_bs   = b->cend * bs;
2102 
2103 
2104   if (d_nz == PETSC_DEFAULT) d_nz = 5;
2105   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
2106   PLogObjectParent(B,b->A);
2107   if (o_nz == PETSC_DEFAULT) o_nz = 0;
2108   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
2109   PLogObjectParent(B,b->B);
2110 
2111   /* build cache for off array entries formed */
2112   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
2113   b->donotstash  = 0;
2114   b->colmap      = 0;
2115   b->garray      = 0;
2116   b->roworiented = 1;
2117 
2118   /* stuff used in block assembly */
2119   b->barray       = 0;
2120 
2121   /* stuff used for matrix vector multiply */
2122   b->lvec         = 0;
2123   b->Mvctx        = 0;
2124 
2125   /* stuff for MatGetRow() */
2126   b->rowindices   = 0;
2127   b->rowvalues    = 0;
2128   b->getrowactive = PETSC_FALSE;
2129 
2130   /* hash table stuff */
2131   b->ht           = 0;
2132   b->hd           = 0;
2133   b->ht_size      = 0;
2134   b->ht_flag      = 0;
2135   b->ht_fact      = 0;
2136   b->ht_total_ct  = 0;
2137   b->ht_insert_ct = 0;
2138 
2139   *A = B;
2140   ierr = OptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg); CHKERRQ(ierr);
2141   if (flg) {
2142     double fact = 1.39;
2143     ierr = MatSetOption(B,MAT_USE_HASH_TABLE); CHKERRQ(ierr);
2144     ierr = OptionsGetDouble(PETSC_NULL,"-mat_use_hash_table",&fact,&flg); CHKERRQ(ierr);
2145     if (fact <= 1.0) fact = 1.39;
2146     ierr = MatMPIBAIJSetHashTableFactor(B,fact); CHKERRQ(ierr);
2147     PLogInfo(0,"MatCreateMPIBAIJ:Hash table Factor used %5.2f\n",fact);
2148   }
2149   ierr = PetscObjectComposeFunction((PetscObject)B,"PCSetUp_BJacobi_C",
2150                                      "PCSetUp_BJacobi_BAIJ",
2151                                      (void*)PCSetUp_BJacobi_BAIJ);CHKERRQ(ierr);
2152   PetscFunctionReturn(0);
2153 }
2154 
2155 #undef __FUNC__
2156 #define __FUNC__ "MatDuplicate_MPIBAIJ"
2157 static int MatDuplicate_MPIBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
2158 {
2159   Mat         mat;
2160   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
2161   int         ierr, len=0, flg;
2162 
2163   PetscFunctionBegin;
2164   *newmat       = 0;
2165   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",matin->comm,MatDestroy,MatView);
2166   PLogObjectCreate(mat);
2167   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
2168   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
2169   mat->ops->destroy    = MatDestroy_MPIBAIJ;
2170   mat->ops->view       = MatView_MPIBAIJ;
2171   mat->factor          = matin->factor;
2172   mat->assembled       = PETSC_TRUE;
2173 
2174   a->m = mat->m   = oldmat->m;
2175   a->n = mat->n   = oldmat->n;
2176   a->M = mat->M   = oldmat->M;
2177   a->N = mat->N   = oldmat->N;
2178 
2179   a->bs  = oldmat->bs;
2180   a->bs2 = oldmat->bs2;
2181   a->mbs = oldmat->mbs;
2182   a->nbs = oldmat->nbs;
2183   a->Mbs = oldmat->Mbs;
2184   a->Nbs = oldmat->Nbs;
2185 
2186   a->rstart       = oldmat->rstart;
2187   a->rend         = oldmat->rend;
2188   a->cstart       = oldmat->cstart;
2189   a->cend         = oldmat->cend;
2190   a->size         = oldmat->size;
2191   a->rank         = oldmat->rank;
2192   mat->insertmode = NOT_SET_VALUES;
2193   a->rowvalues    = 0;
2194   a->getrowactive = PETSC_FALSE;
2195   a->barray       = 0;
2196 
2197   /* hash table stuff */
2198   a->ht           = 0;
2199   a->hd           = 0;
2200   a->ht_size      = 0;
2201   a->ht_flag      = oldmat->ht_flag;
2202   a->ht_fact      = oldmat->ht_fact;
2203   a->ht_total_ct  = 0;
2204   a->ht_insert_ct = 0;
2205 
2206 
2207   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2208   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2209   a->cowners = a->rowners + a->size + 2;
2210   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2211   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2212   if (oldmat->colmap) {
2213 #if defined (USE_CTABLE)
2214   ierr = TableCreateCopy( &a->colmap,oldmat->colmap ); CHKERRQ(ierr);
2215 #else
2216     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2217     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2218     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2219 #endif
2220   } else a->colmap = 0;
2221   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2222     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2223     PLogObjectMemory(mat,len*sizeof(int));
2224     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2225   } else a->garray = 0;
2226 
2227   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2228   PLogObjectParent(mat,a->lvec);
2229   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2230 
2231   PLogObjectParent(mat,a->Mvctx);
2232   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
2233   PLogObjectParent(mat,a->A);
2234   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
2235   PLogObjectParent(mat,a->B);
2236   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2237   ierr = FListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
2238   if (flg) {
2239     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2240   }
2241   *newmat = mat;
2242   PetscFunctionReturn(0);
2243 }
2244 
2245 #include "sys.h"
2246 
2247 #undef __FUNC__
2248 #define __FUNC__ "MatLoad_MPIBAIJ"
2249 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
2250 {
2251   Mat          A;
2252   int          i, nz, ierr, j,rstart, rend, fd;
2253   Scalar       *vals,*buf;
2254   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2255   MPI_Status   status;
2256   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2257   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
2258   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2259   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2260   int          dcount,kmax,k,nzcount,tmp;
2261 
2262   PetscFunctionBegin;
2263   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2264 
2265   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2266   if (!rank) {
2267     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2268     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2269     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2270     if (header[3] < 0) {
2271       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2272     }
2273   }
2274 
2275   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2276   M = header[1]; N = header[2];
2277 
2278   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2279 
2280   /*
2281      This code adds extra rows to make sure the number of rows is
2282      divisible by the blocksize
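          For example, with bs=3 and M=10, two extra rows are added (Mbs becomes 4);
          each padded row later receives a single diagonal entry of 1.0.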
2283   */
2284   Mbs        = M/bs;
2285   extra_rows = bs - M + bs*(Mbs);
2286   if (extra_rows == bs) extra_rows = 0;
2287   else                  Mbs++;
2288   if (extra_rows &&!rank) {
2289     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2290   }
2291 
2292   /* determine ownership of all rows */
2293   mbs = Mbs/size + ((Mbs % size) > rank);
2294   m   = mbs * bs;
2295   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2296   browners = rowners + size + 1;
2297   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2298   rowners[0] = 0;
2299   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2300   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2301   rstart = rowners[rank];
2302   rend   = rowners[rank+1];
2303 
2304   /* distribute row lengths to all processors */
2305   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2306   if (!rank) {
2307     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2308     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2309     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2310     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2311     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2312     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2313     PetscFree(sndcounts);
2314   } else {
2315     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2316   }
2317 
2318   if (!rank) {
2319     /* calculate the number of nonzeros on each processor */
2320     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2321     PetscMemzero(procsnz,size*sizeof(int));
2322     for ( i=0; i<size; i++ ) {
2323       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2324         procsnz[i] += rowlengths[j];
2325       }
2326     }
2327     PetscFree(rowlengths);
2328 
2329     /* determine max buffer needed and allocate it */
2330     maxnz = 0;
2331     for ( i=0; i<size; i++ ) {
2332       maxnz = PetscMax(maxnz,procsnz[i]);
2333     }
2334     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2335 
2336     /* read in my part of the matrix column indices  */
2337     nz = procsnz[0];
2338     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2339     mycols = ibuf;
2340     if (size == 1)  nz -= extra_rows;
2341     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2342     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2343 
2344     /* read in the parts for all the other processors (except the last one) and ship them off */
2345     for ( i=1; i<size-1; i++ ) {
2346       nz   = procsnz[i];
2347       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2348       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2349     }
2350     /* read in the stuff for the last proc */
2351     if ( size != 1 ) {
2352       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2353       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2354       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2355       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2356     }
2357     PetscFree(cols);
2358   } else {
2359     /* determine buffer space needed for message */
2360     nz = 0;
2361     for ( i=0; i<m; i++ ) {
2362       nz += locrowlens[i];
2363     }
2364     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2365     mycols = ibuf;
2366     /* receive message of column indices*/
2367     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2368     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2369     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2370   }
2371 
2372   /* loop over local rows, determining number of off diagonal entries */
2373   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2374   odlens = dlens + (rend-rstart);
2375   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2376   PetscMemzero(mask,3*Mbs*sizeof(int));
2377   masked1 = mask    + Mbs;
2378   masked2 = masked1 + Mbs;
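       /* mask[] flags the block columns already seen in the current block row; masked1[]
          records the flagged columns that fall in the diagonal block and masked2[] the
          off-diagonal ones, so only the flags actually set are cleared after each block row */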
2379   rowcount = 0; nzcount = 0;
2380   for ( i=0; i<mbs; i++ ) {
2381     dcount  = 0;
2382     odcount = 0;
2383     for ( j=0; j<bs; j++ ) {
2384       kmax = locrowlens[rowcount];
2385       for ( k=0; k<kmax; k++ ) {
2386         tmp = mycols[nzcount++]/bs;
2387         if (!mask[tmp]) {
2388           mask[tmp] = 1;
2389           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2390           else masked1[dcount++] = tmp;
2391         }
2392       }
2393       rowcount++;
2394     }
2395 
2396     dlens[i]  = dcount;
2397     odlens[i] = odcount;
2398 
2399     /* zero out the mask elements we set */
2400     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2401     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2402   }
2403 
2404   /* create our matrix */
2405   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2406          CHKERRQ(ierr);
2407   A = *newmat;
2408   MatSetOption(A,MAT_COLUMNS_SORTED);
2409 
2410   if (!rank) {
2411     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2412     /* read in my part of the matrix numerical values  */
2413     nz = procsnz[0];
2414     vals = buf;
2415     mycols = ibuf;
2416     if (size == 1)  nz -= extra_rows;
2417     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2418     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2419 
2420     /* insert into matrix */
2421     jj      = rstart*bs;
2422     for ( i=0; i<m; i++ ) {
2423       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2424       mycols += locrowlens[i];
2425       vals   += locrowlens[i];
2426       jj++;
2427     }
2428     /* read in other processors (except the last one) and ship out */
2429     for ( i=1; i<size-1; i++ ) {
2430       nz   = procsnz[i];
2431       vals = buf;
2432       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2433       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2434     }
2435     /* the last proc */
2436     if ( size != 1 ){
2437       nz   = procsnz[i] - extra_rows;
2438       vals = buf;
2439       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2440       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2441       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2442     }
2443     PetscFree(procsnz);
2444   } else {
2445     /* receive numeric values */
2446     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2447 
2448     /* receive message of values*/
2449     vals   = buf;
2450     mycols = ibuf;
2451     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2452     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2453     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2454 
2455     /* insert into matrix */
2456     jj      = rstart*bs;
2457     for ( i=0; i<m; i++ ) {
2458       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2459       mycols += locrowlens[i];
2460       vals   += locrowlens[i];
2461       jj++;
2462     }
2463   }
2464   PetscFree(locrowlens);
2465   PetscFree(buf);
2466   PetscFree(ibuf);
2467   PetscFree(rowners);
2468   PetscFree(dlens);
2469   PetscFree(mask);
2470   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2471   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2472   PetscFunctionReturn(0);
2473 }
2474 
2475 
2476 
2477 #undef __FUNC__
2478 #define __FUNC__ "MatMPIBAIJSetHashTableFactor"
2479 /*@
2480    MatMPIBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2481 
2482    Collective on Mat
2483 
2484    Input Parameters:
2485 +  mat  - the matrix
2486 -  fact - the factor used in computing the size of the hash table (should be greater than 1.0)
2487 
2488    Notes:
2489    This can also be set by the command line option: -mat_use_hash_table <fact>
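
     For example (the value 1.6 below is arbitrary):
.vb
      ierr = MatMPIBAIJSetHashTableFactor(A,1.6); CHKERRQ(ierr);
.ve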
2490 
2491 .keywords: matrix, hashtable, factor, HT
2492 
2493 .seealso: MatSetOption()
2494 @*/
2495 int MatMPIBAIJSetHashTableFactor(Mat mat,double fact)
2496 {
2497   Mat_MPIBAIJ *baij;
2498 
2499   PetscFunctionBegin;
2500   PetscValidHeaderSpecific(mat,MAT_COOKIE);
2501   if (mat->type != MATMPIBAIJ) {
2502       SETERRQ(PETSC_ERR_ARG_WRONG,1,"Incorrect matrix type. Use MPIBAIJ only.");
2503   }
2504   baij = (Mat_MPIBAIJ*) mat->data;
2505   baij->ht_fact = fact;
2506   PetscFunctionReturn(0);
2507 }
2508