xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 48e59246f5401e5a7adbba88193a2a53d0c4f8b6)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.140 1999/01/08 16:55:04 balay Exp balay $";
3 #endif
4 
5 #include "src/mat/impls/baij/mpi/mpibaij.h"   /*I  "mat.h"  I*/
6 #include "src/vec/vecimpl.h"
7 
8 extern int MatSetUpMultiply_MPIBAIJ(Mat);
9 extern int DisAssemble_MPIBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
12 extern int MatGetValues_SeqBAIJ(Mat,int,int *,int,int *,Scalar *);
13 
14 /*
15      Local utility routine that creates a mapping from the global column
16    number to the local number in the off-diagonal part of the local
17    storage of the matrix.  This is done in a non scable way since the
18    length of colmap equals the global matrix length.
19 */
20 #undef __FUNC__
21 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
22 static int CreateColmap_MPIBAIJ_Private(Mat mat)
23 {
24   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
25   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
26   int         nbs = B->nbs,i,bs=B->bs,ierr;
27 
28   PetscFunctionBegin;
29 #if defined (USE_CTABLE)
30   ierr = TableCreate( &baij->colmap, baij->nbs/5 ); CHKERRQ(ierr);
31   for ( i=0; i<nbs; i++ ){
32     ierr = TableAdd( baij->colmap, baij->garray[i] + 1, i*bs+1 );CHKERRQ(ierr);
33   }
34 #else
35   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
36   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
37   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
38   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
39 #endif
40   PetscFunctionReturn(0);
41 }
42 
43 #define CHUNKSIZE  10
44 
45 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
46 { \
47  \
48     brow = row/bs;  \
49     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
50     rmax = aimax[brow]; nrow = ailen[brow]; \
51       bcol = col/bs; \
52       ridx = row % bs; cidx = col % bs; \
53       low = 0; high = nrow; \
54       while (high-low > 3) { \
55         t = (low+high)/2; \
56         if (rp[t] > bcol) high = t; \
57         else              low  = t; \
58       } \
59       for ( _i=low; _i<high; _i++ ) { \
60         if (rp[_i] > bcol) break; \
61         if (rp[_i] == bcol) { \
62           bap  = ap +  bs2*_i + bs*cidx + ridx; \
63           if (addv == ADD_VALUES) *bap += value;  \
64           else                    *bap  = value;  \
65           goto a_noinsert; \
66         } \
67       } \
68       if (a->nonew == 1) goto a_noinsert; \
69       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
70       if (nrow >= rmax) { \
71         /* there is no extra room in row, therefore enlarge */ \
72         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
73         Scalar *new_a; \
74  \
75         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
76  \
77         /* malloc new storage space */ \
78         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
79         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
80         new_j   = (int *) (new_a + bs2*new_nz); \
81         new_i   = new_j + new_nz; \
82  \
83         /* copy over old data into new slots */ \
84         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
85         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
86         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
87         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
88         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
89                                                            len*sizeof(int)); \
90         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
91         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
92         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
93                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
94         /* free up old matrix storage */ \
95         PetscFree(a->a);  \
96         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
97         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
98         a->singlemalloc = 1; \
99  \
100         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
101         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
102         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
103         a->maxnz += bs2*CHUNKSIZE; \
104         a->reallocs++; \
105         a->nz++; \
106       } \
107       N = nrow++ - 1;  \
108       /* shift up all the later entries in this row */ \
109       for ( ii=N; ii>=_i; ii-- ) { \
110         rp[ii+1] = rp[ii]; \
111         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
112       } \
113       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
114       rp[_i]                      = bcol;  \
115       ap[bs2*_i + bs*cidx + ridx] = value;  \
116       a_noinsert:; \
117     ailen[brow] = nrow; \
118 }
119 
120 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
121 { \
122  \
123     brow = row/bs;  \
124     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
125     rmax = bimax[brow]; nrow = bilen[brow]; \
126       bcol = col/bs; \
127       ridx = row % bs; cidx = col % bs; \
128       low = 0; high = nrow; \
129       while (high-low > 3) { \
130         t = (low+high)/2; \
131         if (rp[t] > bcol) high = t; \
132         else              low  = t; \
133       } \
134       for ( _i=low; _i<high; _i++ ) { \
135         if (rp[_i] > bcol) break; \
136         if (rp[_i] == bcol) { \
137           bap  = ap +  bs2*_i + bs*cidx + ridx; \
138           if (addv == ADD_VALUES) *bap += value;  \
139           else                    *bap  = value;  \
140           goto b_noinsert; \
141         } \
142       } \
143       if (b->nonew == 1) goto b_noinsert; \
144       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
145       if (nrow >= rmax) { \
146         /* there is no extra room in row, therefore enlarge */ \
147         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
148         Scalar *new_a; \
149  \
150         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
151  \
152         /* malloc new storage space */ \
153         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
154         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
155         new_j   = (int *) (new_a + bs2*new_nz); \
156         new_i   = new_j + new_nz; \
157  \
158         /* copy over old data into new slots */ \
159         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
160         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
161         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
162         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
163         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
164                                                            len*sizeof(int)); \
165         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
166         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
167         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
168                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
169         /* free up old matrix storage */ \
170         PetscFree(b->a);  \
171         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
172         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
173         b->singlemalloc = 1; \
174  \
175         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
176         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
177         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
178         b->maxnz += bs2*CHUNKSIZE; \
179         b->reallocs++; \
180         b->nz++; \
181       } \
182       N = nrow++ - 1;  \
183       /* shift up all the later entries in this row */ \
184       for ( ii=N; ii>=_i; ii-- ) { \
185         rp[ii+1] = rp[ii]; \
186         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
187       } \
188       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
189       rp[_i]                      = bcol;  \
190       ap[bs2*_i + bs*cidx + ridx] = value;  \
191       b_noinsert:; \
192     bilen[brow] = nrow; \
193 }
194 
195 #undef __FUNC__
196 #define __FUNC__ "MatSetValues_MPIBAIJ"
197 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
198 {
199   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
200   Scalar      value;
201   int         ierr,i,j,row,col;
202   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
203   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
204   int         cend_orig=baij->cend_bs,bs=baij->bs;
205 
206   /* Some Variables required in the macro */
207   Mat         A = baij->A;
208   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
209   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
210   Scalar      *aa=a->a;
211 
212   Mat         B = baij->B;
213   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
214   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
215   Scalar      *ba=b->a;
216 
217   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
218   int         low,high,t,ridx,cidx,bs2=a->bs2;
219   Scalar      *ap,*bap;
220 
221   PetscFunctionBegin;
222   for ( i=0; i<m; i++ ) {
223 #if defined(USE_PETSC_BOPT_g)
224     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
225     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
226 #endif
227     if (im[i] >= rstart_orig && im[i] < rend_orig) {
228       row = im[i] - rstart_orig;
229       for ( j=0; j<n; j++ ) {
230         if (in[j] >= cstart_orig && in[j] < cend_orig){
231           col = in[j] - cstart_orig;
232           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
233           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
234           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
235         }
236 #if defined(USE_PETSC_BOPT_g)
237         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
238         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
239 #endif
240         else {
241           if (mat->was_assembled) {
242             if (!baij->colmap) {
243               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
244             }
245 #if defined (USE_CTABLE)
246 	    col = TableFind( baij->colmap, in[j]/bs + 1 ) - 1 + in[j]%bs;
247 #else
248             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
249 #endif
250             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
251               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
252               col =  in[j];
253               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
254               B = baij->B;
255               b = (Mat_SeqBAIJ *) (B)->data;
256               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
257               ba=b->a;
258             }
259           } else col = in[j];
260           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
261           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
262           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
263         }
264       }
265     } else {
266       if (roworiented && !baij->donotstash) {
267         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
268       } else {
269         if (!baij->donotstash) {
270           row = im[i];
271 	  for ( j=0; j<n; j++ ) {
272 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
273           }
274         }
275       }
276     }
277   }
278   PetscFunctionReturn(0);
279 }
280 
281 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
282 #undef __FUNC__
283 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
284 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
285 {
286   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
287   Scalar      *value,*barray=baij->barray;
288   int         ierr,i,j,ii,jj,row,col,k,l;
289   int         roworiented = baij->roworiented,rstart=baij->rstart ;
290   int         rend=baij->rend,cstart=baij->cstart,stepval;
291   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
292 
293   if(!barray) {
294     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
295   }
296 
297   if (roworiented) {
298     stepval = (n-1)*bs;
299   } else {
300     stepval = (m-1)*bs;
301   }
302   for ( i=0; i<m; i++ ) {
303 #if defined(USE_PETSC_BOPT_g)
304     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
305     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
306 #endif
307     if (im[i] >= rstart && im[i] < rend) {
308       row = im[i] - rstart;
309       for ( j=0; j<n; j++ ) {
310         /* If NumCol = 1 then a copy is not required */
311         if ((roworiented) && (n == 1)) {
312           barray = v + i*bs2;
313         } else if((!roworiented) && (m == 1)) {
314           barray = v + j*bs2;
315         } else { /* Here a copy is required */
316           if (roworiented) {
317             value = v + i*(stepval+bs)*bs + j*bs;
318           } else {
319             value = v + j*(stepval+bs)*bs + i*bs;
320           }
321           for ( ii=0; ii<bs; ii++,value+=stepval ) {
322             for (jj=0; jj<bs; jj++ ) {
323               *barray++  = *value++;
324             }
325           }
326           barray -=bs2;
327         }
328 
329         if (in[j] >= cstart && in[j] < cend){
330           col  = in[j] - cstart;
331           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
332         }
333 #if defined(USE_PETSC_BOPT_g)
334         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
335         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
336 #endif
337         else {
338           if (mat->was_assembled) {
339             if (!baij->colmap) {
340               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
341             }
342 
343 #if defined(USE_PETSC_BOPT_g)
344 #if defined (USE_CTABLE)
345             if( (TableFind( baij->colmap, in[j] + 1 ) - 1) % bs)
346 	      {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
347 #else
348             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
349 #endif
350 #endif
351 #if defined (USE_CTABLE)
352 	    col = (TableFind( baij->colmap, in[j] + 1 ) - 1)/bs;
353 #else
354             col = (baij->colmap[in[j]] - 1)/bs;
355 #endif
356             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
357               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
358               col =  in[j];
359             }
360           }
361           else col = in[j];
362           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
363         }
364       }
365     } else {
366       if (!baij->donotstash) {
367         if (roworiented ) {
368           row   = im[i]*bs;
369           value = v + i*(stepval+bs)*bs;
370           for ( j=0; j<bs; j++,row++ ) {
371             for ( k=0; k<n; k++ ) {
372               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
373                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
374               }
375             }
376           }
377         } else {
378           for ( j=0; j<n; j++ ) {
379             value = v + j*(stepval+bs)*bs + i*bs;
380             col   = in[j]*bs;
381             for ( k=0; k<bs; k++,col++,value+=stepval) {
382               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
383                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
384               }
385             }
386           }
387         }
388       }
389     }
390   }
391   PetscFunctionReturn(0);
392 }
393 #define HASH_KEY 0.6180339887
394 /* #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
395 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
396 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
397 #undef __FUNC__
398 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
399 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
400 {
401   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
402   int         ierr,i,j,row,col;
403   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
404   int         rend_orig=baij->rend_bs,Nbs=baij->Nbs;
405   int         h1,key,size=baij->ht_size,bs=baij->bs,*HT=baij->ht,idx;
406   double      tmp;
407   Scalar      ** HD = baij->hd,value;
408 #if defined(USE_PETSC_BOPT_g)
409   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
410 #endif
411 
412   PetscFunctionBegin;
413 
414   for ( i=0; i<m; i++ ) {
415 #if defined(USE_PETSC_BOPT_g)
416     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
417     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
418 #endif
419       row = im[i];
420     if (row >= rstart_orig && row < rend_orig) {
421       for ( j=0; j<n; j++ ) {
422         col = in[j];
423         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
424         /* Look up into the Hash Table */
425         key = (row/bs)*Nbs+(col/bs)+1;
426         h1  = HASH(size,key,tmp);
427 
428 
429         idx = h1;
430 #if defined(USE_PETSC_BOPT_g)
431         insert_ct++;
432         total_ct++;
433         if (HT[idx] != key) {
434           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
435           if (idx == size) {
436             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
437             if (idx == h1) {
438               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
439             }
440           }
441         }
442 #else
443         if (HT[idx] != key) {
444           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
445           if (idx == size) {
446             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
447             if (idx == h1) {
448               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
449             }
450           }
451         }
452 #endif
453         /* A HASH table entry is found, so insert the values at the correct address */
454         if (addv == ADD_VALUES) *(HD[idx]+ (col % bs)*bs + (row % bs)) += value;
455         else                    *(HD[idx]+ (col % bs)*bs + (row % bs))  = value;
456       }
457     } else {
458       if (roworiented && !baij->donotstash) {
459         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
460       } else {
461         if (!baij->donotstash) {
462           row = im[i];
463 	  for ( j=0; j<n; j++ ) {
464 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
465           }
466         }
467       }
468     }
469   }
470 #if defined(USE_PETSC_BOPT_g)
471   baij->ht_total_ct = total_ct;
472   baij->ht_insert_ct = insert_ct;
473 #endif
474   PetscFunctionReturn(0);
475 }
476 
477 #undef __FUNC__
478 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
479 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
480 {
481   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
482   int         ierr,i,j,ii,jj,row,col,k,l;
483   int         roworiented = baij->roworiented,rstart=baij->rstart ;
484   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
485   int         h1,key,size=baij->ht_size,idx,*HT=baij->ht,Nbs=baij->Nbs;
486   double      tmp;
487   Scalar      ** HD = baij->hd,*value,*v_t,*baij_a;
488 #if defined(USE_PETSC_BOPT_g)
489   int         total_ct=baij->ht_total_ct,insert_ct=baij->ht_insert_ct;
490 #endif
491 
492   PetscFunctionBegin;
493 
494   if (roworiented) {
495     stepval = (n-1)*bs;
496   } else {
497     stepval = (m-1)*bs;
498   }
499   for ( i=0; i<m; i++ ) {
500 #if defined(USE_PETSC_BOPT_g)
501     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
502     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
503 #endif
504     row   = im[i];
505     v_t   = v + i*bs2;
506     if (row >= rstart && row < rend) {
507       for ( j=0; j<n; j++ ) {
508         col = in[j];
509 
510         /* Look up into the Hash Table */
511         key = row*Nbs+col+1;
512         h1  = HASH(size,key,tmp);
513 
514         idx = h1;
515 #if defined(USE_PETSC_BOPT_g)
516         total_ct++;
517         insert_ct++;
518        if (HT[idx] != key) {
519           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++,total_ct++);
520           if (idx == size) {
521             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++,total_ct++);
522             if (idx == h1) {
523               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
524             }
525           }
526         }
527 #else
528         if (HT[idx] != key) {
529           for ( idx=h1; (idx<size) && (HT[idx]!=key); idx++);
530           if (idx == size) {
531             for ( idx=0; (idx<h1) && (HT[idx]!=key); idx++);
532             if (idx == h1) {
533               SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
534             }
535           }
536         }
537 #endif
538         baij_a = HD[idx];
539         if (roworiented) {
540           /*value = v + i*(stepval+bs)*bs + j*bs;*/
541           /* value = v + (i*(stepval+bs)+j)*bs; */
542           value = v_t;
543           v_t  += bs;
544           if (addv == ADD_VALUES) {
545             for ( ii=0; ii<bs; ii++,value+=stepval) {
546               for ( jj=ii; jj<bs2; jj+=bs ) {
547                 baij_a[jj]  += *value++;
548               }
549             }
550           } else {
551             for ( ii=0; ii<bs; ii++,value+=stepval) {
552               for ( jj=ii; jj<bs2; jj+=bs ) {
553                 baij_a[jj]  = *value++;
554               }
555             }
556           }
557         } else {
558           value = v + j*(stepval+bs)*bs + i*bs;
559           if (addv == ADD_VALUES) {
560             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
561               for ( jj=0; jj<bs; jj++ ) {
562                 baij_a[jj]  += *value++;
563               }
564             }
565           } else {
566             for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
567               for ( jj=0; jj<bs; jj++ ) {
568                 baij_a[jj]  = *value++;
569               }
570             }
571           }
572         }
573       }
574     } else {
575       if (!baij->donotstash) {
576         if (roworiented ) {
577           row   = im[i]*bs;
578           value = v + i*(stepval+bs)*bs;
579           for ( j=0; j<bs; j++,row++ ) {
580             for ( k=0; k<n; k++ ) {
581               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
582                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
583               }
584             }
585           }
586         } else {
587           for ( j=0; j<n; j++ ) {
588             value = v + j*(stepval+bs)*bs + i*bs;
589             col   = in[j]*bs;
590             for ( k=0; k<bs; k++,col++,value+=stepval) {
591               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
592                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
593               }
594             }
595           }
596         }
597       }
598     }
599   }
600 #if defined(USE_PETSC_BOPT_g)
601   baij->ht_total_ct = total_ct;
602   baij->ht_insert_ct = insert_ct;
603 #endif
604   PetscFunctionReturn(0);
605 }
606 
607 #undef __FUNC__
608 #define __FUNC__ "MatGetValues_MPIBAIJ"
609 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
610 {
611   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
612   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
613   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col,data;
614 
615   PetscFunctionBegin;
616   for ( i=0; i<m; i++ ) {
617     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
618     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
619     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
620       row = idxm[i] - bsrstart;
621       for ( j=0; j<n; j++ ) {
622         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
623         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
624         if (idxn[j] >= bscstart && idxn[j] < bscend){
625           col = idxn[j] - bscstart;
626           ierr = MatGetValues_SeqBAIJ(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
627         } else {
628           if (!baij->colmap) {
629             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
630           }
631 #if defined (USE_CTABLE)
632           data = TableFind( baij->colmap, idxn[j]/bs + 1 ) - 1;
633 #else
634           data = baij->colmap[idxn[j]/bs]-1;
635 #endif
636           if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
637           else {
638             col  = data + idxn[j]%bs;
639             ierr = MatGetValues_SeqBAIJ(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
640           }
641         }
642       }
643     } else {
644       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
645     }
646   }
647  PetscFunctionReturn(0);
648 }
649 
650 #undef __FUNC__
651 #define __FUNC__ "MatNorm_MPIBAIJ"
652 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
653 {
654   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
655   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
656   int        ierr, i,bs2=baij->bs2;
657   double     sum = 0.0;
658   Scalar     *v;
659 
660   PetscFunctionBegin;
661   if (baij->size == 1) {
662     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
663   } else {
664     if (type == NORM_FROBENIUS) {
665       v = amat->a;
666       for (i=0; i<amat->nz*bs2; i++ ) {
667 #if defined(USE_PETSC_COMPLEX)
668         sum += PetscReal(PetscConj(*v)*(*v)); v++;
669 #else
670         sum += (*v)*(*v); v++;
671 #endif
672       }
673       v = bmat->a;
674       for (i=0; i<bmat->nz*bs2; i++ ) {
675 #if defined(USE_PETSC_COMPLEX)
676         sum += PetscReal(PetscConj(*v)*(*v)); v++;
677 #else
678         sum += (*v)*(*v); v++;
679 #endif
680       }
681       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
682       *norm = sqrt(*norm);
683     } else {
684       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
685     }
686   }
687   PetscFunctionReturn(0);
688 }
689 
690 #undef __FUNC__
691 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
692 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
693 {
694   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
695   MPI_Comm    comm = mat->comm;
696   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
697   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
698   MPI_Request *send_waits,*recv_waits;
699   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
700   InsertMode  addv;
701   Scalar      *rvalues,*svalues;
702 
703   PetscFunctionBegin;
704   if (baij->donotstash) {
705     baij->svalues    = 0; baij->rvalues    = 0;
706     baij->nsends     = 0; baij->nrecvs     = 0;
707     baij->send_waits = 0; baij->recv_waits = 0;
708     baij->rmax       = 0;
709     PetscFunctionReturn(0);
710   }
711 
712   /* make sure all processors are either in INSERTMODE or ADDMODE */
713   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
714   if (addv == (ADD_VALUES|INSERT_VALUES)) {
715     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
716   }
717   mat->insertmode = addv; /* in case this processor had no cache */
718 
719   /*  first count number of contributors to each processor */
720   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
721   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
722   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
723   for ( i=0; i<baij->stash.n; i++ ) {
724     idx = baij->stash.idx[i];
725     for ( j=0; j<size; j++ ) {
726       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
727         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
728       }
729     }
730   }
731   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
732 
733   /* inform other processors of number of messages and max length*/
734   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
735   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
736   nreceives = work[rank];
737   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
738   nmax      = work[rank];
739   PetscFree(work);
740 
741   /* post receives:
742        1) each message will consist of ordered pairs
743      (global index,value) we store the global index as a double
744      to simplify the message passing.
745        2) since we don't know how long each individual message is we
746      allocate the largest needed buffer for each receive. Potentially
747      this is a lot of wasted space.
748 
749 
750        This could be done better.
751   */
752   rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
753   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
754   for ( i=0; i<nreceives; i++ ) {
755     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
756   }
757 
758   /* do sends:
759       1) starts[i] gives the starting index in svalues for stuff going to
760          the ith processor
761   */
762   svalues    = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
763   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
764   starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
765   starts[0] = 0;
766   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
767   for ( i=0; i<baij->stash.n; i++ ) {
768     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
769     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
770     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
771   }
772   PetscFree(owner);
773   starts[0] = 0;
774   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
775   count = 0;
776   for ( i=0; i<size; i++ ) {
777     if (procs[i]) {
778       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
779     }
780   }
781   PetscFree(starts); PetscFree(nprocs);
782 
783   /* Free cache space */
784   PLogInfo(baij->A,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
785   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
786 
787   baij->svalues    = svalues;    baij->rvalues    = rvalues;
788   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
789   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
790   baij->rmax       = nmax;
791 
792   PetscFunctionReturn(0);
793 }
794 
795 /*
796   Creates the hash table, and sets the table
797   This table is created only once.
798   If new entried need to be added to the matrix
799   then the hash table has to be destroyed and
800   recreated.
801 */
802 #undef __FUNC__
803 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
804 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
805 {
806   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
807   Mat         A = baij->A, B=baij->B;
808   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
809   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
810   int         size,bs2=baij->bs2,rstart=baij->rstart;
811   int         cstart=baij->cstart,*garray=baij->garray,row,col,Nbs=baij->Nbs;
812   int         *HT,key;
813   Scalar      **HD;
814   double      tmp;
815 #if defined(USE_PETSC_BOPT_g)
816   int         ct=0,max=0;
817 #endif
818 
819   PetscFunctionBegin;
820   baij->ht_size=(int)(factor*nz);
821   size = baij->ht_size;
822 
823   if (baij->ht) {
824     PetscFunctionReturn(0);
825   }
826 
827   /* Allocate Memory for Hash Table */
828   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
829   baij->ht = (int*)(baij->hd + size);
830   HD = baij->hd;
831   HT = baij->ht;
832 
833 
834   PetscMemzero(HD,size*(sizeof(int)+sizeof(Scalar*)));
835 
836 
837   /* Loop Over A */
838   for ( i=0; i<a->mbs; i++ ) {
839     for ( j=ai[i]; j<ai[i+1]; j++ ) {
840       row = i+rstart;
841       col = aj[j]+cstart;
842 
843       key = row*Nbs + col + 1;
844       h1  = HASH(size,key,tmp);
845       for ( k=0; k<size; k++ ){
846         if (HT[(h1+k)%size] == 0.0) {
847           HT[(h1+k)%size] = key;
848           HD[(h1+k)%size] = a->a + j*bs2;
849           break;
850 #if defined(USE_PETSC_BOPT_g)
851         } else {
852           ct++;
853 #endif
854         }
855       }
856 #if defined(USE_PETSC_BOPT_g)
857       if (k> max) max = k;
858 #endif
859     }
860   }
861   /* Loop Over B */
862   for ( i=0; i<b->mbs; i++ ) {
863     for ( j=bi[i]; j<bi[i+1]; j++ ) {
864       row = i+rstart;
865       col = garray[bj[j]];
866       key = row*Nbs + col + 1;
867       h1  = HASH(size,key,tmp);
868       for ( k=0; k<size; k++ ){
869         if (HT[(h1+k)%size] == 0.0) {
870           HT[(h1+k)%size] = key;
871           HD[(h1+k)%size] = b->a + j*bs2;
872           break;
873 #if defined(USE_PETSC_BOPT_g)
874         } else {
875           ct++;
876 #endif
877         }
878       }
879 #if defined(USE_PETSC_BOPT_g)
880       if (k> max) max = k;
881 #endif
882     }
883   }
884 
885   /* Print Summary */
886 #if defined(USE_PETSC_BOPT_g)
887   for ( i=0,j=0; i<size; i++)
888     if (HT[i]) {j++;}
889   PLogInfo(0,"MatCreateHashTable_MPIBAIJ_Private: Average Search = %5.2f,max search = %d\n",
890            (j== 0)? 0.0:((double)(ct+j))/j,max);
891 #endif
892   PetscFunctionReturn(0);
893 }
894 
895 #undef __FUNC__
896 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
897 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
898 {
899   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
900   MPI_Status  *send_status,recv_status;
901   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
902   int         bs=baij->bs,row,col,other_disassembled;
903   Scalar      *values,val;
904   InsertMode  addv = mat->insertmode;
905 
906   PetscFunctionBegin;
907   /*  wait on receives */
908   while (count) {
909     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
910     /* unpack receives into our local space */
911     values = baij->rvalues + 3*imdex*baij->rmax;
912     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
913     n = n/3;
914     for ( i=0; i<n; i++ ) {
915       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
916       col = (int) PetscReal(values[3*i+1]);
917       val = values[3*i+2];
918       if (col >= baij->cstart*bs && col < baij->cend*bs) {
919         col -= baij->cstart*bs;
920         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
921       } else {
922         if (mat->was_assembled) {
923           if (!baij->colmap) {
924             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
925           }
926 #if defined (USE_CTABLE)
927 	  col = TableFind( baij->colmap, col/bs + 1 ) - 1 + col%bs;
928 #else
929           col = (baij->colmap[col/bs]) - 1 + col%bs;
930 #endif
931           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
932             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
933             col = (int) PetscReal(values[3*i+1]);
934           }
935         }
936         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
937       }
938     }
939     count--;
940   }
941   if (baij->recv_waits) PetscFree(baij->recv_waits);
942   if (baij->rvalues)    PetscFree(baij->rvalues);
943 
944   /* wait on sends */
945   if (baij->nsends) {
946     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
947     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
948     PetscFree(send_status);
949   }
950   if (baij->send_waits) PetscFree(baij->send_waits);
951   if (baij->svalues)    PetscFree(baij->svalues);
952 
953   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
954   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
955 
956   /* determine if any processor has disassembled, if so we must
957      also disassemble ourselfs, in order that we may reassemble. */
958   /*
959      if nonzero structure of submatrix B cannot change then we know that
960      no processor disassembled thus we can skip this stuff
961   */
962   if (!((Mat_SeqBAIJ*) baij->B->data)->nonew)  {
963     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
964     if (mat->was_assembled && !other_disassembled) {
965       ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
966     }
967   }
968 
969   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
970     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
971   }
972   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
973   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
974 
975 #if defined(USE_PETSC_BOPT_g)
976   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
977     PLogInfo(0,"MatAssemblyEnd_MPIBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",
978              ((double)baij->ht_total_ct)/baij->ht_insert_ct);
979     baij->ht_total_ct  = 0;
980     baij->ht_insert_ct = 0;
981   }
982 #endif
983   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
984     ierr = MatCreateHashTable_MPIBAIJ_Private(mat,baij->ht_fact); CHKERRQ(ierr);
985     mat->ops->setvalues        = MatSetValues_MPIBAIJ_HT;
986     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
987   }
988 
989   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
990   PetscFunctionReturn(0);
991 }
992 
993 #undef __FUNC__
994 #define __FUNC__ "MatView_MPIBAIJ_Binary"
995 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
996 {
997   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
998   int          ierr;
999 
1000   PetscFunctionBegin;
1001   if (baij->size == 1) {
1002     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1003   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 #undef __FUNC__
1008 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
1009 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
1010 {
1011   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
1012   int          ierr, format,bs = baij->bs, size = baij->size, rank = baij->rank;
1013   FILE         *fd;
1014   ViewerType   vtype;
1015 
1016   PetscFunctionBegin;
1017   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1018   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
1019     ierr = ViewerGetFormat(viewer,&format);
1020     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
1021       MatInfo info;
1022       MPI_Comm_rank(mat->comm,&rank);
1023       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
1024       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
1025       PetscSequentialPhaseBegin(mat->comm,1);
1026       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
1027               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
1028               baij->bs,(int)info.memory);
1029       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
1030       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1031       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
1032       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
1033       fflush(fd);
1034       PetscSequentialPhaseEnd(mat->comm,1);
1035       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
1036       PetscFunctionReturn(0);
1037     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
1038       PetscPrintf(mat->comm,"  block size is %d\n",bs);
1039       PetscFunctionReturn(0);
1040     }
1041   }
1042 
1043   if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
1044     Draw       draw;
1045     PetscTruth isnull;
1046     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
1047     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
1048   }
1049 
1050   if (size == 1) {
1051     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
1052   } else {
1053     /* assemble the entire matrix onto first processor. */
1054     Mat         A;
1055     Mat_SeqBAIJ *Aloc;
1056     int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
1057     int         mbs = baij->mbs;
1058     Scalar      *a;
1059 
1060     if (!rank) {
1061       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1062     } else {
1063       ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
1064     }
1065     PLogObjectParent(mat,A);
1066 
1067     /* copy over the A part */
1068     Aloc = (Mat_SeqBAIJ*) baij->A->data;
1069     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1070     rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1071 
1072     for ( i=0; i<mbs; i++ ) {
1073       rvals[0] = bs*(baij->rstart + i);
1074       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1075       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1076         col = (baij->cstart+aj[j])*bs;
1077         for (k=0; k<bs; k++ ) {
1078           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1079           col++; a += bs;
1080         }
1081       }
1082     }
1083     /* copy over the B part */
1084     Aloc = (Mat_SeqBAIJ*) baij->B->data;
1085     ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1086     for ( i=0; i<mbs; i++ ) {
1087       rvals[0] = bs*(baij->rstart + i);
1088       for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1089       for ( j=ai[i]; j<ai[i+1]; j++ ) {
1090         col = baij->garray[aj[j]]*bs;
1091         for (k=0; k<bs; k++ ) {
1092           ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
1093           col++; a += bs;
1094         }
1095       }
1096     }
1097     PetscFree(rvals);
1098     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1099     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1100     /*
1101        Everyone has to call to draw the matrix since the graphics waits are
1102        synchronized across all processors that share the Draw object
1103     */
1104     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
1105       ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1106     }
1107     ierr = MatDestroy(A); CHKERRQ(ierr);
1108   }
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 
1113 
1114 #undef __FUNC__
1115 #define __FUNC__ "MatView_MPIBAIJ"
1116 int MatView_MPIBAIJ(Mat mat,Viewer viewer)
1117 {
1118   int         ierr;
1119   ViewerType  vtype;
1120 
1121   PetscFunctionBegin;
1122   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1123   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
1124       PetscTypeCompare(vtype,MATLAB_VIEWER)) {
1125     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1126   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
1127     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1128   } else {
1129     SETERRQ(1,1,"Viewer type not supported by PETSc object");
1130   }
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 #undef __FUNC__
1135 #define __FUNC__ "MatDestroy_MPIBAIJ"
1136 int MatDestroy_MPIBAIJ(Mat mat)
1137 {
1138   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1139   int         ierr;
1140 
1141   PetscFunctionBegin;
1142   if (--mat->refct > 0) PetscFunctionReturn(0);
1143 
1144   if (mat->mapping) {
1145     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
1146   }
1147   if (mat->bmapping) {
1148     ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
1149   }
1150   if (mat->rmap) {
1151     ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
1152   }
1153   if (mat->cmap) {
1154     ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
1155   }
1156 #if defined(USE_PETSC_LOG)
1157   PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",baij->M,baij->N);
1158 #endif
1159 
1160   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1161   PetscFree(baij->rowners);
1162   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1163   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1164 #if defined (USE_CTABLE)
1165   if (baij->colmap) TableDelete(baij->colmap);
1166 #else
1167   if (baij->colmap) PetscFree(baij->colmap);
1168 #endif
1169   if (baij->garray) PetscFree(baij->garray);
1170   if (baij->lvec)   VecDestroy(baij->lvec);
1171   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1172   if (baij->rowvalues) PetscFree(baij->rowvalues);
1173   if (baij->barray) PetscFree(baij->barray);
1174   if (baij->hd) PetscFree(baij->hd);
1175   PetscFree(baij);
1176   PLogObjectDestroy(mat);
1177   PetscHeaderDestroy(mat);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNC__
1182 #define __FUNC__ "MatMult_MPIBAIJ"
1183 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1184 {
1185   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1186   int         ierr, nt;
1187 
1188   PetscFunctionBegin;
1189   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1190   if (nt != a->n) {
1191     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1192   }
1193   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
1194   if (nt != a->m) {
1195     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1196   }
1197   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1198   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
1199   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1200   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1201   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 #undef __FUNC__
1206 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1207 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1208 {
1209   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1210   int        ierr;
1211 
1212   PetscFunctionBegin;
1213   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1214   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1215   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1216   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 #undef __FUNC__
1221 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1222 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1223 {
1224   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1225   int         ierr;
1226 
1227   PetscFunctionBegin;
1228   /* do nondiagonal part */
1229   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1230   /* send it on its way */
1231   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1232   /* do local part */
1233   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1234   /* receive remote parts: note this assumes the values are not actually */
1235   /* inserted in yy until the next line, which is true for my implementation*/
1236   /* but is not perhaps always true. */
1237   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 #undef __FUNC__
1242 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1243 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1244 {
1245   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1246   int         ierr;
1247 
1248   PetscFunctionBegin;
1249   /* do nondiagonal part */
1250   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1251   /* send it on its way */
1252   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1253   /* do local part */
1254   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1255   /* receive remote parts: note this assumes the values are not actually */
1256   /* inserted in yy until the next line, which is true for my implementation*/
1257   /* but is not perhaps always true. */
1258   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /*
1263   This only works correctly for square matrices where the subblock A->A is the
1264    diagonal block
1265 */
1266 #undef __FUNC__
1267 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1268 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1269 {
1270   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1271   int         ierr;
1272 
1273   PetscFunctionBegin;
1274   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1275   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNC__
1280 #define __FUNC__ "MatScale_MPIBAIJ"
1281 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1282 {
1283   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1284   int         ierr;
1285 
1286   PetscFunctionBegin;
1287   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1288   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 #undef __FUNC__
1293 #define __FUNC__ "MatGetSize_MPIBAIJ"
1294 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1295 {
1296   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1297 
1298   PetscFunctionBegin;
1299   if (m) *m = mat->M;
1300   if (n) *n = mat->N;
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNC__
1305 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1306 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1307 {
1308   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1309 
1310   PetscFunctionBegin;
1311   *m = mat->m; *n = mat->n;
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 #undef __FUNC__
1316 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1317 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1318 {
1319   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1320 
1321   PetscFunctionBegin;
1322   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1327 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1328 
1329 #undef __FUNC__
1330 #define __FUNC__ "MatGetRow_MPIBAIJ"
1331 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1332 {
1333   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1334   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1335   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1336   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1337   int        *cmap, *idx_p,cstart = mat->cstart;
1338 
1339   PetscFunctionBegin;
1340   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1341   mat->getrowactive = PETSC_TRUE;
1342 
1343   if (!mat->rowvalues && (idx || v)) {
1344     /*
1345         allocate enough space to hold information from the longest row.
1346     */
1347     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1348     int     max = 1,mbs = mat->mbs,tmp;
1349     for ( i=0; i<mbs; i++ ) {
1350       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1351       if (max < tmp) { max = tmp; }
1352     }
1353     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1354     CHKPTRQ(mat->rowvalues);
1355     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1356   }
1357 
1358   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
1359   lrow = row - brstart;
1360 
1361   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1362   if (!v)   {pvA = 0; pvB = 0;}
1363   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1364   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1365   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1366   nztot = nzA + nzB;
1367 
1368   cmap  = mat->garray;
1369   if (v  || idx) {
1370     if (nztot) {
1371       /* Sort by increasing column numbers, assuming A and B already sorted */
1372       int imark = -1;
1373       if (v) {
1374         *v = v_p = mat->rowvalues;
1375         for ( i=0; i<nzB; i++ ) {
1376           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1377           else break;
1378         }
1379         imark = i;
1380         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1381         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1382       }
1383       if (idx) {
1384         *idx = idx_p = mat->rowindices;
1385         if (imark > -1) {
1386           for ( i=0; i<imark; i++ ) {
1387             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1388           }
1389         } else {
1390           for ( i=0; i<nzB; i++ ) {
1391             if (cmap[cworkB[i]/bs] < cstart)
1392               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1393             else break;
1394           }
1395           imark = i;
1396         }
1397         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1398         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1399       }
1400     } else {
1401       if (idx) *idx = 0;
1402       if (v)   *v   = 0;
1403     }
1404   }
1405   *nz = nztot;
1406   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1407   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 #undef __FUNC__
1412 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1413 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1414 {
1415   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1416 
1417   PetscFunctionBegin;
1418   if (baij->getrowactive == PETSC_FALSE) {
1419     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1420   }
1421   baij->getrowactive = PETSC_FALSE;
1422   PetscFunctionReturn(0);
1423 }
1424 
1425 #undef __FUNC__
1426 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1427 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1428 {
1429   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1430 
1431   PetscFunctionBegin;
1432   *bs = baij->bs;
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 #undef __FUNC__
1437 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1438 int MatZeroEntries_MPIBAIJ(Mat A)
1439 {
1440   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1441   int         ierr;
1442 
1443   PetscFunctionBegin;
1444   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1445   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1446   PetscFunctionReturn(0);
1447 }
1448 
1449 #undef __FUNC__
1450 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1451 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1452 {
1453   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1454   Mat         A = a->A, B = a->B;
1455   int         ierr;
1456   double      isend[5], irecv[5];
1457 
1458   PetscFunctionBegin;
1459   info->block_size     = (double)a->bs;
1460   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1461   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1462   isend[3] = info->memory;  isend[4] = info->mallocs;
1463   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1464   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1465   isend[3] += info->memory;  isend[4] += info->mallocs;
1466   if (flag == MAT_LOCAL) {
1467     info->nz_used      = isend[0];
1468     info->nz_allocated = isend[1];
1469     info->nz_unneeded  = isend[2];
1470     info->memory       = isend[3];
1471     info->mallocs      = isend[4];
1472   } else if (flag == MAT_GLOBAL_MAX) {
1473     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1474     info->nz_used      = irecv[0];
1475     info->nz_allocated = irecv[1];
1476     info->nz_unneeded  = irecv[2];
1477     info->memory       = irecv[3];
1478     info->mallocs      = irecv[4];
1479   } else if (flag == MAT_GLOBAL_SUM) {
1480     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1481     info->nz_used      = irecv[0];
1482     info->nz_allocated = irecv[1];
1483     info->nz_unneeded  = irecv[2];
1484     info->memory       = irecv[3];
1485     info->mallocs      = irecv[4];
1486   }
1487   info->rows_global       = (double)a->M;
1488   info->columns_global    = (double)a->N;
1489   info->rows_local        = (double)a->m;
1490   info->columns_local     = (double)a->N;
1491   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1492   info->fill_ratio_needed = 0;
1493   info->factor_mallocs    = 0;
1494   PetscFunctionReturn(0);
1495 }
1496 
1497 #undef __FUNC__
1498 #define __FUNC__ "MatSetOption_MPIBAIJ"
1499 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1500 {
1501   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1502 
1503   PetscFunctionBegin;
1504   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1505       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1506       op == MAT_COLUMNS_UNSORTED ||
1507       op == MAT_COLUMNS_SORTED ||
1508       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1509       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1510         MatSetOption(a->A,op);
1511         MatSetOption(a->B,op);
1512   } else if (op == MAT_ROW_ORIENTED) {
1513         a->roworiented = 1;
1514         MatSetOption(a->A,op);
1515         MatSetOption(a->B,op);
1516   } else if (op == MAT_ROWS_SORTED ||
1517              op == MAT_ROWS_UNSORTED ||
1518              op == MAT_SYMMETRIC ||
1519              op == MAT_STRUCTURALLY_SYMMETRIC ||
1520              op == MAT_YES_NEW_DIAGONALS ||
1521              op == MAT_USE_HASH_TABLE)
1522     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1523   else if (op == MAT_COLUMN_ORIENTED) {
1524     a->roworiented = 0;
1525     MatSetOption(a->A,op);
1526     MatSetOption(a->B,op);
1527   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1528     a->donotstash = 1;
1529   } else if (op == MAT_NO_NEW_DIAGONALS) {
1530     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1531   } else if (op == MAT_USE_HASH_TABLE) {
1532     a->ht_flag = 1;
1533   } else {
1534     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1535   }
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 #undef __FUNC__
1540 #define __FUNC__ "MatTranspose_MPIBAIJ("
1541 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1542 {
1543   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1544   Mat_SeqBAIJ *Aloc;
1545   Mat        B;
1546   int        ierr,M=baij->M,N=baij->N,*ai,*aj,i,*rvals,j,k,col;
1547   int        bs=baij->bs,mbs=baij->mbs;
1548   Scalar     *a;
1549 
1550   PetscFunctionBegin;
1551   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1552   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1553   CHKERRQ(ierr);
1554 
1555   /* copy over the A part */
1556   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1557   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1558   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1559 
1560   for ( i=0; i<mbs; i++ ) {
1561     rvals[0] = bs*(baij->rstart + i);
1562     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1563     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1564       col = (baij->cstart+aj[j])*bs;
1565       for (k=0; k<bs; k++ ) {
1566         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1567         col++; a += bs;
1568       }
1569     }
1570   }
1571   /* copy over the B part */
1572   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1573   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1574   for ( i=0; i<mbs; i++ ) {
1575     rvals[0] = bs*(baij->rstart + i);
1576     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1577     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1578       col = baij->garray[aj[j]]*bs;
1579       for (k=0; k<bs; k++ ) {
1580         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1581         col++; a += bs;
1582       }
1583     }
1584   }
1585   PetscFree(rvals);
1586   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1587   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1588 
1589   if (matout != PETSC_NULL) {
1590     *matout = B;
1591   } else {
1592     PetscOps *Abops;
1593     MatOps   Aops;
1594 
1595     /* This isn't really an in-place transpose .... but free data structures from baij */
1596     PetscFree(baij->rowners);
1597     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1598     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1599     if (baij->colmap) PetscFree(baij->colmap);
1600     if (baij->garray) PetscFree(baij->garray);
1601     if (baij->lvec) VecDestroy(baij->lvec);
1602     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1603     PetscFree(baij);
1604 
1605     /*
1606        This is horrible, horrible code. We need to keep the
1607       A pointers for the bops and ops but copy everything
1608       else from C.
1609     */
1610     Abops = A->bops;
1611     Aops  = A->ops;
1612     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1613     A->bops = Abops;
1614     A->ops  = Aops;
1615 
1616     PetscHeaderDestroy(B);
1617   }
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 #undef __FUNC__
1622 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1623 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1624 {
1625   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1626   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1627   int ierr,s1,s2,s3;
1628 
1629   PetscFunctionBegin;
1630   if (ll)  {
1631     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1632     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1633     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1634     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1635     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1636   }
1637   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 #undef __FUNC__
1642 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1643 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1644 {
1645   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1646   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1647   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1648   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1649   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1650   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1651   MPI_Comm       comm = A->comm;
1652   MPI_Request    *send_waits,*recv_waits;
1653   MPI_Status     recv_status,*send_status;
1654   IS             istmp;
1655   PetscTruth     localdiag;
1656 
1657   PetscFunctionBegin;
1658   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1659   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1660 
1661   /*  first count number of contributors to each processor */
1662   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1663   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1664   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1665   for ( i=0; i<N; i++ ) {
1666     idx = rows[i];
1667     found = 0;
1668     for ( j=0; j<size; j++ ) {
1669       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1670         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1671       }
1672     }
1673     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1674   }
1675   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1676 
1677   /* inform other processors of number of messages and max length*/
1678   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1679   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1680   nrecvs = work[rank];
1681   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1682   nmax   = work[rank];
1683   PetscFree(work);
1684 
1685   /* post receives:   */
1686   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1687   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1688   for ( i=0; i<nrecvs; i++ ) {
1689     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1690   }
1691 
1692   /* do sends:
1693      1) starts[i] gives the starting index in svalues for stuff going to
1694      the ith processor
1695   */
1696   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1697   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1698   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1699   starts[0] = 0;
1700   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1701   for ( i=0; i<N; i++ ) {
1702     svalues[starts[owner[i]]++] = rows[i];
1703   }
1704   ISRestoreIndices(is,&rows);
1705 
1706   starts[0] = 0;
1707   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1708   count = 0;
1709   for ( i=0; i<size; i++ ) {
1710     if (procs[i]) {
1711       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1712     }
1713   }
1714   PetscFree(starts);
1715 
1716   base = owners[rank]*bs;
1717 
1718   /*  wait on receives */
1719   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1720   source = lens + nrecvs;
1721   count  = nrecvs; slen = 0;
1722   while (count) {
1723     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1724     /* unpack receives into our local space */
1725     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1726     source[imdex]  = recv_status.MPI_SOURCE;
1727     lens[imdex]  = n;
1728     slen += n;
1729     count--;
1730   }
1731   PetscFree(recv_waits);
1732 
1733   /* move the data into the send scatter */
1734   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1735   count = 0;
1736   for ( i=0; i<nrecvs; i++ ) {
1737     values = rvalues + i*nmax;
1738     for ( j=0; j<lens[i]; j++ ) {
1739       lrows[count++] = values[j] - base;
1740     }
1741   }
1742   PetscFree(rvalues); PetscFree(lens);
1743   PetscFree(owner); PetscFree(nprocs);
1744 
1745   /* actually zap the local rows */
1746   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1747   PLogObjectParent(A,istmp);
1748 
1749   /*
1750         Zero the required rows. If the "diagonal block" of the matrix
1751      is square and the user wishes to set the diagonal we use seperate
1752      code so that MatSetValues() is not called for each diagonal allocating
1753      new memory, thus calling lots of mallocs and slowing things down.
1754 
1755        Contributed by: Mathew Knepley
1756   */
1757   localdiag = PETSC_FALSE;
1758   if (diag && (l->A->M == l->A->N)) {
1759     localdiag = PETSC_TRUE;
1760     ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
1761   } else {
1762     ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1763   }
1764   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1765   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1766 
1767   if (diag && (localdiag == PETSC_FALSE)) {
1768     for ( i = 0; i < slen; i++ ) {
1769       row = lrows[i] + rstart_bs;
1770       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1771     }
1772     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1773     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1774   }
1775   PetscFree(lrows);
1776 
1777   /* wait on sends */
1778   if (nsends) {
1779     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1780     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1781     PetscFree(send_status);
1782   }
1783   PetscFree(send_waits); PetscFree(svalues);
1784 
1785   PetscFunctionReturn(0);
1786 }
1787 
1788 extern int MatPrintHelp_SeqBAIJ(Mat);
1789 #undef __FUNC__
1790 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1791 int MatPrintHelp_MPIBAIJ(Mat A)
1792 {
1793   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1794   MPI_Comm    comm = A->comm;
1795   static int  called = 0;
1796   int         ierr;
1797 
1798   PetscFunctionBegin;
1799   if (!a->rank) {
1800     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1801   }
1802   if (called) {PetscFunctionReturn(0);} else called = 1;
1803   (*PetscHelpPrintf)(comm," Options for MATMPIBAIJ matrix format (the defaults):\n");
1804   (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");
1805   PetscFunctionReturn(0);
1806 }
1807 
1808 #undef __FUNC__
1809 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1810 int MatSetUnfactored_MPIBAIJ(Mat A)
1811 {
1812   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1813   int         ierr;
1814 
1815   PetscFunctionBegin;
1816   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1817   PetscFunctionReturn(0);
1818 }
1819 
1820 static int MatDuplicate_MPIBAIJ(Mat,MatDuplicateOption,Mat *);
1821 
1822 /* -------------------------------------------------------------------*/
1823 static struct _MatOps MatOps_Values = {
1824   MatSetValues_MPIBAIJ,
1825   MatGetRow_MPIBAIJ,
1826   MatRestoreRow_MPIBAIJ,
1827   MatMult_MPIBAIJ,
1828   MatMultAdd_MPIBAIJ,
1829   MatMultTrans_MPIBAIJ,
1830   MatMultTransAdd_MPIBAIJ,
1831   0,
1832   0,
1833   0,
1834   0,
1835   0,
1836   0,
1837   0,
1838   MatTranspose_MPIBAIJ,
1839   MatGetInfo_MPIBAIJ,
1840   0,
1841   MatGetDiagonal_MPIBAIJ,
1842   MatDiagonalScale_MPIBAIJ,
1843   MatNorm_MPIBAIJ,
1844   MatAssemblyBegin_MPIBAIJ,
1845   MatAssemblyEnd_MPIBAIJ,
1846   0,
1847   MatSetOption_MPIBAIJ,
1848   MatZeroEntries_MPIBAIJ,
1849   MatZeroRows_MPIBAIJ,
1850   0,
1851   0,
1852   0,
1853   0,
1854   MatGetSize_MPIBAIJ,
1855   MatGetLocalSize_MPIBAIJ,
1856   MatGetOwnershipRange_MPIBAIJ,
1857   0,
1858   0,
1859   0,
1860   0,
1861   MatDuplicate_MPIBAIJ,
1862   0,
1863   0,
1864   0,
1865   0,
1866   0,
1867   MatGetSubMatrices_MPIBAIJ,
1868   MatIncreaseOverlap_MPIBAIJ,
1869   MatGetValues_MPIBAIJ,
1870   0,
1871   MatPrintHelp_MPIBAIJ,
1872   MatScale_MPIBAIJ,
1873   0,
1874   0,
1875   0,
1876   MatGetBlockSize_MPIBAIJ,
1877   0,
1878   0,
1879   0,
1880   0,
1881   0,
1882   0,
1883   MatSetUnfactored_MPIBAIJ,
1884   0,
1885   MatSetValuesBlocked_MPIBAIJ,
1886   0,
1887   0,
1888   0,
1889   MatGetMaps_Petsc};
1890 
1891 #include "pc.h"
1892 EXTERN_C_BEGIN
1893 extern int PCSetUp_BJacobi_BAIJ(PC);
1894 EXTERN_C_END
1895 
1896 #undef __FUNC__
1897 #define __FUNC__ "MatCreateMPIBAIJ"
1898 /*@C
1899    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1900    (block compressed row).  For good matrix assembly performance
1901    the user should preallocate the matrix storage by setting the parameters
1902    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1903    performance can be increased by more than a factor of 50.
1904 
1905    Collective on MPI_Comm
1906 
1907    Input Parameters:
1908 +  comm - MPI communicator
1909 .  bs   - size of blockk
1910 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1911            This value should be the same as the local size used in creating the
1912            y vector for the matrix-vector product y = Ax.
1913 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1914            This value should be the same as the local size used in creating the
1915            x vector for the matrix-vector product y = Ax.
1916 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1917 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1918 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1919            submatrix  (same for all local rows)
1920 .  d_nzz - array containing the number of block nonzeros in the various block rows
1921            of the in diagonal portion of the local (possibly different for each block
1922            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1923 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1924            submatrix (same for all local rows).
1925 -  o_nzz - array containing the number of nonzeros in the various block rows of the
1926            off-diagonal portion of the local submatrix (possibly different for
1927            each block row) or PETSC_NULL.
1928 
1929    Output Parameter:
1930 .  A - the matrix
1931 
1932    Options Database Keys:
1933 .   -mat_no_unroll - uses code that does not unroll the loops in the
1934                      block calculations (much slower)
1935 .   -mat_block_size - size of the blocks to use
1936 .   -mat_mpi - use the parallel matrix data structures even on one processor
1937                (defaults to using SeqBAIJ format on one processor)
1938 
1939    Notes:
1940    The user MUST specify either the local or global matrix dimensions
1941    (possibly both).
1942 
1943    If PETSC_DECIDE or  PETSC_DETERMINE is used for a particular argument on one processor
1944    than it must be used on all processors that share the object for that argument.
1945 
1946    Storage Information:
1947    For a square global matrix we define each processor's diagonal portion
1948    to be its local rows and the corresponding columns (a square submatrix);
1949    each processor's off-diagonal portion encompasses the remainder of the
1950    local matrix (a rectangular submatrix).
1951 
1952    The user can specify preallocated storage for the diagonal part of
1953    the local submatrix with either d_nz or d_nnz (not both).  Set
1954    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1955    memory allocation.  Likewise, specify preallocated storage for the
1956    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1957 
1958    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1959    the figure below we depict these three local rows and all columns (0-11).
1960 
1961 .vb
1962            0 1 2 3 4 5 6 7 8 9 10 11
1963           -------------------
1964    row 3  |  o o o d d d o o o o o o
1965    row 4  |  o o o d d d o o o o o o
1966    row 5  |  o o o d d d o o o o o o
1967           -------------------
1968 .ve
1969 
1970    Thus, any entries in the d locations are stored in the d (diagonal)
1971    submatrix, and any entries in the o locations are stored in the
1972    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1973    stored simply in the MATSEQBAIJ format for compressed row storage.
1974 
1975    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1976    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1977    In general, for PDE problems in which most nonzeros are near the diagonal,
1978    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1979    or you will get TERRIBLE performance; see the users' manual chapter on
1980    matrices.
1981 
1982 .keywords: matrix, block, aij, compressed row, sparse, parallel
1983 
1984 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatCreateMPIAIJ()
1985 @*/
1986 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1987                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1988 {
1989   Mat          B;
1990   Mat_MPIBAIJ  *b;
1991   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size,flg;
1992   int          flag1 = 0,flag2 = 0;
1993 
1994   PetscFunctionBegin;
1995   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
1996 
1997   MPI_Comm_size(comm,&size);
1998   ierr = OptionsHasName(PETSC_NULL,"-mat_mpibaij",&flag1); CHKERRQ(ierr);
1999   ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
2000   if (!flag1 && !flag2 && size == 1) {
2001     if (M == PETSC_DECIDE) M = m;
2002     if (N == PETSC_DECIDE) N = n;
2003     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
2004     PetscFunctionReturn(0);
2005   }
2006 
2007   *A = 0;
2008   PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",comm,MatDestroy,MatView);
2009   PLogObjectCreate(B);
2010   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
2011   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
2012   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));
2013 
2014   B->ops->destroy    = MatDestroy_MPIBAIJ;
2015   B->ops->view       = MatView_MPIBAIJ;
2016   B->mapping    = 0;
2017   B->factor     = 0;
2018   B->assembled  = PETSC_FALSE;
2019 
2020   B->insertmode = NOT_SET_VALUES;
2021   MPI_Comm_rank(comm,&b->rank);
2022   MPI_Comm_size(comm,&b->size);
2023 
2024   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
2025     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
2026   }
2027   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
2028     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
2029   }
2030   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
2031     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
2032   }
2033   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
2034   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
2035 
2036   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
2037     work[0] = m; work[1] = n;
2038     mbs = m/bs; nbs = n/bs;
2039     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
2040     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
2041     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
2042   }
2043   if (m == PETSC_DECIDE) {
2044     Mbs = M/bs;
2045     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
2046     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
2047     m   = mbs*bs;
2048   }
2049   if (n == PETSC_DECIDE) {
2050     Nbs = N/bs;
2051     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
2052     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
2053     n   = nbs*bs;
2054   }
2055   if (mbs*bs != m || nbs*bs != n) {
2056     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
2057   }
2058 
2059   b->m = m; B->m = m;
2060   b->n = n; B->n = n;
2061   b->N = N; B->N = N;
2062   b->M = M; B->M = M;
2063   b->bs  = bs;
2064   b->bs2 = bs*bs;
2065   b->mbs = mbs;
2066   b->nbs = nbs;
2067   b->Mbs = Mbs;
2068   b->Nbs = Nbs;
2069 
2070   /* the information in the maps duplicates the information computed below, eventually
2071      we should remove the duplicate information that is not contained in the maps */
2072   ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
2073   ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);
2074 
2075   /* build local table of row and column ownerships */
2076   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
2077   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2078   b->cowners = b->rowners + b->size + 2;
2079   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2080   b->rowners[0] = 0;
2081   for ( i=2; i<=b->size; i++ ) {
2082     b->rowners[i] += b->rowners[i-1];
2083   }
2084   b->rstart    = b->rowners[b->rank];
2085   b->rend      = b->rowners[b->rank+1];
2086   b->rstart_bs = b->rstart * bs;
2087   b->rend_bs   = b->rend * bs;
2088 
2089   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2090   b->cowners[0] = 0;
2091   for ( i=2; i<=b->size; i++ ) {
2092     b->cowners[i] += b->cowners[i-1];
2093   }
2094   b->cstart    = b->cowners[b->rank];
2095   b->cend      = b->cowners[b->rank+1];
2096   b->cstart_bs = b->cstart * bs;
2097   b->cend_bs   = b->cend * bs;
2098 
2099 
2100   if (d_nz == PETSC_DEFAULT) d_nz = 5;
2101   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
2102   PLogObjectParent(B,b->A);
2103   if (o_nz == PETSC_DEFAULT) o_nz = 0;
2104   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
2105   PLogObjectParent(B,b->B);
2106 
2107   /* build cache for off array entries formed */
2108   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
2109   b->donotstash  = 0;
2110   b->colmap      = 0;
2111   b->garray      = 0;
2112   b->roworiented = 1;
2113 
2114   /* stuff used in block assembly */
2115   b->barray       = 0;
2116 
2117   /* stuff used for matrix vector multiply */
2118   b->lvec         = 0;
2119   b->Mvctx        = 0;
2120 
2121   /* stuff for MatGetRow() */
2122   b->rowindices   = 0;
2123   b->rowvalues    = 0;
2124   b->getrowactive = PETSC_FALSE;
2125 
2126   /* hash table stuff */
2127   b->ht           = 0;
2128   b->hd           = 0;
2129   b->ht_size      = 0;
2130   b->ht_flag      = 0;
2131   b->ht_fact      = 0;
2132   b->ht_total_ct  = 0;
2133   b->ht_insert_ct = 0;
2134 
2135   *A = B;
2136   ierr = OptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg); CHKERRQ(ierr);
2137   if (flg) {
2138     double fact = 1.39;
2139     ierr = MatSetOption(B,MAT_USE_HASH_TABLE); CHKERRQ(ierr);
2140     ierr = OptionsGetDouble(PETSC_NULL,"-mat_use_hash_table",&fact,&flg); CHKERRQ(ierr);
2141     if (fact <= 1.0) fact = 1.39;
2142     ierr = MatMPIBAIJSetHashTableFactor(B,fact); CHKERRQ(ierr);
2143     PLogInfo(0,"MatCreateMPIBAIJ:Hash table Factor used %5.2f\n",fact);
2144   }
2145   ierr = PetscObjectComposeFunction((PetscObject)B,"PCSetUp_BJacobi_C",
2146                                      "PCSetUp_BJacobi_BAIJ",
2147                                      (void*)PCSetUp_BJacobi_BAIJ);CHKERRQ(ierr);
2148   PetscFunctionReturn(0);
2149 }
2150 
2151 #undef __FUNC__
2152 #define __FUNC__ "MatDuplicate_MPIBAIJ"
2153 static int MatDuplicate_MPIBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
2154 {
2155   Mat         mat;
2156   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
2157   int         ierr, len=0, flg;
2158 
2159   PetscFunctionBegin;
2160   *newmat       = 0;
2161   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIBAIJ,"Mat",matin->comm,MatDestroy,MatView);
2162   PLogObjectCreate(mat);
2163   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
2164   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
2165   mat->ops->destroy    = MatDestroy_MPIBAIJ;
2166   mat->ops->view       = MatView_MPIBAIJ;
2167   mat->factor          = matin->factor;
2168   mat->assembled       = PETSC_TRUE;
2169 
2170   a->m = mat->m   = oldmat->m;
2171   a->n = mat->n   = oldmat->n;
2172   a->M = mat->M   = oldmat->M;
2173   a->N = mat->N   = oldmat->N;
2174 
2175   a->bs  = oldmat->bs;
2176   a->bs2 = oldmat->bs2;
2177   a->mbs = oldmat->mbs;
2178   a->nbs = oldmat->nbs;
2179   a->Mbs = oldmat->Mbs;
2180   a->Nbs = oldmat->Nbs;
2181 
2182   a->rstart       = oldmat->rstart;
2183   a->rend         = oldmat->rend;
2184   a->cstart       = oldmat->cstart;
2185   a->cend         = oldmat->cend;
2186   a->size         = oldmat->size;
2187   a->rank         = oldmat->rank;
2188   mat->insertmode = NOT_SET_VALUES;
2189   a->rowvalues    = 0;
2190   a->getrowactive = PETSC_FALSE;
2191   a->barray       = 0;
2192 
2193   /* hash table stuff */
2194   a->ht           = 0;
2195   a->hd           = 0;
2196   a->ht_size      = 0;
2197   a->ht_flag      = oldmat->ht_flag;
2198   a->ht_fact      = oldmat->ht_fact;
2199   a->ht_total_ct  = 0;
2200   a->ht_insert_ct = 0;
2201 
2202 
2203   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
2204   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
2205   a->cowners = a->rowners + a->size + 2;
2206   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
2207   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
2208   if (oldmat->colmap) {
2209 #if defined (USE_CTABLE)
2210   ierr = TableCreateCopy( &a->colmap,oldmat->colmap ); CHKERRQ(ierr);
2211 #else
2212     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
2213     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2214     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
2215 #endif
2216   } else a->colmap = 0;
2217   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
2218     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
2219     PLogObjectMemory(mat,len*sizeof(int));
2220     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2221   } else a->garray = 0;
2222 
2223   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2224   PLogObjectParent(mat,a->lvec);
2225   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2226 
2227   PLogObjectParent(mat,a->Mvctx);
2228   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
2229   PLogObjectParent(mat,a->A);
2230   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
2231   PLogObjectParent(mat,a->B);
2232   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2233   ierr = FListDuplicate(mat->qlist,&matin->qlist);CHKERRQ(ierr);
2234   if (flg) {
2235     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2236   }
2237   *newmat = mat;
2238   PetscFunctionReturn(0);
2239 }
2240 
2241 #include "sys.h"
2242 
2243 #undef __FUNC__
2244 #define __FUNC__ "MatLoad_MPIBAIJ"
2245 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
2246 {
2247   Mat          A;
2248   int          i, nz, ierr, j,rstart, rend, fd;
2249   Scalar       *vals,*buf;
2250   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2251   MPI_Status   status;
2252   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2253   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
2254   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2255   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2256   int          dcount,kmax,k,nzcount,tmp;
2257 
2258   PetscFunctionBegin;
2259   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2260 
2261   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2262   if (!rank) {
2263     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2264     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2265     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2266     if (header[3] < 0) {
2267       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2268     }
2269   }
2270 
2271   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2272   M = header[1]; N = header[2];
2273 
2274   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2275 
2276   /*
2277      This code adds extra rows to make sure the number of rows is
2278      divisible by the blocksize
2279   */
2280   Mbs        = M/bs;
2281   extra_rows = bs - M + bs*(Mbs);
2282   if (extra_rows == bs) extra_rows = 0;
2283   else                  Mbs++;
2284   if (extra_rows &&!rank) {
2285     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2286   }
2287 
2288   /* determine ownership of all rows */
2289   mbs = Mbs/size + ((Mbs % size) > rank);
2290   m   = mbs * bs;
2291   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2292   browners = rowners + size + 1;
2293   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2294   rowners[0] = 0;
2295   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2296   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2297   rstart = rowners[rank];
2298   rend   = rowners[rank+1];
2299 
2300   /* distribute row lengths to all processors */
2301   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2302   if (!rank) {
2303     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2304     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2305     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2306     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2307     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2308     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2309     PetscFree(sndcounts);
2310   } else {
2311     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2312   }
2313 
2314   if (!rank) {
2315     /* calculate the number of nonzeros on each processor */
2316     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2317     PetscMemzero(procsnz,size*sizeof(int));
2318     for ( i=0; i<size; i++ ) {
2319       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2320         procsnz[i] += rowlengths[j];
2321       }
2322     }
2323     PetscFree(rowlengths);
2324 
2325     /* determine max buffer needed and allocate it */
2326     maxnz = 0;
2327     for ( i=0; i<size; i++ ) {
2328       maxnz = PetscMax(maxnz,procsnz[i]);
2329     }
2330     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2331 
2332     /* read in my part of the matrix column indices  */
2333     nz = procsnz[0];
2334     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2335     mycols = ibuf;
2336     if (size == 1)  nz -= extra_rows;
2337     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2338     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2339 
2340     /* read in every ones (except the last) and ship off */
2341     for ( i=1; i<size-1; i++ ) {
2342       nz   = procsnz[i];
2343       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2344       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2345     }
2346     /* read in the stuff for the last proc */
2347     if ( size != 1 ) {
2348       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2349       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2350       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2351       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2352     }
2353     PetscFree(cols);
2354   } else {
2355     /* determine buffer space needed for message */
2356     nz = 0;
2357     for ( i=0; i<m; i++ ) {
2358       nz += locrowlens[i];
2359     }
2360     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2361     mycols = ibuf;
2362     /* receive message of column indices*/
2363     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2364     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2365     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2366   }
2367 
2368   /* loop over local rows, determining number of off diagonal entries */
2369   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2370   odlens = dlens + (rend-rstart);
2371   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2372   PetscMemzero(mask,3*Mbs*sizeof(int));
2373   masked1 = mask    + Mbs;
2374   masked2 = masked1 + Mbs;
2375   rowcount = 0; nzcount = 0;
2376   for ( i=0; i<mbs; i++ ) {
2377     dcount  = 0;
2378     odcount = 0;
2379     for ( j=0; j<bs; j++ ) {
2380       kmax = locrowlens[rowcount];
2381       for ( k=0; k<kmax; k++ ) {
2382         tmp = mycols[nzcount++]/bs;
2383         if (!mask[tmp]) {
2384           mask[tmp] = 1;
2385           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2386           else masked1[dcount++] = tmp;
2387         }
2388       }
2389       rowcount++;
2390     }
2391 
2392     dlens[i]  = dcount;
2393     odlens[i] = odcount;
2394 
2395     /* zero out the mask elements we set */
2396     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2397     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2398   }
2399 
2400   /* create our matrix */
2401   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2402          CHKERRQ(ierr);
2403   A = *newmat;
2404   MatSetOption(A,MAT_COLUMNS_SORTED);
2405 
2406   if (!rank) {
2407     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2408     /* read in my part of the matrix numerical values  */
2409     nz = procsnz[0];
2410     vals = buf;
2411     mycols = ibuf;
2412     if (size == 1)  nz -= extra_rows;
2413     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2414     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2415 
2416     /* insert into matrix */
2417     jj      = rstart*bs;
2418     for ( i=0; i<m; i++ ) {
2419       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2420       mycols += locrowlens[i];
2421       vals   += locrowlens[i];
2422       jj++;
2423     }
2424     /* read in other processors (except the last one) and ship out */
2425     for ( i=1; i<size-1; i++ ) {
2426       nz   = procsnz[i];
2427       vals = buf;
2428       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2429       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2430     }
2431     /* the last proc */
2432     if ( size != 1 ){
2433       nz   = procsnz[i] - extra_rows;
2434       vals = buf;
2435       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2436       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2437       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2438     }
2439     PetscFree(procsnz);
2440   } else {
2441     /* receive numeric values */
2442     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2443 
2444     /* receive message of values*/
2445     vals   = buf;
2446     mycols = ibuf;
2447     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2448     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2449     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2450 
2451     /* insert into matrix */
2452     jj      = rstart*bs;
2453     for ( i=0; i<m; i++ ) {
2454       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2455       mycols += locrowlens[i];
2456       vals   += locrowlens[i];
2457       jj++;
2458     }
2459   }
2460   PetscFree(locrowlens);
2461   PetscFree(buf);
2462   PetscFree(ibuf);
2463   PetscFree(rowners);
2464   PetscFree(dlens);
2465   PetscFree(mask);
2466   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2467   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2468   PetscFunctionReturn(0);
2469 }
2470 
2471 
2472 
2473 #undef __FUNC__
2474 #define __FUNC__ "MatMPIBAIJSetHashTableFactor"
2475 /*@
2476    MatMPIBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2477 
2478    Input Parameters:
2479 .  mat  - the matrix
2480 .  fact - factor
2481 
2482    Collective on Mat
2483 
2484   Notes:
2485    This can also be set by the command line option: -mat_use_hash_table fact
2486 
2487 .keywords: matrix, hashtable, factor, HT
2488 
2489 .seealso: MatSetOption()
2490 @*/
2491 int MatMPIBAIJSetHashTableFactor(Mat mat,double fact)
2492 {
2493   Mat_MPIBAIJ *baij;
2494 
2495   PetscFunctionBegin;
2496   PetscValidHeaderSpecific(mat,MAT_COOKIE);
2497   if (mat->type != MATMPIBAIJ) {
2498       SETERRQ(PETSC_ERR_ARG_WRONG,1,"Incorrect matrix type. Use MPIBAIJ only.");
2499   }
2500   baij = (Mat_MPIBAIJ*) mat->data;
2501   baij->ht_fact = fact;
2502   PetscFunctionReturn(0);
2503 }
2504