xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision d0a415805d4ada8adc754cff5723cc7354ee7166)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.95 1998/01/12 17:17:31 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
18    storage of the matrix.  This is done in a non scable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
37 #define CHUNKSIZE  10
38 
39 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
40 { \
41  \
42     brow = row/bs;  \
43     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
44     rmax = aimax[brow]; nrow = ailen[brow]; \
45       bcol = col/bs; \
46       ridx = row % bs; cidx = col % bs; \
47       low = 0; high = nrow; \
48       while (high-low > 3) { \
49         t = (low+high)/2; \
50         if (rp[t] > bcol) high = t; \
51         else              low  = t; \
52       } \
53       for ( _i=low; _i<high; _i++ ) { \
54         if (rp[_i] > bcol) break; \
55         if (rp[_i] == bcol) { \
56           bap  = ap +  bs2*_i + bs*cidx + ridx; \
57           if (addv == ADD_VALUES) *bap += value;  \
58           else                    *bap  = value;  \
59           goto a_noinsert; \
60         } \
61       } \
62       if (a->nonew == 1) goto a_noinsert; \
63       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
64       if (nrow >= rmax) { \
65         /* there is no extra room in row, therefore enlarge */ \
66         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
67         Scalar *new_a; \
68  \
69         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
70  \
71         /* malloc new storage space */ \
72         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
73         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
74         new_j   = (int *) (new_a + bs2*new_nz); \
75         new_i   = new_j + new_nz; \
76  \
77         /* copy over old data into new slots */ \
78         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
79         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
80         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
81         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
82         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
83                                                            len*sizeof(int)); \
84         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
85         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
86         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
87                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
88         /* free up old matrix storage */ \
89         PetscFree(a->a);  \
90         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
91         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
92         a->singlemalloc = 1; \
93  \
94         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
95         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
96         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
97         a->maxnz += bs2*CHUNKSIZE; \
98         a->reallocs++; \
99         a->nz++; \
100       } \
101       N = nrow++ - 1;  \
102       /* shift up all the later entries in this row */ \
103       for ( ii=N; ii>=_i; ii-- ) { \
104         rp[ii+1] = rp[ii]; \
105         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
106       } \
107       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
108       rp[_i]                      = bcol;  \
109       ap[bs2*_i + bs*cidx + ridx] = value;  \
110       a_noinsert:; \
111     ailen[brow] = nrow; \
112 }
113 
114 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
115 { \
116  \
117     brow = row/bs;  \
118     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
119     rmax = bimax[brow]; nrow = bilen[brow]; \
120       bcol = col/bs; \
121       ridx = row % bs; cidx = col % bs; \
122       low = 0; high = nrow; \
123       while (high-low > 3) { \
124         t = (low+high)/2; \
125         if (rp[t] > bcol) high = t; \
126         else              low  = t; \
127       } \
128       for ( _i=low; _i<high; _i++ ) { \
129         if (rp[_i] > bcol) break; \
130         if (rp[_i] == bcol) { \
131           bap  = ap +  bs2*_i + bs*cidx + ridx; \
132           if (addv == ADD_VALUES) *bap += value;  \
133           else                    *bap  = value;  \
134           goto b_noinsert; \
135         } \
136       } \
137       if (b->nonew == 1) goto b_noinsert; \
138       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
139       if (nrow >= rmax) { \
140         /* there is no extra room in row, therefore enlarge */ \
141         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
142         Scalar *new_a; \
143  \
144         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
145  \
146         /* malloc new storage space */ \
147         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
148         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
149         new_j   = (int *) (new_a + bs2*new_nz); \
150         new_i   = new_j + new_nz; \
151  \
152         /* copy over old data into new slots */ \
153         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
154         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
155         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
156         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
157         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
158                                                            len*sizeof(int)); \
159         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
160         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
161         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
162                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
163         /* free up old matrix storage */ \
164         PetscFree(b->a);  \
165         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
166         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
167         b->singlemalloc = 1; \
168  \
169         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
170         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
171         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
172         b->maxnz += bs2*CHUNKSIZE; \
173         b->reallocs++; \
174         b->nz++; \
175       } \
176       N = nrow++ - 1;  \
177       /* shift up all the later entries in this row */ \
178       for ( ii=N; ii>=_i; ii-- ) { \
179         rp[ii+1] = rp[ii]; \
180         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
181       } \
182       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
183       rp[_i]                      = bcol;  \
184       ap[bs2*_i + bs*cidx + ridx] = value;  \
185       b_noinsert:; \
186     bilen[brow] = nrow; \
187 }
188 
189 #undef __FUNC__
190 #define __FUNC__ "MatSetValues_MPIBAIJ"
191 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
192 {
193   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
194   Scalar      value;
195   int         ierr,i,j,row,col;
196   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
197   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
198   int         cend_orig=baij->cend_bs,bs=baij->bs;
199 
200   /* Some Variables required in the macro */
201   Mat         A = baij->A;
202   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
203   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
204   Scalar      *aa=a->a;
205 
206   Mat         B = baij->B;
207   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
208   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
209   Scalar      *ba=b->a;
210 
211   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
212   int         low,high,t,ridx,cidx,bs2=a->bs2;
213   Scalar      *ap,*bap;
214 
215   PetscFunctionBegin;
216   for ( i=0; i<m; i++ ) {
217 #if defined(USE_PETSC_BOPT_g)
218     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
219     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
220 #endif
221     if (im[i] >= rstart_orig && im[i] < rend_orig) {
222       row = im[i] - rstart_orig;
223       for ( j=0; j<n; j++ ) {
224         if (in[j] >= cstart_orig && in[j] < cend_orig){
225           col = in[j] - cstart_orig;
226           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
227           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
228           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
229         }
230 #if defined(USE_PETSC_BOPT_g)
231         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
232         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
233 #endif
234         else {
235           if (mat->was_assembled) {
236             if (!baij->colmap) {
237               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
238             }
239             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
240             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
241               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
242               col =  in[j];
243               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
244               B = baij->B;
245               b = (Mat_SeqBAIJ *) (B)->data;
246               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
247               ba=b->a;
248             }
249           } else col = in[j];
250           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
251           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
252           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
253         }
254       }
255     } else {
256       if (roworiented && !baij->donotstash) {
257         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
258       } else {
259         if (!baij->donotstash) {
260           row = im[i];
261 	  for ( j=0; j<n; j++ ) {
262 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
263           }
264         }
265       }
266     }
267   }
268   PetscFunctionReturn(0);
269 }
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1)))
377 
378 #undef __FUNC__
379 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
380 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
381 {
382   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
383   int         ierr,i,j,row,col;
384   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
385   int         rend_orig=baij->rend_bs;
386 
387   int         h1,key,size=baij->ht_size,k,bs=baij->bs;
388   double      * HT  = baij->ht;
389   Scalar      ** HD = (Scalar **)(HT + size),value;
390 
391 
392   PetscFunctionBegin;
393 
394   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
395 
396   for ( i=0; i<m; i++ ) {
397 #if defined(USE_PETSC_BOPT_g)
398     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
399     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
400 #endif
401     if (im[i] >= rstart_orig && im[i] < rend_orig) {
402       row = im[i];
403       for ( j=0; j<n; j++ ) {
404         col = in[j];
405         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
406         /* Look up into the Hash Table */
407         key = (row/bs)*baij->Nbs+(col/bs)+1;
408         h1  = HASH1(size,key);
409 
410         for ( k=0; k<size; k++ ){
411           if (HT[(h1+k)%size] == key) {
412             if (addv == ADD_VALUES) *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs) += value;
413             else                    *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs)  = value;
414             break;
415           }
416         }
417         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
418       }
419     } else {
420       if (roworiented && !baij->donotstash) {
421         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
422       } else {
423         if (!baij->donotstash) {
424           row = im[i];
425 	  for ( j=0; j<n; j++ ) {
426 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
427           }
428         }
429       }
430     }
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNC__
436 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
437 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
438 {
439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
440   int         ierr,i,j,ii,jj,row,col,k,l;
441   int         roworiented = baij->roworiented,rstart=baij->rstart ;
442   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
443 
444   int         h1,key,size=baij->ht_size;
445   double      * HT  = baij->ht;
446   Scalar      ** HD = (Scalar **)(HT + size),*value,*baij_a;
447 
448   PetscFunctionBegin;
449 
450   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
451 
452   if (roworiented) {
453     stepval = (n-1)*bs;
454   } else {
455     stepval = (m-1)*bs;
456   }
457   for ( i=0; i<m; i++ ) {
458 #if defined(USE_PETSC_BOPT_g)
459     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
460     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
461 #endif
462     if (im[i] >= rstart && im[i] < rend) {
463       row = im[i];
464       for ( j=0; j<n; j++ ) {
465         col = in[j];
466 
467         /* Look up into the Hash Table */
468         key = row*baij->Nbs+col+1;
469         h1  = HASH1(size,key);
470 
471         for ( k=0; k<size; k++ ){
472           if (HT[(h1+k)%size] == key) {
473             baij_a = HD[(h1+k)%size];
474 
475             if (roworiented) {
476               value = v + i*(stepval+bs)*bs + j*bs;
477               for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
478                 for ( jj=0; jj<bs2; jj+=bs ) {
479                   if (addv == ADD_VALUES) baij_a[jj]  += *value++;
480                   else                    baij_a[jj]   = *value++;
481                 }
482               }
483 
484             } else {
485               value = v + j*(stepval+bs)*bs + i*bs;
486               for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
487                 for ( jj=0; jj<bs; jj++ ) {
488                   if (addv == ADD_VALUES) baij_a[jj]  += *value++;
489                   else                    baij_a[jj]   = *value++;
490                 }
491               }
492             }
493             break;
494           }
495         }
496         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
497       }
498     } else {
499       if (!baij->donotstash) {
500         if (roworiented ) {
501           row   = im[i]*bs;
502           value = v + i*(stepval+bs)*bs;
503           for ( j=0; j<bs; j++,row++ ) {
504             for ( k=0; k<n; k++ ) {
505               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
506                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
507               }
508             }
509           }
510         } else {
511           for ( j=0; j<n; j++ ) {
512             value = v + j*(stepval+bs)*bs + i*bs;
513             col   = in[j]*bs;
514             for ( k=0; k<bs; k++,col++,value+=stepval) {
515               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
516                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
517               }
518             }
519           }
520         }
521       }
522     }
523   }
524   PetscFunctionReturn(0);
525 }
526 #undef __FUNC__
527 #define __FUNC__ "MatGetValues_MPIBAIJ"
528 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
529 {
530   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
531   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
532   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
533 
534   for ( i=0; i<m; i++ ) {
535     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
536     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
537     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
538       row = idxm[i] - bsrstart;
539       for ( j=0; j<n; j++ ) {
540         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
541         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
542         if (idxn[j] >= bscstart && idxn[j] < bscend){
543           col = idxn[j] - bscstart;
544           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
545         } else {
546           if (!baij->colmap) {
547             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
548           }
549           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
550              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
551           else {
552             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
553             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
554           }
555         }
556       }
557     } else {
558       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
559     }
560   }
561   PetscFunctionReturn(0);
562 }
563 
564 #undef __FUNC__
565 #define __FUNC__ "MatNorm_MPIBAIJ"
566 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
567 {
568   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
569   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
570   int        ierr, i,bs2=baij->bs2;
571   double     sum = 0.0;
572   Scalar     *v;
573 
574   PetscFunctionBegin;
575   if (baij->size == 1) {
576     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
577   } else {
578     if (type == NORM_FROBENIUS) {
579       v = amat->a;
580       for (i=0; i<amat->nz*bs2; i++ ) {
581 #if defined(USE_PETSC_COMPLEX)
582         sum += real(conj(*v)*(*v)); v++;
583 #else
584         sum += (*v)*(*v); v++;
585 #endif
586       }
587       v = bmat->a;
588       for (i=0; i<bmat->nz*bs2; i++ ) {
589 #if defined(USE_PETSC_COMPLEX)
590         sum += real(conj(*v)*(*v)); v++;
591 #else
592         sum += (*v)*(*v); v++;
593 #endif
594       }
595       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
596       *norm = sqrt(*norm);
597     } else {
598       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
599     }
600   }
601   PetscFunctionReturn(0);
602 }
603 
604 #undef __FUNC__
605 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
606 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
607 {
608   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
609   MPI_Comm    comm = mat->comm;
610   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
611   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
612   MPI_Request *send_waits,*recv_waits;
613   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
614   InsertMode  addv;
615   Scalar      *rvalues,*svalues;
616 
617   PetscFunctionBegin;
618   /* make sure all processors are either in INSERTMODE or ADDMODE */
619   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
620   if (addv == (ADD_VALUES|INSERT_VALUES)) {
621     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
622   }
623   mat->insertmode = addv; /* in case this processor had no cache */
624 
625   /*  first count number of contributors to each processor */
626   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
627   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
628   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
629   for ( i=0; i<baij->stash.n; i++ ) {
630     idx = baij->stash.idx[i];
631     for ( j=0; j<size; j++ ) {
632       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
633         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
634       }
635     }
636   }
637   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
638 
639   /* inform other processors of number of messages and max length*/
640   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
641   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
642   nreceives = work[rank];
643   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
644   nmax      = work[rank];
645   PetscFree(work);
646 
647   /* post receives:
648        1) each message will consist of ordered pairs
649      (global index,value) we store the global index as a double
650      to simplify the message passing.
651        2) since we don't know how long each individual message is we
652      allocate the largest needed buffer for each receive. Potentially
653      this is a lot of wasted space.
654 
655 
656        This could be done better.
657   */
658   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
659   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
660   for ( i=0; i<nreceives; i++ ) {
661     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
662   }
663 
664   /* do sends:
665       1) starts[i] gives the starting index in svalues for stuff going to
666          the ith processor
667   */
668   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
669   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
670   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
671   starts[0] = 0;
672   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
673   for ( i=0; i<baij->stash.n; i++ ) {
674     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
675     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
676     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
677   }
678   PetscFree(owner);
679   starts[0] = 0;
680   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
681   count = 0;
682   for ( i=0; i<size; i++ ) {
683     if (procs[i]) {
684       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
685     }
686   }
687   PetscFree(starts); PetscFree(nprocs);
688 
689   /* Free cache space */
690   PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",rank,baij->stash.n);
691   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
692 
693   baij->svalues    = svalues;    baij->rvalues    = rvalues;
694   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
695   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
696   baij->rmax       = nmax;
697 
698   PetscFunctionReturn(0);
699 }
700 
701 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
702 {
703   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
704   Mat         A = baij->A, B=baij->B;
705   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
706   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
707   int         size,ct=0,max1=0,max2=0,bs2=baij->bs2,rstart=baij->rstart;
708   int         cstart=baij->cstart,*garray=baij->garray,row,col;
709   double      *HT,key;
710   extern int  PetscGlobalRank;
711   Scalar      **HD;
712   /*
713      Scalar      *aa=a->a,*ba=b->a;
714      static double *HT;
715      static      int flag=1;
716      */
717   PetscFunctionBegin;
718   baij->ht_size=(int)(factor*nz);
719   size = baij->ht_size;
720   /* Allocate Memory for Hash Table */
721   if (baij->ht) {
722     PetscFunctionReturn(0);
723   }
724 
725   baij->ht = (double*)PetscMalloc(size*(sizeof(double)+sizeof(Scalar*))+1); CHKPTRQ(baij->ht);
726   HT = baij->ht;
727   HD = (Scalar**)(HT + size);
728   PetscMemzero(HT,size*sizeof(double)+sizeof(Scalar*));
729 
730 
731   /* Loop Over A */
732   for ( i=0; i<a->mbs; i++ ) {
733     for ( j=ai[i]; j<ai[i+1]; j++ ) {
734       row = i+rstart;
735       col = aj[j]+cstart;
736 
737       key = row*baij->Nbs + col + 1;
738       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
739       h1  = HASH1(size,key);
740 
741       for ( k=0; k<size; k++ ){
742         if (HT[(h1+k)%size] == 0.0) {
743           HT[(h1+k)%size] = key;
744           HD[(h1+k)%size] = a->a + j*bs2;
745           break;
746         } else ct++;
747       }
748       if (k> max1) max1 = k;
749     }
750   }
751   /* Loop Over B */
752   for ( i=0; i<b->mbs; i++ ) {
753     for ( j=bi[i]; j<bi[i+1]; j++ ) {
754       row = i+rstart;
755       col = garray[bj[j]];
756       key = row*baij->Nbs + col + 1;
757       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
758       h1  = HASH1(size,key);
759       for ( k=0; k<size; k++ ){
760         if (HT[(h1+k)%size] == 0.0) {
761           HT[(h1+k)%size] = key;
762           HD[(h1+k)%size] = b->a + j*bs2;
763           break;
764         } else ct++;
765       }
766       if (k> max2) max2 = k;
767     }
768   }
769 
770   /* Print Summary */
771   for ( i=0,key=0.0,j=0; i<size; i++)
772     if (HT[i]) {j++;}
773 
774   /*printf("[%d] fact = %3.2f max1 = %5d max2 = %5d Size %5d - Searches %5d Avg %5.2f Keys %5d\n",
775          PetscGlobalRank,factor,max1,max2,size,ct+j,((float)ct+j),j); */
776   fflush(stdout);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNC__
781 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
782 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
783 {
784   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
785   MPI_Status  *send_status,recv_status;
786   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
787   int         bs=baij->bs,row,col,other_disassembled,flg;
788   Scalar      *values,val;
789   InsertMode  addv = mat->insertmode;
790 
791   PetscFunctionBegin;
792   /*  wait on receives */
793   while (count) {
794     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
795     /* unpack receives into our local space */
796     values = baij->rvalues + 3*imdex*baij->rmax;
797     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
798     n = n/3;
799     for ( i=0; i<n; i++ ) {
800       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
801       col = (int) PetscReal(values[3*i+1]);
802       val = values[3*i+2];
803       if (col >= baij->cstart*bs && col < baij->cend*bs) {
804         col -= baij->cstart*bs;
805         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
806       } else {
807         if (mat->was_assembled) {
808           if (!baij->colmap) {
809             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
810           }
811           col = (baij->colmap[col/bs]) - 1 + col%bs;
812           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
813             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
814             col = (int) PetscReal(values[3*i+1]);
815           }
816         }
817         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
818       }
819     }
820     count--;
821   }
822   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
823 
824   /* wait on sends */
825   if (baij->nsends) {
826     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
827     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
828     PetscFree(send_status);
829   }
830   PetscFree(baij->send_waits); PetscFree(baij->svalues);
831 
832   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
833   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
834 
835   /* determine if any processor has disassembled, if so we must
836      also disassemble ourselfs, in order that we may reassemble. */
837   ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
838   if (mat->was_assembled && !other_disassembled) {
839     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
840   }
841 
842   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
843     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
844   }
845   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
846   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
847 
848   ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
849   if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
850     double fact = 1.39;
851     MatCreateHashTable_MPIBAIJ_Private(mat,fact);
852     mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
853     mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
854   }
855   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
856   PetscFunctionReturn(0);
857 }
858 
859 #undef __FUNC__
860 #define __FUNC__ "MatView_MPIBAIJ_Binary"
861 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
862 {
863   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
864   int          ierr;
865 
866   PetscFunctionBegin;
867   if (baij->size == 1) {
868     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
869   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
870   PetscFunctionReturn(0);
871 }
872 
873 #undef __FUNC__
874 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
875 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
876 {
877   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
878   int          ierr, format,rank,bs = baij->bs;
879   FILE         *fd;
880   ViewerType   vtype;
881 
882   PetscFunctionBegin;
883   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
884   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
885     ierr = ViewerGetFormat(viewer,&format);
886     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
887       MatInfo info;
888       MPI_Comm_rank(mat->comm,&rank);
889       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
890       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
891       PetscSequentialPhaseBegin(mat->comm,1);
892       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
893               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
894               baij->bs,(int)info.memory);
895       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
896       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
897       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
898       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
899       fflush(fd);
900       PetscSequentialPhaseEnd(mat->comm,1);
901       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
902       PetscFunctionReturn(0);
903     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
904       PetscPrintf(mat->comm,"  block size is %d\n",bs);
905       PetscFunctionReturn(0);
906     }
907   }
908 
909   if (vtype == DRAW_VIEWER) {
910     Draw       draw;
911     PetscTruth isnull;
912     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
913     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
914   }
915 
916   if (vtype == ASCII_FILE_VIEWER) {
917     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
918     PetscSequentialPhaseBegin(mat->comm,1);
919     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
920            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
921             baij->cstart*bs,baij->cend*bs);
922     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
923     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
924     fflush(fd);
925     PetscSequentialPhaseEnd(mat->comm,1);
926   } else {
927     int size = baij->size;
928     rank = baij->rank;
929     if (size == 1) {
930       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
931     } else {
932       /* assemble the entire matrix onto first processor. */
933       Mat         A;
934       Mat_SeqBAIJ *Aloc;
935       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
936       int         mbs=baij->mbs;
937       Scalar      *a;
938 
939       if (!rank) {
940         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
941       } else {
942         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
943       }
944       PLogObjectParent(mat,A);
945 
946       /* copy over the A part */
947       Aloc = (Mat_SeqBAIJ*) baij->A->data;
948       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
949       row = baij->rstart;
950       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
951 
952       for ( i=0; i<mbs; i++ ) {
953         rvals[0] = bs*(baij->rstart + i);
954         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
955         for ( j=ai[i]; j<ai[i+1]; j++ ) {
956           col = (baij->cstart+aj[j])*bs;
957           for (k=0; k<bs; k++ ) {
958             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
959             col++; a += bs;
960           }
961         }
962       }
963       /* copy over the B part */
964       Aloc = (Mat_SeqBAIJ*) baij->B->data;
965       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
966       row = baij->rstart*bs;
967       for ( i=0; i<mbs; i++ ) {
968         rvals[0] = bs*(baij->rstart + i);
969         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
970         for ( j=ai[i]; j<ai[i+1]; j++ ) {
971           col = baij->garray[aj[j]]*bs;
972           for (k=0; k<bs; k++ ) {
973             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
974             col++; a += bs;
975           }
976         }
977       }
978       PetscFree(rvals);
979       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
980       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
981       /*
982          Everyone has to call to draw the matrix since the graphics waits are
983          synchronized across all processors that share the Draw object
984       */
985       if (!rank || vtype == DRAW_VIEWER) {
986         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
987       }
988       ierr = MatDestroy(A); CHKERRQ(ierr);
989     }
990   }
991   PetscFunctionReturn(0);
992 }
993 
994 
995 
996 #undef __FUNC__
997 #define __FUNC__ "MatView_MPIBAIJ"
998 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
999 {
1000   Mat         mat = (Mat) obj;
1001   int         ierr;
1002   ViewerType  vtype;
1003 
1004   PetscFunctionBegin;
1005   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1006   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1007       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1008     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1009   } else if (vtype == BINARY_FILE_VIEWER) {
1010     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1011   }
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 #undef __FUNC__
1016 #define __FUNC__ "MatDestroy_MPIBAIJ"
1017 int MatDestroy_MPIBAIJ(PetscObject obj)
1018 {
1019   Mat         mat = (Mat) obj;
1020   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1021   int         ierr;
1022 
1023   PetscFunctionBegin;
1024 #if defined(USE_PETSC_LOG)
1025   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
1026 #endif
1027 
1028   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1029   PetscFree(baij->rowners);
1030   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1031   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1032   if (baij->colmap) PetscFree(baij->colmap);
1033   if (baij->garray) PetscFree(baij->garray);
1034   if (baij->lvec)   VecDestroy(baij->lvec);
1035   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1036   if (baij->rowvalues) PetscFree(baij->rowvalues);
1037   if (baij->barray) PetscFree(baij->barray);
1038   if (baij->ht) PetscFree(baij->ht);
1039   PetscFree(baij);
1040   PLogObjectDestroy(mat);
1041   PetscHeaderDestroy(mat);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 #undef __FUNC__
1046 #define __FUNC__ "MatMult_MPIBAIJ"
1047 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1048 {
1049   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1050   int         ierr, nt;
1051 
1052   PetscFunctionBegin;
1053   VecGetLocalSize_Fast(xx,nt);
1054   if (nt != a->n) {
1055     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1056   }
1057   VecGetLocalSize_Fast(yy,nt);
1058   if (nt != a->m) {
1059     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1060   }
1061   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1062   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1063   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1064   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1065   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 #undef __FUNC__
1070 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1071 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1072 {
1073   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1074   int        ierr;
1075 
1076   PetscFunctionBegin;
1077   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1078   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1079   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1080   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1081   PetscFunctionReturn(0);
1082 }
1083 
1084 #undef __FUNC__
1085 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1086 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1087 {
1088   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1089   int         ierr;
1090 
1091   PetscFunctionBegin;
1092   /* do nondiagonal part */
1093   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1094   /* send it on its way */
1095   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1096   /* do local part */
1097   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1098   /* receive remote parts: note this assumes the values are not actually */
1099   /* inserted in yy until the next line, which is true for my implementation*/
1100   /* but is not perhaps always true. */
1101   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 #undef __FUNC__
1106 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1107 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1108 {
1109   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1110   int         ierr;
1111 
1112   PetscFunctionBegin;
1113   /* do nondiagonal part */
1114   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1115   /* send it on its way */
1116   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1117   /* do local part */
1118   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1119   /* receive remote parts: note this assumes the values are not actually */
1120   /* inserted in yy until the next line, which is true for my implementation*/
1121   /* but is not perhaps always true. */
1122   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 /*
1127   This only works correctly for square matrices where the subblock A->A is the
1128    diagonal block
1129 */
1130 #undef __FUNC__
1131 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1132 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1133 {
1134   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1135   int         ierr;
1136 
1137   PetscFunctionBegin;
1138   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1139   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1140   PetscFunctionReturn(0);
1141 }
1142 
1143 #undef __FUNC__
1144 #define __FUNC__ "MatScale_MPIBAIJ"
1145 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1146 {
1147   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1148   int         ierr;
1149 
1150   PetscFunctionBegin;
1151   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1152   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 #undef __FUNC__
1157 #define __FUNC__ "MatGetSize_MPIBAIJ"
1158 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1159 {
1160   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1161 
1162   PetscFunctionBegin;
1163   if (m) *m = mat->M;
1164   if (n) *n = mat->N;
1165   PetscFunctionReturn(0);
1166 }
1167 
1168 #undef __FUNC__
1169 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1170 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1171 {
1172   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1173 
1174   PetscFunctionBegin;
1175   *m = mat->m; *n = mat->N;
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 #undef __FUNC__
1180 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1181 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1182 {
1183   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1184 
1185   PetscFunctionBegin;
1186   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1191 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1192 
1193 #undef __FUNC__
1194 #define __FUNC__ "MatGetRow_MPIBAIJ"
1195 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1196 {
1197   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1198   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1199   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1200   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1201   int        *cmap, *idx_p,cstart = mat->cstart;
1202 
1203   PetscFunctionBegin;
1204   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1205   mat->getrowactive = PETSC_TRUE;
1206 
1207   if (!mat->rowvalues && (idx || v)) {
1208     /*
1209         allocate enough space to hold information from the longest row.
1210     */
1211     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1212     int     max = 1,mbs = mat->mbs,tmp;
1213     for ( i=0; i<mbs; i++ ) {
1214       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1215       if (max < tmp) { max = tmp; }
1216     }
1217     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1218     CHKPTRQ(mat->rowvalues);
1219     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1220   }
1221 
1222   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
1223   lrow = row - brstart;
1224 
1225   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1226   if (!v)   {pvA = 0; pvB = 0;}
1227   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1228   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1229   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1230   nztot = nzA + nzB;
1231 
1232   cmap  = mat->garray;
1233   if (v  || idx) {
1234     if (nztot) {
1235       /* Sort by increasing column numbers, assuming A and B already sorted */
1236       int imark = -1;
1237       if (v) {
1238         *v = v_p = mat->rowvalues;
1239         for ( i=0; i<nzB; i++ ) {
1240           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1241           else break;
1242         }
1243         imark = i;
1244         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1245         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1246       }
1247       if (idx) {
1248         *idx = idx_p = mat->rowindices;
1249         if (imark > -1) {
1250           for ( i=0; i<imark; i++ ) {
1251             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1252           }
1253         } else {
1254           for ( i=0; i<nzB; i++ ) {
1255             if (cmap[cworkB[i]/bs] < cstart)
1256               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1257             else break;
1258           }
1259           imark = i;
1260         }
1261         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1262         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1263       }
1264     } else {
1265       if (idx) *idx = 0;
1266       if (v)   *v   = 0;
1267     }
1268   }
1269   *nz = nztot;
1270   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1271   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 #undef __FUNC__
1276 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1277 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1278 {
1279   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1280 
1281   PetscFunctionBegin;
1282   if (baij->getrowactive == PETSC_FALSE) {
1283     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1284   }
1285   baij->getrowactive = PETSC_FALSE;
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 #undef __FUNC__
1290 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1291 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1292 {
1293   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1294 
1295   PetscFunctionBegin;
1296   *bs = baij->bs;
1297   PetscFunctionReturn(0);
1298 }
1299 
1300 #undef __FUNC__
1301 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1302 int MatZeroEntries_MPIBAIJ(Mat A)
1303 {
1304   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1305   int         ierr;
1306 
1307   PetscFunctionBegin;
1308   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1309   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1310   PetscFunctionReturn(0);
1311 }
1312 
1313 #undef __FUNC__
1314 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1315 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1316 {
1317   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1318   Mat         A = a->A, B = a->B;
1319   int         ierr;
1320   double      isend[5], irecv[5];
1321 
1322   PetscFunctionBegin;
1323   info->block_size     = (double)a->bs;
1324   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1325   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1326   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1327   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1328   if (flag == MAT_LOCAL) {
1329     info->nz_used      = isend[0];
1330     info->nz_allocated = isend[1];
1331     info->nz_unneeded  = isend[2];
1332     info->memory       = isend[3];
1333     info->mallocs      = isend[4];
1334   } else if (flag == MAT_GLOBAL_MAX) {
1335     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1336     info->nz_used      = irecv[0];
1337     info->nz_allocated = irecv[1];
1338     info->nz_unneeded  = irecv[2];
1339     info->memory       = irecv[3];
1340     info->mallocs      = irecv[4];
1341   } else if (flag == MAT_GLOBAL_SUM) {
1342     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1343     info->nz_used      = irecv[0];
1344     info->nz_allocated = irecv[1];
1345     info->nz_unneeded  = irecv[2];
1346     info->memory       = irecv[3];
1347     info->mallocs      = irecv[4];
1348   }
1349   info->rows_global       = (double)a->M;
1350   info->columns_global    = (double)a->N;
1351   info->rows_local        = (double)a->m;
1352   info->columns_local     = (double)a->N;
1353   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1354   info->fill_ratio_needed = 0;
1355   info->factor_mallocs    = 0;
1356   PetscFunctionReturn(0);
1357 }
1358 
1359 #undef __FUNC__
1360 #define __FUNC__ "MatSetOption_MPIBAIJ"
1361 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1362 {
1363   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1364 
1365   PetscFunctionBegin;
1366   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1367       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1368       op == MAT_COLUMNS_UNSORTED ||
1369       op == MAT_COLUMNS_SORTED ||
1370       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1371       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1372         MatSetOption(a->A,op);
1373         MatSetOption(a->B,op);
1374   } else if (op == MAT_ROW_ORIENTED) {
1375         a->roworiented = 1;
1376         MatSetOption(a->A,op);
1377         MatSetOption(a->B,op);
1378   } else if (op == MAT_ROWS_SORTED ||
1379              op == MAT_ROWS_UNSORTED ||
1380              op == MAT_SYMMETRIC ||
1381              op == MAT_STRUCTURALLY_SYMMETRIC ||
1382              op == MAT_YES_NEW_DIAGONALS)
1383     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1384   else if (op == MAT_COLUMN_ORIENTED) {
1385     a->roworiented = 0;
1386     MatSetOption(a->A,op);
1387     MatSetOption(a->B,op);
1388   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1389     a->donotstash = 1;
1390   } else if (op == MAT_NO_NEW_DIAGONALS) {
1391     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1392   } else {
1393     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1394   }
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 #undef __FUNC__
1399 #define __FUNC__ "MatTranspose_MPIBAIJ("
1400 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1401 {
1402   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1403   Mat_SeqBAIJ *Aloc;
1404   Mat        B;
1405   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1406   int        bs=baij->bs,mbs=baij->mbs;
1407   Scalar     *a;
1408 
1409   PetscFunctionBegin;
1410   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1411   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1412   CHKERRQ(ierr);
1413 
1414   /* copy over the A part */
1415   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1416   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1417   row = baij->rstart;
1418   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1419 
1420   for ( i=0; i<mbs; i++ ) {
1421     rvals[0] = bs*(baij->rstart + i);
1422     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1423     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1424       col = (baij->cstart+aj[j])*bs;
1425       for (k=0; k<bs; k++ ) {
1426         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1427         col++; a += bs;
1428       }
1429     }
1430   }
1431   /* copy over the B part */
1432   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1433   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1434   row = baij->rstart*bs;
1435   for ( i=0; i<mbs; i++ ) {
1436     rvals[0] = bs*(baij->rstart + i);
1437     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1438     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1439       col = baij->garray[aj[j]]*bs;
1440       for (k=0; k<bs; k++ ) {
1441         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1442         col++; a += bs;
1443       }
1444     }
1445   }
1446   PetscFree(rvals);
1447   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1448   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1449 
1450   if (matout != PETSC_NULL) {
1451     *matout = B;
1452   } else {
1453     /* This isn't really an in-place transpose .... but free data structures from baij */
1454     PetscFree(baij->rowners);
1455     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1456     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1457     if (baij->colmap) PetscFree(baij->colmap);
1458     if (baij->garray) PetscFree(baij->garray);
1459     if (baij->lvec) VecDestroy(baij->lvec);
1460     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1461     PetscFree(baij);
1462     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1463     PetscHeaderDestroy(B);
1464   }
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNC__
1469 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1470 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1471 {
1472   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1473   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1474   int ierr,s1,s2,s3;
1475 
1476   PetscFunctionBegin;
1477   if (ll)  {
1478     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1479     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1480     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1481     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1482     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1483   }
1484   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 #undef __FUNC__
1489 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1490 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1491 {
1492   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1493   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1494   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1495   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1496   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1497   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1498   MPI_Comm       comm = A->comm;
1499   MPI_Request    *send_waits,*recv_waits;
1500   MPI_Status     recv_status,*send_status;
1501   IS             istmp;
1502 
1503   PetscFunctionBegin;
1504   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1505   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1506 
1507   /*  first count number of contributors to each processor */
1508   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1509   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1510   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1511   for ( i=0; i<N; i++ ) {
1512     idx = rows[i];
1513     found = 0;
1514     for ( j=0; j<size; j++ ) {
1515       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1516         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1517       }
1518     }
1519     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1520   }
1521   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1522 
1523   /* inform other processors of number of messages and max length*/
1524   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1525   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1526   nrecvs = work[rank];
1527   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1528   nmax   = work[rank];
1529   PetscFree(work);
1530 
1531   /* post receives:   */
1532   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1533   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1534   for ( i=0; i<nrecvs; i++ ) {
1535     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1536   }
1537 
1538   /* do sends:
1539      1) starts[i] gives the starting index in svalues for stuff going to
1540      the ith processor
1541   */
1542   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1543   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1544   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1545   starts[0] = 0;
1546   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1547   for ( i=0; i<N; i++ ) {
1548     svalues[starts[owner[i]]++] = rows[i];
1549   }
1550   ISRestoreIndices(is,&rows);
1551 
1552   starts[0] = 0;
1553   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1554   count = 0;
1555   for ( i=0; i<size; i++ ) {
1556     if (procs[i]) {
1557       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1558     }
1559   }
1560   PetscFree(starts);
1561 
1562   base = owners[rank]*bs;
1563 
1564   /*  wait on receives */
1565   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1566   source = lens + nrecvs;
1567   count  = nrecvs; slen = 0;
1568   while (count) {
1569     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1570     /* unpack receives into our local space */
1571     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1572     source[imdex]  = recv_status.MPI_SOURCE;
1573     lens[imdex]  = n;
1574     slen += n;
1575     count--;
1576   }
1577   PetscFree(recv_waits);
1578 
1579   /* move the data into the send scatter */
1580   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1581   count = 0;
1582   for ( i=0; i<nrecvs; i++ ) {
1583     values = rvalues + i*nmax;
1584     for ( j=0; j<lens[i]; j++ ) {
1585       lrows[count++] = values[j] - base;
1586     }
1587   }
1588   PetscFree(rvalues); PetscFree(lens);
1589   PetscFree(owner); PetscFree(nprocs);
1590 
1591   /* actually zap the local rows */
1592   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1593   PLogObjectParent(A,istmp);
1594 
1595   ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1596   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1597   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1598 
1599   if (diag) {
1600     for ( i = 0; i < slen; i++ ) {
1601       row = lrows[i] + rstart_bs;
1602       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1603     }
1604     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1605     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1606   }
1607   PetscFree(lrows);
1608 
1609   /* wait on sends */
1610   if (nsends) {
1611     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1612     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1613     PetscFree(send_status);
1614   }
1615   PetscFree(send_waits); PetscFree(svalues);
1616 
1617   PetscFunctionReturn(0);
1618 }
1619 extern int MatPrintHelp_SeqBAIJ(Mat);
1620 #undef __FUNC__
1621 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1622 int MatPrintHelp_MPIBAIJ(Mat A)
1623 {
1624   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1625   int         ierr;
1626 
1627   PetscFunctionBegin;
1628   if (!a->rank) {
1629     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1630   }
1631   PetscFunctionReturn(0);
1632 }
1633 
1634 #undef __FUNC__
1635 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1636 int MatSetUnfactored_MPIBAIJ(Mat A)
1637 {
1638   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1639   int         ierr;
1640 
1641   PetscFunctionBegin;
1642   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1643   PetscFunctionReturn(0);
1644 }
1645 
1646 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1647 
1648 /* -------------------------------------------------------------------*/
1649 static struct _MatOps MatOps = {
1650   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1651   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1652   0,0,0,0,
1653   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1654   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1655   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1656   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1657   0,0,0,MatGetSize_MPIBAIJ,
1658   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1659   0,0,MatConvertSameType_MPIBAIJ,0,0,
1660   0,0,0,MatGetSubMatrices_MPIBAIJ,
1661   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1662   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1663   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1664 
1665 
1666 #undef __FUNC__
1667 #define __FUNC__ "MatCreateMPIBAIJ"
1668 /*@C
1669    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1670    (block compressed row).  For good matrix assembly performance
1671    the user should preallocate the matrix storage by setting the parameters
1672    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1673    performance can be increased by more than a factor of 50.
1674 
1675    Input Parameters:
1676 .  comm - MPI communicator
1677 .  bs   - size of blockk
1678 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1679            This value should be the same as the local size used in creating the
1680            y vector for the matrix-vector product y = Ax.
1681 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1682            This value should be the same as the local size used in creating the
1683            x vector for the matrix-vector product y = Ax.
1684 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1685 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1686 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1687            submatrix  (same for all local rows)
1688 .  d_nzz - array containing the number of block nonzeros in the various block rows
1689            of the in diagonal portion of the local (possibly different for each block
1690            row) or PETSC_NULL.  You must leave room for the diagonal entry even if
1691            it is zero.
1692 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1693            submatrix (same for all local rows).
1694 .  o_nzz - array containing the number of nonzeros in the various block rows of the
1695            off-diagonal portion of the local submatrix (possibly different for
1696            each block row) or PETSC_NULL.
1697 
1698    Output Parameter:
1699 .  A - the matrix
1700 
1701    Notes:
1702    The user MUST specify either the local or global matrix dimensions
1703    (possibly both).
1704 
1705    Storage Information:
1706    For a square global matrix we define each processor's diagonal portion
1707    to be its local rows and the corresponding columns (a square submatrix);
1708    each processor's off-diagonal portion encompasses the remainder of the
1709    local matrix (a rectangular submatrix).
1710 
1711    The user can specify preallocated storage for the diagonal part of
1712    the local submatrix with either d_nz or d_nnz (not both).  Set
1713    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1714    memory allocation.  Likewise, specify preallocated storage for the
1715    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1716 
1717    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1718    the figure below we depict these three local rows and all columns (0-11).
1719 
1720 $          0 1 2 3 4 5 6 7 8 9 10 11
1721 $         -------------------
1722 $  row 3  |  o o o d d d o o o o o o
1723 $  row 4  |  o o o d d d o o o o o o
1724 $  row 5  |  o o o d d d o o o o o o
1725 $         -------------------
1726 $
1727 
1728    Thus, any entries in the d locations are stored in the d (diagonal)
1729    submatrix, and any entries in the o locations are stored in the
1730    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1731    stored simply in the MATSEQBAIJ format for compressed row storage.
1732 
1733    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1734    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1735    In general, for PDE problems in which most nonzeros are near the diagonal,
1736    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1737    or you will get TERRIBLE performance; see the users' manual chapter on
1738    matrices.
1739 
1740 .keywords: matrix, block, aij, compressed row, sparse, parallel
1741 
1742 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1743 @*/
1744 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1745                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1746 {
1747   Mat          B;
1748   Mat_MPIBAIJ  *b;
1749   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1750 
1751   PetscFunctionBegin;
1752   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
1753 
1754   MPI_Comm_size(comm,&size);
1755   if (size == 1) {
1756     if (M == PETSC_DECIDE) M = m;
1757     if (N == PETSC_DECIDE) N = n;
1758     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1759     PetscFunctionReturn(0);
1760   }
1761 
1762   *A = 0;
1763   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
1764   PLogObjectCreate(B);
1765   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1766   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1767   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1768 
1769   B->destroy    = MatDestroy_MPIBAIJ;
1770   B->view       = MatView_MPIBAIJ;
1771   B->mapping    = 0;
1772   B->factor     = 0;
1773   B->assembled  = PETSC_FALSE;
1774 
1775   B->insertmode = NOT_SET_VALUES;
1776   MPI_Comm_rank(comm,&b->rank);
1777   MPI_Comm_size(comm,&b->size);
1778 
1779   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1780     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1781   }
1782   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
1783     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
1784   }
1785   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
1786     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
1787   }
1788   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
1789   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1790 
1791   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1792     work[0] = m; work[1] = n;
1793     mbs = m/bs; nbs = n/bs;
1794     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1795     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1796     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1797   }
1798   if (m == PETSC_DECIDE) {
1799     Mbs = M/bs;
1800     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
1801     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1802     m   = mbs*bs;
1803   }
1804   if (n == PETSC_DECIDE) {
1805     Nbs = N/bs;
1806     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
1807     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1808     n   = nbs*bs;
1809   }
1810   if (mbs*bs != m || nbs*bs != n) {
1811     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
1812   }
1813 
1814   b->m = m; B->m = m;
1815   b->n = n; B->n = n;
1816   b->N = N; B->N = N;
1817   b->M = M; B->M = M;
1818   b->bs  = bs;
1819   b->bs2 = bs*bs;
1820   b->mbs = mbs;
1821   b->nbs = nbs;
1822   b->Mbs = Mbs;
1823   b->Nbs = Nbs;
1824 
1825   /* build local table of row and column ownerships */
1826   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1827   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1828   b->cowners = b->rowners + b->size + 2;
1829   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1830   b->rowners[0] = 0;
1831   for ( i=2; i<=b->size; i++ ) {
1832     b->rowners[i] += b->rowners[i-1];
1833   }
1834   b->rstart    = b->rowners[b->rank];
1835   b->rend      = b->rowners[b->rank+1];
1836   b->rstart_bs = b->rstart * bs;
1837   b->rend_bs   = b->rend * bs;
1838 
1839   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1840   b->cowners[0] = 0;
1841   for ( i=2; i<=b->size; i++ ) {
1842     b->cowners[i] += b->cowners[i-1];
1843   }
1844   b->cstart    = b->cowners[b->rank];
1845   b->cend      = b->cowners[b->rank+1];
1846   b->cstart_bs = b->cstart * bs;
1847   b->cend_bs   = b->cend * bs;
1848 
1849 
1850   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1851   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1852   PLogObjectParent(B,b->A);
1853   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1854   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1855   PLogObjectParent(B,b->B);
1856 
1857   /* build cache for off array entries formed */
1858   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1859   b->donotstash  = 0;
1860   b->colmap      = 0;
1861   b->garray      = 0;
1862   b->roworiented = 1;
1863 
1864   /* stuff used in block assembly */
1865   b->barray       = 0;
1866 
1867   /* stuff used for matrix vector multiply */
1868   b->lvec         = 0;
1869   b->Mvctx        = 0;
1870 
1871   /* stuff for MatGetRow() */
1872   b->rowindices   = 0;
1873   b->rowvalues    = 0;
1874   b->getrowactive = PETSC_FALSE;
1875 
1876   /* hash table stuff */
1877   b->ht          = 0;
1878   b->ht_size     = 0;
1879 
1880   *A = B;
1881   PetscFunctionReturn(0);
1882 }
1883 
1884 #undef __FUNC__
1885 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1886 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1887 {
1888   Mat         mat;
1889   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1890   int         ierr, len=0, flg;
1891 
1892   PetscFunctionBegin;
1893   *newmat       = 0;
1894   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1895   PLogObjectCreate(mat);
1896   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1897   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1898   mat->destroy    = MatDestroy_MPIBAIJ;
1899   mat->view       = MatView_MPIBAIJ;
1900   mat->factor     = matin->factor;
1901   mat->assembled  = PETSC_TRUE;
1902 
1903   a->m = mat->m   = oldmat->m;
1904   a->n = mat->n   = oldmat->n;
1905   a->M = mat->M   = oldmat->M;
1906   a->N = mat->N   = oldmat->N;
1907 
1908   a->bs  = oldmat->bs;
1909   a->bs2 = oldmat->bs2;
1910   a->mbs = oldmat->mbs;
1911   a->nbs = oldmat->nbs;
1912   a->Mbs = oldmat->Mbs;
1913   a->Nbs = oldmat->Nbs;
1914 
1915   a->rstart       = oldmat->rstart;
1916   a->rend         = oldmat->rend;
1917   a->cstart       = oldmat->cstart;
1918   a->cend         = oldmat->cend;
1919   a->size         = oldmat->size;
1920   a->rank         = oldmat->rank;
1921   mat->insertmode = NOT_SET_VALUES;
1922   a->rowvalues    = 0;
1923   a->getrowactive = PETSC_FALSE;
1924   a->barray       = 0;
1925 
1926   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1927   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1928   a->cowners = a->rowners + a->size + 2;
1929   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1930   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1931   if (oldmat->colmap) {
1932     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1933     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1934     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1935   } else a->colmap = 0;
1936   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1937     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1938     PLogObjectMemory(mat,len*sizeof(int));
1939     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1940   } else a->garray = 0;
1941 
1942   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1943   PLogObjectParent(mat,a->lvec);
1944   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1945   PLogObjectParent(mat,a->Mvctx);
1946   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1947   PLogObjectParent(mat,a->A);
1948   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1949   PLogObjectParent(mat,a->B);
1950   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1951   if (flg) {
1952     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1953   }
1954   *newmat = mat;
1955   PetscFunctionReturn(0);
1956 }
1957 
1958 #include "sys.h"
1959 
1960 #undef __FUNC__
1961 #define __FUNC__ "MatLoad_MPIBAIJ"
1962 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1963 {
1964   Mat          A;
1965   int          i, nz, ierr, j,rstart, rend, fd;
1966   Scalar       *vals,*buf;
1967   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1968   MPI_Status   status;
1969   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1970   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1971   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1972   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1973   int          dcount,kmax,k,nzcount,tmp;
1974 
1975   PetscFunctionBegin;
1976   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1977   bs2  = bs*bs;
1978 
1979   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1980   if (!rank) {
1981     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1982     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
1983     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
1984     if (header[3] < 0) {
1985       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
1986     }
1987   }
1988 
1989   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
1990   M = header[1]; N = header[2];
1991 
1992   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
1993 
1994   /*
1995      This code adds extra rows to make sure the number of rows is
1996      divisible by the blocksize
1997   */
1998   Mbs        = M/bs;
1999   extra_rows = bs - M + bs*(Mbs);
2000   if (extra_rows == bs) extra_rows = 0;
2001   else                  Mbs++;
2002   if (extra_rows &&!rank) {
2003     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2004   }
2005 
2006   /* determine ownership of all rows */
2007   mbs = Mbs/size + ((Mbs % size) > rank);
2008   m   = mbs * bs;
2009   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2010   browners = rowners + size + 1;
2011   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2012   rowners[0] = 0;
2013   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2014   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2015   rstart = rowners[rank];
2016   rend   = rowners[rank+1];
2017 
2018   /* distribute row lengths to all processors */
2019   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2020   if (!rank) {
2021     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2022     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2023     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2024     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2025     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2026     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2027     PetscFree(sndcounts);
2028   } else {
2029     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2030   }
2031 
2032   if (!rank) {
2033     /* calculate the number of nonzeros on each processor */
2034     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2035     PetscMemzero(procsnz,size*sizeof(int));
2036     for ( i=0; i<size; i++ ) {
2037       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2038         procsnz[i] += rowlengths[j];
2039       }
2040     }
2041     PetscFree(rowlengths);
2042 
2043     /* determine max buffer needed and allocate it */
2044     maxnz = 0;
2045     for ( i=0; i<size; i++ ) {
2046       maxnz = PetscMax(maxnz,procsnz[i]);
2047     }
2048     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2049 
2050     /* read in my part of the matrix column indices  */
2051     nz = procsnz[0];
2052     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2053     mycols = ibuf;
2054     if (size == 1)  nz -= extra_rows;
2055     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2056     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2057 
2058     /* read in every ones (except the last) and ship off */
2059     for ( i=1; i<size-1; i++ ) {
2060       nz   = procsnz[i];
2061       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2062       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2063     }
2064     /* read in the stuff for the last proc */
2065     if ( size != 1 ) {
2066       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2067       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2068       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2069       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2070     }
2071     PetscFree(cols);
2072   } else {
2073     /* determine buffer space needed for message */
2074     nz = 0;
2075     for ( i=0; i<m; i++ ) {
2076       nz += locrowlens[i];
2077     }
2078     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2079     mycols = ibuf;
2080     /* receive message of column indices*/
2081     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2082     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2083     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2084   }
2085 
2086   /* loop over local rows, determining number of off diagonal entries */
2087   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2088   odlens = dlens + (rend-rstart);
2089   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2090   PetscMemzero(mask,3*Mbs*sizeof(int));
2091   masked1 = mask    + Mbs;
2092   masked2 = masked1 + Mbs;
2093   rowcount = 0; nzcount = 0;
2094   for ( i=0; i<mbs; i++ ) {
2095     dcount  = 0;
2096     odcount = 0;
2097     for ( j=0; j<bs; j++ ) {
2098       kmax = locrowlens[rowcount];
2099       for ( k=0; k<kmax; k++ ) {
2100         tmp = mycols[nzcount++]/bs;
2101         if (!mask[tmp]) {
2102           mask[tmp] = 1;
2103           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2104           else masked1[dcount++] = tmp;
2105         }
2106       }
2107       rowcount++;
2108     }
2109 
2110     dlens[i]  = dcount;
2111     odlens[i] = odcount;
2112 
2113     /* zero out the mask elements we set */
2114     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2115     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2116   }
2117 
2118   /* create our matrix */
2119   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2120          CHKERRQ(ierr);
2121   A = *newmat;
2122   MatSetOption(A,MAT_COLUMNS_SORTED);
2123 
2124   if (!rank) {
2125     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2126     /* read in my part of the matrix numerical values  */
2127     nz = procsnz[0];
2128     vals = buf;
2129     mycols = ibuf;
2130     if (size == 1)  nz -= extra_rows;
2131     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2132     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2133 
2134     /* insert into matrix */
2135     jj      = rstart*bs;
2136     for ( i=0; i<m; i++ ) {
2137       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2138       mycols += locrowlens[i];
2139       vals   += locrowlens[i];
2140       jj++;
2141     }
2142     /* read in other processors (except the last one) and ship out */
2143     for ( i=1; i<size-1; i++ ) {
2144       nz   = procsnz[i];
2145       vals = buf;
2146       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2147       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2148     }
2149     /* the last proc */
2150     if ( size != 1 ){
2151       nz   = procsnz[i] - extra_rows;
2152       vals = buf;
2153       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2154       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2155       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2156     }
2157     PetscFree(procsnz);
2158   } else {
2159     /* receive numeric values */
2160     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2161 
2162     /* receive message of values*/
2163     vals   = buf;
2164     mycols = ibuf;
2165     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2166     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2167     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2168 
2169     /* insert into matrix */
2170     jj      = rstart*bs;
2171     for ( i=0; i<m; i++ ) {
2172       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2173       mycols += locrowlens[i];
2174       vals   += locrowlens[i];
2175       jj++;
2176     }
2177   }
2178   PetscFree(locrowlens);
2179   PetscFree(buf);
2180   PetscFree(ibuf);
2181   PetscFree(rowners);
2182   PetscFree(dlens);
2183   PetscFree(mask);
2184   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2185   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2186   PetscFunctionReturn(0);
2187 }
2188 
2189 
2190