xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 0bdbc534f55137a4724bb01c5722bc8c0492b9b5)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.92 1997/12/06 00:08:36 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
18    storage of the matrix.  This is done in a non scable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
37 #define CHUNKSIZE  10
38 
39 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
40 { \
41  \
42     brow = row/bs;  \
43     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
44     rmax = aimax[brow]; nrow = ailen[brow]; \
45       bcol = col/bs; \
46       ridx = row % bs; cidx = col % bs; \
47       low = 0; high = nrow; \
48       while (high-low > 3) { \
49         t = (low+high)/2; \
50         if (rp[t] > bcol) high = t; \
51         else              low  = t; \
52       } \
53       for ( _i=low; _i<high; _i++ ) { \
54         if (rp[_i] > bcol) break; \
55         if (rp[_i] == bcol) { \
56           bap  = ap +  bs2*_i + bs*cidx + ridx; \
57           if (addv == ADD_VALUES) *bap += value;  \
58           else                    *bap  = value;  \
59           goto a_noinsert; \
60         } \
61       } \
62       if (a->nonew == 1) goto a_noinsert; \
63       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
64       if (nrow >= rmax) { \
65         /* there is no extra room in row, therefore enlarge */ \
66         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
67         Scalar *new_a; \
68  \
69         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
70  \
71         /* malloc new storage space */ \
72         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
73         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
74         new_j   = (int *) (new_a + bs2*new_nz); \
75         new_i   = new_j + new_nz; \
76  \
77         /* copy over old data into new slots */ \
78         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
79         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
80         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
81         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
82         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
83                                                            len*sizeof(int)); \
84         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
85         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
86         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
87                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
88         /* free up old matrix storage */ \
89         PetscFree(a->a);  \
90         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
91         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
92         a->singlemalloc = 1; \
93  \
94         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
95         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
96         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
97         a->maxnz += bs2*CHUNKSIZE; \
98         a->reallocs++; \
99         a->nz++; \
100       } \
101       N = nrow++ - 1;  \
102       /* shift up all the later entries in this row */ \
103       for ( ii=N; ii>=_i; ii-- ) { \
104         rp[ii+1] = rp[ii]; \
105         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
106       } \
107       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
108       rp[_i]                      = bcol;  \
109       ap[bs2*_i + bs*cidx + ridx] = value;  \
110       a_noinsert:; \
111     ailen[brow] = nrow; \
112 }
113 
114 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
115 { \
116  \
117     brow = row/bs;  \
118     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
119     rmax = bimax[brow]; nrow = bilen[brow]; \
120       bcol = col/bs; \
121       ridx = row % bs; cidx = col % bs; \
122       low = 0; high = nrow; \
123       while (high-low > 3) { \
124         t = (low+high)/2; \
125         if (rp[t] > bcol) high = t; \
126         else              low  = t; \
127       } \
128       for ( _i=low; _i<high; _i++ ) { \
129         if (rp[_i] > bcol) break; \
130         if (rp[_i] == bcol) { \
131           bap  = ap +  bs2*_i + bs*cidx + ridx; \
132           if (addv == ADD_VALUES) *bap += value;  \
133           else                    *bap  = value;  \
134           goto b_noinsert; \
135         } \
136       } \
137       if (b->nonew == 1) goto b_noinsert; \
138       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
139       if (nrow >= rmax) { \
140         /* there is no extra room in row, therefore enlarge */ \
141         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
142         Scalar *new_a; \
143  \
144         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
145  \
146         /* malloc new storage space */ \
147         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
148         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
149         new_j   = (int *) (new_a + bs2*new_nz); \
150         new_i   = new_j + new_nz; \
151  \
152         /* copy over old data into new slots */ \
153         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
154         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
155         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
156         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
157         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
158                                                            len*sizeof(int)); \
159         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
160         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
161         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
162                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
163         /* free up old matrix storage */ \
164         PetscFree(b->a);  \
165         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
166         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
167         b->singlemalloc = 1; \
168  \
169         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
170         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
171         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
172         b->maxnz += bs2*CHUNKSIZE; \
173         b->reallocs++; \
174         b->nz++; \
175       } \
176       N = nrow++ - 1;  \
177       /* shift up all the later entries in this row */ \
178       for ( ii=N; ii>=_i; ii-- ) { \
179         rp[ii+1] = rp[ii]; \
180         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
181       } \
182       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
183       rp[_i]                      = bcol;  \
184       ap[bs2*_i + bs*cidx + ridx] = value;  \
185       b_noinsert:; \
186     bilen[brow] = nrow; \
187 }
188 
189 #undef __FUNC__
190 #define __FUNC__ "MatSetValues_MPIBAIJ"
191 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
192 {
193   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
194   Scalar      value;
195   int         ierr,i,j,row,col;
196   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
197   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
198   int         cend_orig=baij->cend_bs,bs=baij->bs;
199 
200   /* Some Variables required in the macro */
201   Mat         A = baij->A;
202   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
203   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
204   Scalar      *aa=a->a;
205 
206   Mat         B = baij->B;
207   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
208   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
209   Scalar      *ba=b->a;
210 
211   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
212   int         low,high,t,ridx,cidx,bs2=a->bs2;
213   Scalar      *ap,*bap;
214 
215   PetscFunctionBegin;
216   for ( i=0; i<m; i++ ) {
217 #if defined(USE_PETSC_BOPT_g)
218     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
219     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
220 #endif
221     if (im[i] >= rstart_orig && im[i] < rend_orig) {
222       row = im[i] - rstart_orig;
223       for ( j=0; j<n; j++ ) {
224         if (in[j] >= cstart_orig && in[j] < cend_orig){
225           col = in[j] - cstart_orig;
226           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
227           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
228           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
229         }
230 #if defined(USE_PETSC_BOPT_g)
231         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
232         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
233 #endif
234         else {
235           if (mat->was_assembled) {
236             if (!baij->colmap) {
237               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
238             }
239             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
240             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
241               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
242               col =  in[j];
243               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
244               B = baij->B;
245               b = (Mat_SeqBAIJ *) (B)->data;
246               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
247               ba=b->a;
248             }
249           } else col = in[j];
250           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
251           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
252           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
253         }
254       }
255     } else {
256       if (roworiented && !baij->donotstash) {
257         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
258       } else {
259         if (!baij->donotstash) {
260           row = im[i];
261 	  for ( j=0; j<n; j++ ) {
262 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
263           }
264         }
265       }
266     }
267   }
268   PetscFunctionReturn(0);
269 }
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1)))
377 
378 #undef __FUNC__
379 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
380 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
381 {
382   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
383   int         ierr,i,j,row,col;
384   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
385   int         rend_orig=baij->rend_bs;
386 
387   int         h1,key,size=baij->ht_size,k,bs=baij->bs;
388   double      * HT  = baij->ht;
389   Scalar      ** HD = (Scalar **)(HT + size),value;
390 
391 
392   PetscFunctionBegin;
393 
394   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
395 
396   for ( i=0; i<m; i++ ) {
397 #if defined(USE_PETSC_BOPT_g)
398     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
399     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
400 #endif
401     if (im[i] >= rstart_orig && im[i] < rend_orig) {
402       row = im[i];
403       for ( j=0; j<n; j++ ) {
404         col = in[j];
405         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
406         /* Look up into the Hash Table */
407         key = (row/bs)*baij->Nbs+(col/bs)+1;
408         h1  = HASH1(size,key);
409 
410         for ( k=0; k<size; k++ ){
411           if (HT[(h1+k)%size] == key) {
412             if (addv == ADD_VALUES) *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs) += value;
413             else                    *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs)  = value;
414             break;
415           }
416         }
417         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
418       }
419     } else {
420       if (roworiented && !baij->donotstash) {
421         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
422       } else {
423         if (!baij->donotstash) {
424           row = im[i];
425 	  for ( j=0; j<n; j++ ) {
426 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
427           }
428         }
429       }
430     }
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNC__
436 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
437 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
438 {
439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
440   int         ierr,i,j,ii,jj,row,col,k,l;
441   int         roworiented = baij->roworiented,rstart=baij->rstart ;
442   int         rend=baij->rend,stepval,bs=baij->bs;
443 
444   int         h1,key,size=baij->ht_size;
445   double      * HT  = baij->ht;
446   Scalar      ** HD = (Scalar **)(HT + size),*value,*baij_a;
447 
448 
449   if (roworiented) {
450     stepval = (n-1)*bs;
451   } else {
452     stepval = (m-1)*bs;
453   }
454   for ( i=0; i<m; i++ ) {
455 #if defined(USE_PETSC_BOPT_g)
456     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
457     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
458 #endif
459     if (im[i] >= rstart && im[i] < rend) {
460       row = im[i];
461       for ( j=0; j<n; j++ ) {
462         col = in[j];
463 
464         /* Look up into the Hash Table */
465         key = row*baij->Nbs+col+1;
466         h1  = HASH1(size,key);
467 
468         for ( k=0; k<size; k++ ){
469           if (HT[(h1+k)%size] == key) {
470             baij_a = HD[(h1+k)%size];
471 
472             if (roworiented) {
473               value = v + i*(stepval+bs)*bs + j*bs;
474             } else {
475               value = v + j*(stepval+bs)*bs + i*bs;
476             }
477             for ( ii=0; ii<bs; ii++,value+=stepval ) {
478               for (jj=0; jj<bs; jj++ ) {
479                 if (addv == ADD_VALUES) *baij_a  += *value;
480                 else                    *baij_a   = *value;
481                 baij++; value++;
482               }
483             }
484             break;
485           }
486         }
487         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hastable Entry not found");
488       }
489     } else {
490       if (!baij->donotstash) {
491         if (roworiented ) {
492           row   = im[i]*bs;
493           value = v + i*(stepval+bs)*bs;
494           for ( j=0; j<bs; j++,row++ ) {
495             for ( k=0; k<n; k++ ) {
496               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
497                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
498               }
499             }
500           }
501         } else {
502           for ( j=0; j<n; j++ ) {
503             value = v + j*(stepval+bs)*bs + i*bs;
504             col   = in[j]*bs;
505             for ( k=0; k<bs; k++,col++,value+=stepval) {
506               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
507                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
508               }
509             }
510           }
511         }
512       }
513     }
514   }
515   PetscFunctionReturn(0);
516 }
517 #undef __FUNC__
518 #define __FUNC__ "MatGetValues_MPIBAIJ"
519 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
520 {
521   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
522   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
523   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
524 
525   for ( i=0; i<m; i++ ) {
526     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
527     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
528     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
529       row = idxm[i] - bsrstart;
530       for ( j=0; j<n; j++ ) {
531         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
532         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
533         if (idxn[j] >= bscstart && idxn[j] < bscend){
534           col = idxn[j] - bscstart;
535           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
536         } else {
537           if (!baij->colmap) {
538             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
539           }
540           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
541              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
542           else {
543             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
544             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
545           }
546         }
547       }
548     } else {
549       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
550     }
551   }
552   PetscFunctionReturn(0);
553 }
554 
555 #undef __FUNC__
556 #define __FUNC__ "MatNorm_MPIBAIJ"
557 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
558 {
559   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
560   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
561   int        ierr, i,bs2=baij->bs2;
562   double     sum = 0.0;
563   Scalar     *v;
564 
565   PetscFunctionBegin;
566   if (baij->size == 1) {
567     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
568   } else {
569     if (type == NORM_FROBENIUS) {
570       v = amat->a;
571       for (i=0; i<amat->nz*bs2; i++ ) {
572 #if defined(USE_PETSC_COMPLEX)
573         sum += real(conj(*v)*(*v)); v++;
574 #else
575         sum += (*v)*(*v); v++;
576 #endif
577       }
578       v = bmat->a;
579       for (i=0; i<bmat->nz*bs2; i++ ) {
580 #if defined(USE_PETSC_COMPLEX)
581         sum += real(conj(*v)*(*v)); v++;
582 #else
583         sum += (*v)*(*v); v++;
584 #endif
585       }
586       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
587       *norm = sqrt(*norm);
588     } else {
589       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
590     }
591   }
592   PetscFunctionReturn(0);
593 }
594 
595 #undef __FUNC__
596 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
597 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
598 {
599   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
600   MPI_Comm    comm = mat->comm;
601   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
602   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
603   MPI_Request *send_waits,*recv_waits;
604   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
605   InsertMode  addv;
606   Scalar      *rvalues,*svalues;
607 
608   PetscFunctionBegin;
609   /* make sure all processors are either in INSERTMODE or ADDMODE */
610   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
611   if (addv == (ADD_VALUES|INSERT_VALUES)) {
612     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
613   }
614   mat->insertmode = addv; /* in case this processor had no cache */
615 
616   /*  first count number of contributors to each processor */
617   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
618   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
619   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
620   for ( i=0; i<baij->stash.n; i++ ) {
621     idx = baij->stash.idx[i];
622     for ( j=0; j<size; j++ ) {
623       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
624         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
625       }
626     }
627   }
628   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
629 
630   /* inform other processors of number of messages and max length*/
631   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
632   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
633   nreceives = work[rank];
634   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
635   nmax      = work[rank];
636   PetscFree(work);
637 
638   /* post receives:
639        1) each message will consist of ordered pairs
640      (global index,value) we store the global index as a double
641      to simplify the message passing.
642        2) since we don't know how long each individual message is we
643      allocate the largest needed buffer for each receive. Potentially
644      this is a lot of wasted space.
645 
646 
647        This could be done better.
648   */
649   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
650   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
651   for ( i=0; i<nreceives; i++ ) {
652     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
653   }
654 
655   /* do sends:
656       1) starts[i] gives the starting index in svalues for stuff going to
657          the ith processor
658   */
659   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
660   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
661   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
662   starts[0] = 0;
663   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
664   for ( i=0; i<baij->stash.n; i++ ) {
665     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
666     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
667     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
668   }
669   PetscFree(owner);
670   starts[0] = 0;
671   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
672   count = 0;
673   for ( i=0; i<size; i++ ) {
674     if (procs[i]) {
675       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
676     }
677   }
678   PetscFree(starts); PetscFree(nprocs);
679 
680   /* Free cache space */
681   PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",rank,baij->stash.n);
682   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
683 
684   baij->svalues    = svalues;    baij->rvalues    = rvalues;
685   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
686   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
687   baij->rmax       = nmax;
688 
689   PetscFunctionReturn(0);
690 }
691 
692 int CreateHashTable(Mat mat,double factor)
693 {
694   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
695   Mat         A = baij->A, B=baij->B;
696   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
697   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
698   int         size,ct=0,max1=0,max2=0,bs2=baij->bs2,rstart=baij->rstart;
699   int         cstart=baij->cstart,*garray=baij->garray,row,col;
700   double      *HT,key;
701   extern int  PetscGlobalRank;
702   Scalar      **HD;
703   /*
704      Scalar      *aa=a->a,*ba=b->a;
705      static double *HT;
706      static      int flag=1;
707      */
708   PetscFunctionBegin;
709   baij->ht_size=(int)(factor*nz);
710   size = baij->ht_size;
711   /* Allocate Memory for Hash Table */
712   if (baij->ht) {
713     PetscFunctionReturn(0);
714   }
715 
716   baij->ht = (double*)PetscMalloc(size*(sizeof(double)+sizeof(Scalar*))+1); CHKPTRQ(baij->ht);
717   HT = baij->ht;
718   HD = (Scalar**)(HT + size);
719   PetscMemzero(HT,size*sizeof(double)+sizeof(Scalar*));
720 
721 
722   /* Loop Over A */
723   for ( i=0; i<a->mbs; i++ ) {
724     for ( j=ai[i]; j<ai[i+1]; j++ ) {
725       row = i+rstart;
726       col = aj[j]+cstart;
727 
728       key = row*baij->Nbs + col + 1;
729       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
730       h1  = HASH1(size,key);
731 
732       for ( k=0; k<size; k++ ){
733         if (HT[(h1+k)%size] == 0.0) {
734           HT[(h1+k)%size] = key;
735           HD[(h1+k)%size] = a->a + j*bs2;
736           break;
737         } else ct++;
738       }
739       if (k> max1) max1 = k;
740     }
741   }
742   /* Loop Over B */
743   for ( i=0; i<b->mbs; i++ ) {
744     for ( j=bi[i]; j<bi[i+1]; j++ ) {
745       row = i+rstart;
746       col = garray[bj[j]];
747       key = row*baij->Nbs + col + 1;
748       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
749       h1  = HASH1(size,key);
750       for ( k=0; k<size; k++ ){
751         if (HT[(h1+k)%size] == 0.0) {
752           HT[(h1+k)%size] = key;
753           HD[(h1+k)%size] = b->a + j*bs2;
754           break;
755         } else ct++;
756       }
757       if (k> max2) max2 = k;
758     }
759   }
760 
761   /* Print Summary */
762   for ( i=0,key=0.0,j=0; i<size; i++)
763     if (HT[i]) {j++;}
764 
765   /*printf("[%d] fact = %3.2f max1 = %5d max2 = %5d Size %5d - Searches %5d Avg %5.2f Keys %5d\n",
766          PetscGlobalRank,factor,max1,max2,size,ct+j,((float)ct+j),j); */
767   fflush(stdout);
768   PetscFunctionReturn(0);
769 }
770 
771 #undef __FUNC__
772 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
773 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
774 {
775   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
776   MPI_Status  *send_status,recv_status;
777   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
778   int         bs=baij->bs,row,col,other_disassembled,flg;
779   Scalar      *values,val;
780   InsertMode  addv = mat->insertmode;
781 
782   PetscFunctionBegin;
783   /*  wait on receives */
784   while (count) {
785     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
786     /* unpack receives into our local space */
787     values = baij->rvalues + 3*imdex*baij->rmax;
788     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
789     n = n/3;
790     for ( i=0; i<n; i++ ) {
791       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
792       col = (int) PetscReal(values[3*i+1]);
793       val = values[3*i+2];
794       if (col >= baij->cstart*bs && col < baij->cend*bs) {
795         col -= baij->cstart*bs;
796         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
797       } else {
798         if (mat->was_assembled) {
799           if (!baij->colmap) {
800             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
801           }
802           col = (baij->colmap[col/bs]) - 1 + col%bs;
803           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
804             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
805             col = (int) PetscReal(values[3*i+1]);
806           }
807         }
808         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
809       }
810     }
811     count--;
812   }
813   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
814 
815   /* wait on sends */
816   if (baij->nsends) {
817     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
818     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
819     PetscFree(send_status);
820   }
821   PetscFree(baij->send_waits); PetscFree(baij->svalues);
822 
823   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
824   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
825 
826   /* determine if any processor has disassembled, if so we must
827      also disassemble ourselfs, in order that we may reassemble. */
828   ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
829   if (mat->was_assembled && !other_disassembled) {
830     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
831   }
832 
833   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
834     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
835   }
836   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
837   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
838 
839   ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
840   if (flg && !baij->ht) {
841     double fact = 1.39;
842     CreateHashTable(mat,fact);
843     mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
844     mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
845   }
846   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
847   PetscFunctionReturn(0);
848 }
849 
850 #undef __FUNC__
851 #define __FUNC__ "MatView_MPIBAIJ_Binary"
852 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
853 {
854   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
855   int          ierr;
856 
857   PetscFunctionBegin;
858   if (baij->size == 1) {
859     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
860   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
861   PetscFunctionReturn(0);
862 }
863 
864 #undef __FUNC__
865 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
866 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
867 {
868   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
869   int          ierr, format,rank,bs = baij->bs;
870   FILE         *fd;
871   ViewerType   vtype;
872 
873   PetscFunctionBegin;
874   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
875   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
876     ierr = ViewerGetFormat(viewer,&format);
877     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
878       MatInfo info;
879       MPI_Comm_rank(mat->comm,&rank);
880       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
881       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
882       PetscSequentialPhaseBegin(mat->comm,1);
883       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
884               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
885               baij->bs,(int)info.memory);
886       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
887       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
888       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
889       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
890       fflush(fd);
891       PetscSequentialPhaseEnd(mat->comm,1);
892       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
893       PetscFunctionReturn(0);
894     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
895       PetscPrintf(mat->comm,"  block size is %d\n",bs);
896       PetscFunctionReturn(0);
897     }
898   }
899 
900   if (vtype == DRAW_VIEWER) {
901     Draw       draw;
902     PetscTruth isnull;
903     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
904     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
905   }
906 
907   if (vtype == ASCII_FILE_VIEWER) {
908     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
909     PetscSequentialPhaseBegin(mat->comm,1);
910     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
911            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
912             baij->cstart*bs,baij->cend*bs);
913     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
914     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
915     fflush(fd);
916     PetscSequentialPhaseEnd(mat->comm,1);
917   } else {
918     int size = baij->size;
919     rank = baij->rank;
920     if (size == 1) {
921       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
922     } else {
923       /* assemble the entire matrix onto first processor. */
924       Mat         A;
925       Mat_SeqBAIJ *Aloc;
926       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
927       int         mbs=baij->mbs;
928       Scalar      *a;
929 
930       if (!rank) {
931         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
932       } else {
933         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
934       }
935       PLogObjectParent(mat,A);
936 
937       /* copy over the A part */
938       Aloc = (Mat_SeqBAIJ*) baij->A->data;
939       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
940       row = baij->rstart;
941       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
942 
943       for ( i=0; i<mbs; i++ ) {
944         rvals[0] = bs*(baij->rstart + i);
945         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
946         for ( j=ai[i]; j<ai[i+1]; j++ ) {
947           col = (baij->cstart+aj[j])*bs;
948           for (k=0; k<bs; k++ ) {
949             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
950             col++; a += bs;
951           }
952         }
953       }
954       /* copy over the B part */
955       Aloc = (Mat_SeqBAIJ*) baij->B->data;
956       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
957       row = baij->rstart*bs;
958       for ( i=0; i<mbs; i++ ) {
959         rvals[0] = bs*(baij->rstart + i);
960         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
961         for ( j=ai[i]; j<ai[i+1]; j++ ) {
962           col = baij->garray[aj[j]]*bs;
963           for (k=0; k<bs; k++ ) {
964             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
965             col++; a += bs;
966           }
967         }
968       }
969       PetscFree(rvals);
970       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
971       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
972       /*
973          Everyone has to call to draw the matrix since the graphics waits are
974          synchronized across all processors that share the Draw object
975       */
976       if (!rank || vtype == DRAW_VIEWER) {
977         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
978       }
979       ierr = MatDestroy(A); CHKERRQ(ierr);
980     }
981   }
982   PetscFunctionReturn(0);
983 }
984 
985 
986 
987 #undef __FUNC__
988 #define __FUNC__ "MatView_MPIBAIJ"
989 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
990 {
991   Mat         mat = (Mat) obj;
992   int         ierr;
993   ViewerType  vtype;
994 
995   PetscFunctionBegin;
996   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
997   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
998       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
999     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1000   } else if (vtype == BINARY_FILE_VIEWER) {
1001     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1002   }
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 #undef __FUNC__
1007 #define __FUNC__ "MatDestroy_MPIBAIJ"
1008 int MatDestroy_MPIBAIJ(PetscObject obj)
1009 {
1010   Mat         mat = (Mat) obj;
1011   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1012   int         ierr;
1013 
1014   PetscFunctionBegin;
1015 #if defined(USE_PETSC_LOG)
1016   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
1017 #endif
1018 
1019   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1020   PetscFree(baij->rowners);
1021   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1022   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1023   if (baij->colmap) PetscFree(baij->colmap);
1024   if (baij->garray) PetscFree(baij->garray);
1025   if (baij->lvec)   VecDestroy(baij->lvec);
1026   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1027   if (baij->rowvalues) PetscFree(baij->rowvalues);
1028   if (baij->barray) PetscFree(baij->barray);
1029   if (baij->ht) PetscFree(baij->ht);
1030   PetscFree(baij);
1031   PLogObjectDestroy(mat);
1032   PetscHeaderDestroy(mat);
1033   PetscFunctionReturn(0);
1034 }
1035 
1036 #undef __FUNC__
1037 #define __FUNC__ "MatMult_MPIBAIJ"
1038 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1039 {
1040   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1041   int         ierr, nt;
1042 
1043   PetscFunctionBegin;
1044   VecGetLocalSize_Fast(xx,nt);
1045   if (nt != a->n) {
1046     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1047   }
1048   VecGetLocalSize_Fast(yy,nt);
1049   if (nt != a->m) {
1050     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1051   }
1052   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1053   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1054   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1055   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1056   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1057   PetscFunctionReturn(0);
1058 }
1059 
1060 #undef __FUNC__
1061 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1062 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1063 {
1064   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1065   int        ierr;
1066 
1067   PetscFunctionBegin;
1068   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1069   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1070   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1071   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1072   PetscFunctionReturn(0);
1073 }
1074 
1075 #undef __FUNC__
1076 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1077 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1078 {
1079   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1080   int         ierr;
1081 
1082   PetscFunctionBegin;
1083   /* do nondiagonal part */
1084   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1085   /* send it on its way */
1086   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1087   /* do local part */
1088   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1089   /* receive remote parts: note this assumes the values are not actually */
1090   /* inserted in yy until the next line, which is true for my implementation*/
1091   /* but is not perhaps always true. */
1092   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 #undef __FUNC__
1097 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1098 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1099 {
1100   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1101   int         ierr;
1102 
1103   PetscFunctionBegin;
1104   /* do nondiagonal part */
1105   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1106   /* send it on its way */
1107   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1108   /* do local part */
1109   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1110   /* receive remote parts: note this assumes the values are not actually */
1111   /* inserted in yy until the next line, which is true for my implementation*/
1112   /* but is not perhaps always true. */
1113   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1114   PetscFunctionReturn(0);
1115 }
1116 
1117 /*
1118   This only works correctly for square matrices where the subblock A->A is the
1119    diagonal block
1120 */
1121 #undef __FUNC__
1122 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1123 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1124 {
1125   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1126   int         ierr;
1127 
1128   PetscFunctionBegin;
1129   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1130   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 #undef __FUNC__
1135 #define __FUNC__ "MatScale_MPIBAIJ"
1136 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1137 {
1138   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1139   int         ierr;
1140 
1141   PetscFunctionBegin;
1142   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1143   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1144   PetscFunctionReturn(0);
1145 }
1146 
1147 #undef __FUNC__
1148 #define __FUNC__ "MatGetSize_MPIBAIJ"
1149 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1150 {
1151   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1152 
1153   PetscFunctionBegin;
1154   if (m) *m = mat->M;
1155   if (n) *n = mat->N;
1156   PetscFunctionReturn(0);
1157 }
1158 
1159 #undef __FUNC__
1160 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1161 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1162 {
1163   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1164 
1165   PetscFunctionBegin;
1166   *m = mat->m; *n = mat->N;
1167   PetscFunctionReturn(0);
1168 }
1169 
1170 #undef __FUNC__
1171 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1172 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1173 {
1174   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1175 
1176   PetscFunctionBegin;
1177   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1182 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1183 
1184 #undef __FUNC__
1185 #define __FUNC__ "MatGetRow_MPIBAIJ"
1186 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1187 {
1188   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1189   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1190   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1191   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1192   int        *cmap, *idx_p,cstart = mat->cstart;
1193 
1194   PetscFunctionBegin;
1195   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1196   mat->getrowactive = PETSC_TRUE;
1197 
1198   if (!mat->rowvalues && (idx || v)) {
1199     /*
1200         allocate enough space to hold information from the longest row.
1201     */
1202     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1203     int     max = 1,mbs = mat->mbs,tmp;
1204     for ( i=0; i<mbs; i++ ) {
1205       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1206       if (max < tmp) { max = tmp; }
1207     }
1208     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1209     CHKPTRQ(mat->rowvalues);
1210     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1211   }
1212 
1213   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
1214   lrow = row - brstart;
1215 
1216   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1217   if (!v)   {pvA = 0; pvB = 0;}
1218   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1219   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1220   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1221   nztot = nzA + nzB;
1222 
1223   cmap  = mat->garray;
1224   if (v  || idx) {
1225     if (nztot) {
1226       /* Sort by increasing column numbers, assuming A and B already sorted */
1227       int imark = -1;
1228       if (v) {
1229         *v = v_p = mat->rowvalues;
1230         for ( i=0; i<nzB; i++ ) {
1231           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1232           else break;
1233         }
1234         imark = i;
1235         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1236         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1237       }
1238       if (idx) {
1239         *idx = idx_p = mat->rowindices;
1240         if (imark > -1) {
1241           for ( i=0; i<imark; i++ ) {
1242             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1243           }
1244         } else {
1245           for ( i=0; i<nzB; i++ ) {
1246             if (cmap[cworkB[i]/bs] < cstart)
1247               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1248             else break;
1249           }
1250           imark = i;
1251         }
1252         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1253         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1254       }
1255     } else {
1256       if (idx) *idx = 0;
1257       if (v)   *v   = 0;
1258     }
1259   }
1260   *nz = nztot;
1261   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1262   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1263   PetscFunctionReturn(0);
1264 }
1265 
1266 #undef __FUNC__
1267 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1268 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1269 {
1270   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1271 
1272   PetscFunctionBegin;
1273   if (baij->getrowactive == PETSC_FALSE) {
1274     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1275   }
1276   baij->getrowactive = PETSC_FALSE;
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNC__
1281 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1282 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1283 {
1284   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1285 
1286   PetscFunctionBegin;
1287   *bs = baij->bs;
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 #undef __FUNC__
1292 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1293 int MatZeroEntries_MPIBAIJ(Mat A)
1294 {
1295   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1296   int         ierr;
1297 
1298   PetscFunctionBegin;
1299   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1300   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNC__
1305 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1306 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1307 {
1308   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1309   Mat         A = a->A, B = a->B;
1310   int         ierr;
1311   double      isend[5], irecv[5];
1312 
1313   PetscFunctionBegin;
1314   info->block_size     = (double)a->bs;
1315   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1316   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1317   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1318   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1319   if (flag == MAT_LOCAL) {
1320     info->nz_used      = isend[0];
1321     info->nz_allocated = isend[1];
1322     info->nz_unneeded  = isend[2];
1323     info->memory       = isend[3];
1324     info->mallocs      = isend[4];
1325   } else if (flag == MAT_GLOBAL_MAX) {
1326     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1327     info->nz_used      = irecv[0];
1328     info->nz_allocated = irecv[1];
1329     info->nz_unneeded  = irecv[2];
1330     info->memory       = irecv[3];
1331     info->mallocs      = irecv[4];
1332   } else if (flag == MAT_GLOBAL_SUM) {
1333     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1334     info->nz_used      = irecv[0];
1335     info->nz_allocated = irecv[1];
1336     info->nz_unneeded  = irecv[2];
1337     info->memory       = irecv[3];
1338     info->mallocs      = irecv[4];
1339   }
1340   info->rows_global       = (double)a->M;
1341   info->columns_global    = (double)a->N;
1342   info->rows_local        = (double)a->m;
1343   info->columns_local     = (double)a->N;
1344   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1345   info->fill_ratio_needed = 0;
1346   info->factor_mallocs    = 0;
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 #undef __FUNC__
1351 #define __FUNC__ "MatSetOption_MPIBAIJ"
1352 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1353 {
1354   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1355 
1356   PetscFunctionBegin;
1357   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1358       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1359       op == MAT_COLUMNS_UNSORTED ||
1360       op == MAT_COLUMNS_SORTED ||
1361       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1362       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1363         MatSetOption(a->A,op);
1364         MatSetOption(a->B,op);
1365   } else if (op == MAT_ROW_ORIENTED) {
1366         a->roworiented = 1;
1367         MatSetOption(a->A,op);
1368         MatSetOption(a->B,op);
1369   } else if (op == MAT_ROWS_SORTED ||
1370              op == MAT_ROWS_UNSORTED ||
1371              op == MAT_SYMMETRIC ||
1372              op == MAT_STRUCTURALLY_SYMMETRIC ||
1373              op == MAT_YES_NEW_DIAGONALS)
1374     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1375   else if (op == MAT_COLUMN_ORIENTED) {
1376     a->roworiented = 0;
1377     MatSetOption(a->A,op);
1378     MatSetOption(a->B,op);
1379   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1380     a->donotstash = 1;
1381   } else if (op == MAT_NO_NEW_DIAGONALS) {
1382     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1383   } else {
1384     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1385   }
1386   PetscFunctionReturn(0);
1387 }
1388 
1389 #undef __FUNC__
1390 #define __FUNC__ "MatTranspose_MPIBAIJ("
1391 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1392 {
1393   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1394   Mat_SeqBAIJ *Aloc;
1395   Mat        B;
1396   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1397   int        bs=baij->bs,mbs=baij->mbs;
1398   Scalar     *a;
1399 
1400   PetscFunctionBegin;
1401   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1402   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1403   CHKERRQ(ierr);
1404 
1405   /* copy over the A part */
1406   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1407   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1408   row = baij->rstart;
1409   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1410 
1411   for ( i=0; i<mbs; i++ ) {
1412     rvals[0] = bs*(baij->rstart + i);
1413     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1414     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1415       col = (baij->cstart+aj[j])*bs;
1416       for (k=0; k<bs; k++ ) {
1417         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1418         col++; a += bs;
1419       }
1420     }
1421   }
1422   /* copy over the B part */
1423   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1424   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1425   row = baij->rstart*bs;
1426   for ( i=0; i<mbs; i++ ) {
1427     rvals[0] = bs*(baij->rstart + i);
1428     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1429     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1430       col = baij->garray[aj[j]]*bs;
1431       for (k=0; k<bs; k++ ) {
1432         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1433         col++; a += bs;
1434       }
1435     }
1436   }
1437   PetscFree(rvals);
1438   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1439   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1440 
1441   if (matout != PETSC_NULL) {
1442     *matout = B;
1443   } else {
1444     /* This isn't really an in-place transpose .... but free data structures from baij */
1445     PetscFree(baij->rowners);
1446     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1447     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1448     if (baij->colmap) PetscFree(baij->colmap);
1449     if (baij->garray) PetscFree(baij->garray);
1450     if (baij->lvec) VecDestroy(baij->lvec);
1451     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1452     PetscFree(baij);
1453     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1454     PetscHeaderDestroy(B);
1455   }
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 #undef __FUNC__
1460 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1461 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1462 {
1463   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1464   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1465   int ierr,s1,s2,s3;
1466 
1467   PetscFunctionBegin;
1468   if (ll)  {
1469     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1470     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1471     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1472     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1473     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1474   }
1475   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 #undef __FUNC__
1480 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1481 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1482 {
1483   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1484   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1485   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1486   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1487   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1488   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1489   MPI_Comm       comm = A->comm;
1490   MPI_Request    *send_waits,*recv_waits;
1491   MPI_Status     recv_status,*send_status;
1492   IS             istmp;
1493 
1494   PetscFunctionBegin;
1495   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1496   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1497 
1498   /*  first count number of contributors to each processor */
1499   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1500   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1501   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1502   for ( i=0; i<N; i++ ) {
1503     idx = rows[i];
1504     found = 0;
1505     for ( j=0; j<size; j++ ) {
1506       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1507         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1508       }
1509     }
1510     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1511   }
1512   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1513 
1514   /* inform other processors of number of messages and max length*/
1515   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1516   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1517   nrecvs = work[rank];
1518   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1519   nmax   = work[rank];
1520   PetscFree(work);
1521 
1522   /* post receives:   */
1523   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1524   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1525   for ( i=0; i<nrecvs; i++ ) {
1526     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1527   }
1528 
1529   /* do sends:
1530      1) starts[i] gives the starting index in svalues for stuff going to
1531      the ith processor
1532   */
1533   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1534   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1535   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1536   starts[0] = 0;
1537   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1538   for ( i=0; i<N; i++ ) {
1539     svalues[starts[owner[i]]++] = rows[i];
1540   }
1541   ISRestoreIndices(is,&rows);
1542 
1543   starts[0] = 0;
1544   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1545   count = 0;
1546   for ( i=0; i<size; i++ ) {
1547     if (procs[i]) {
1548       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1549     }
1550   }
1551   PetscFree(starts);
1552 
1553   base = owners[rank]*bs;
1554 
1555   /*  wait on receives */
1556   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1557   source = lens + nrecvs;
1558   count  = nrecvs; slen = 0;
1559   while (count) {
1560     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1561     /* unpack receives into our local space */
1562     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1563     source[imdex]  = recv_status.MPI_SOURCE;
1564     lens[imdex]  = n;
1565     slen += n;
1566     count--;
1567   }
1568   PetscFree(recv_waits);
1569 
1570   /* move the data into the send scatter */
1571   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1572   count = 0;
1573   for ( i=0; i<nrecvs; i++ ) {
1574     values = rvalues + i*nmax;
1575     for ( j=0; j<lens[i]; j++ ) {
1576       lrows[count++] = values[j] - base;
1577     }
1578   }
1579   PetscFree(rvalues); PetscFree(lens);
1580   PetscFree(owner); PetscFree(nprocs);
1581 
1582   /* actually zap the local rows */
1583   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1584   PLogObjectParent(A,istmp);
1585 
1586   ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1587   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1588   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1589 
1590   if (diag) {
1591     for ( i = 0; i < slen; i++ ) {
1592       row = lrows[i] + rstart_bs;
1593       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1594     }
1595     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1596     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1597   }
1598   PetscFree(lrows);
1599 
1600   /* wait on sends */
1601   if (nsends) {
1602     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1603     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1604     PetscFree(send_status);
1605   }
1606   PetscFree(send_waits); PetscFree(svalues);
1607 
1608   PetscFunctionReturn(0);
1609 }
1610 extern int MatPrintHelp_SeqBAIJ(Mat);
1611 #undef __FUNC__
1612 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1613 int MatPrintHelp_MPIBAIJ(Mat A)
1614 {
1615   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1616   int         ierr;
1617 
1618   PetscFunctionBegin;
1619   if (!a->rank) {
1620     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1621   }
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 #undef __FUNC__
1626 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1627 int MatSetUnfactored_MPIBAIJ(Mat A)
1628 {
1629   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1630   int         ierr;
1631 
1632   PetscFunctionBegin;
1633   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1638 
1639 /* -------------------------------------------------------------------*/
1640 static struct _MatOps MatOps = {
1641   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1642   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1643   0,0,0,0,
1644   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1645   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1646   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1647   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1648   0,0,0,MatGetSize_MPIBAIJ,
1649   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1650   0,0,MatConvertSameType_MPIBAIJ,0,0,
1651   0,0,0,MatGetSubMatrices_MPIBAIJ,
1652   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1653   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1654   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1655 
1656 
1657 #undef __FUNC__
1658 #define __FUNC__ "MatCreateMPIBAIJ"
1659 /*@C
1660    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1661    (block compressed row).  For good matrix assembly performance
1662    the user should preallocate the matrix storage by setting the parameters
1663    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1664    performance can be increased by more than a factor of 50.
1665 
1666    Input Parameters:
1667 .  comm - MPI communicator
1668 .  bs   - size of blockk
1669 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1670            This value should be the same as the local size used in creating the
1671            y vector for the matrix-vector product y = Ax.
1672 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1673            This value should be the same as the local size used in creating the
1674            x vector for the matrix-vector product y = Ax.
1675 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1676 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1677 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1678            submatrix  (same for all local rows)
1679 .  d_nzz - array containing the number of block nonzeros in the various block rows
1680            of the in diagonal portion of the local (possibly different for each block
1681            row) or PETSC_NULL.  You must leave room for the diagonal entry even if
1682            it is zero.
1683 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1684            submatrix (same for all local rows).
1685 .  o_nzz - array containing the number of nonzeros in the various block rows of the
1686            off-diagonal portion of the local submatrix (possibly different for
1687            each block row) or PETSC_NULL.
1688 
1689    Output Parameter:
1690 .  A - the matrix
1691 
1692    Notes:
1693    The user MUST specify either the local or global matrix dimensions
1694    (possibly both).
1695 
1696    Storage Information:
1697    For a square global matrix we define each processor's diagonal portion
1698    to be its local rows and the corresponding columns (a square submatrix);
1699    each processor's off-diagonal portion encompasses the remainder of the
1700    local matrix (a rectangular submatrix).
1701 
1702    The user can specify preallocated storage for the diagonal part of
1703    the local submatrix with either d_nz or d_nnz (not both).  Set
1704    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1705    memory allocation.  Likewise, specify preallocated storage for the
1706    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1707 
1708    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1709    the figure below we depict these three local rows and all columns (0-11).
1710 
1711 $          0 1 2 3 4 5 6 7 8 9 10 11
1712 $         -------------------
1713 $  row 3  |  o o o d d d o o o o o o
1714 $  row 4  |  o o o d d d o o o o o o
1715 $  row 5  |  o o o d d d o o o o o o
1716 $         -------------------
1717 $
1718 
1719    Thus, any entries in the d locations are stored in the d (diagonal)
1720    submatrix, and any entries in the o locations are stored in the
1721    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1722    stored simply in the MATSEQBAIJ format for compressed row storage.
1723 
1724    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1725    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1726    In general, for PDE problems in which most nonzeros are near the diagonal,
1727    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1728    or you will get TERRIBLE performance; see the users' manual chapter on
1729    matrices.
1730 
1731 .keywords: matrix, block, aij, compressed row, sparse, parallel
1732 
1733 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1734 @*/
1735 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1736                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1737 {
1738   Mat          B;
1739   Mat_MPIBAIJ  *b;
1740   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1741 
1742   PetscFunctionBegin;
1743   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
1744 
1745   MPI_Comm_size(comm,&size);
1746   if (size == 1) {
1747     if (M == PETSC_DECIDE) M = m;
1748     if (N == PETSC_DECIDE) N = n;
1749     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1750     PetscFunctionReturn(0);
1751   }
1752 
1753   *A = 0;
1754   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
1755   PLogObjectCreate(B);
1756   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1757   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1758   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1759 
1760   B->destroy    = MatDestroy_MPIBAIJ;
1761   B->view       = MatView_MPIBAIJ;
1762   B->mapping    = 0;
1763   B->factor     = 0;
1764   B->assembled  = PETSC_FALSE;
1765 
1766   B->insertmode = NOT_SET_VALUES;
1767   MPI_Comm_rank(comm,&b->rank);
1768   MPI_Comm_size(comm,&b->size);
1769 
1770   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1771     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1772   }
1773   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
1774     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
1775   }
1776   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
1777     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
1778   }
1779   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
1780   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1781 
1782   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1783     work[0] = m; work[1] = n;
1784     mbs = m/bs; nbs = n/bs;
1785     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1786     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1787     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1788   }
1789   if (m == PETSC_DECIDE) {
1790     Mbs = M/bs;
1791     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
1792     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1793     m   = mbs*bs;
1794   }
1795   if (n == PETSC_DECIDE) {
1796     Nbs = N/bs;
1797     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
1798     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1799     n   = nbs*bs;
1800   }
1801   if (mbs*bs != m || nbs*bs != n) {
1802     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
1803   }
1804 
1805   b->m = m; B->m = m;
1806   b->n = n; B->n = n;
1807   b->N = N; B->N = N;
1808   b->M = M; B->M = M;
1809   b->bs  = bs;
1810   b->bs2 = bs*bs;
1811   b->mbs = mbs;
1812   b->nbs = nbs;
1813   b->Mbs = Mbs;
1814   b->Nbs = Nbs;
1815 
1816   /* build local table of row and column ownerships */
1817   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1818   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1819   b->cowners = b->rowners + b->size + 2;
1820   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1821   b->rowners[0] = 0;
1822   for ( i=2; i<=b->size; i++ ) {
1823     b->rowners[i] += b->rowners[i-1];
1824   }
1825   b->rstart    = b->rowners[b->rank];
1826   b->rend      = b->rowners[b->rank+1];
1827   b->rstart_bs = b->rstart * bs;
1828   b->rend_bs   = b->rend * bs;
1829 
1830   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1831   b->cowners[0] = 0;
1832   for ( i=2; i<=b->size; i++ ) {
1833     b->cowners[i] += b->cowners[i-1];
1834   }
1835   b->cstart    = b->cowners[b->rank];
1836   b->cend      = b->cowners[b->rank+1];
1837   b->cstart_bs = b->cstart * bs;
1838   b->cend_bs   = b->cend * bs;
1839 
1840 
1841   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1842   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1843   PLogObjectParent(B,b->A);
1844   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1845   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1846   PLogObjectParent(B,b->B);
1847 
1848   /* build cache for off array entries formed */
1849   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1850   b->donotstash  = 0;
1851   b->colmap      = 0;
1852   b->garray      = 0;
1853   b->roworiented = 1;
1854 
1855   /* stuff used in block assembly */
1856   b->barray       = 0;
1857 
1858   /* stuff used for matrix vector multiply */
1859   b->lvec         = 0;
1860   b->Mvctx        = 0;
1861 
1862   /* stuff for MatGetRow() */
1863   b->rowindices   = 0;
1864   b->rowvalues    = 0;
1865   b->getrowactive = PETSC_FALSE;
1866 
1867   /* hash table stuff */
1868   b->ht          = 0;
1869   b->ht_size     = 0;
1870 
1871   *A = B;
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 #undef __FUNC__
1876 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1877 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1878 {
1879   Mat         mat;
1880   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1881   int         ierr, len=0, flg;
1882 
1883   PetscFunctionBegin;
1884   *newmat       = 0;
1885   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1886   PLogObjectCreate(mat);
1887   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1888   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1889   mat->destroy    = MatDestroy_MPIBAIJ;
1890   mat->view       = MatView_MPIBAIJ;
1891   mat->factor     = matin->factor;
1892   mat->assembled  = PETSC_TRUE;
1893 
1894   a->m = mat->m   = oldmat->m;
1895   a->n = mat->n   = oldmat->n;
1896   a->M = mat->M   = oldmat->M;
1897   a->N = mat->N   = oldmat->N;
1898 
1899   a->bs  = oldmat->bs;
1900   a->bs2 = oldmat->bs2;
1901   a->mbs = oldmat->mbs;
1902   a->nbs = oldmat->nbs;
1903   a->Mbs = oldmat->Mbs;
1904   a->Nbs = oldmat->Nbs;
1905 
1906   a->rstart       = oldmat->rstart;
1907   a->rend         = oldmat->rend;
1908   a->cstart       = oldmat->cstart;
1909   a->cend         = oldmat->cend;
1910   a->size         = oldmat->size;
1911   a->rank         = oldmat->rank;
1912   mat->insertmode = NOT_SET_VALUES;
1913   a->rowvalues    = 0;
1914   a->getrowactive = PETSC_FALSE;
1915   a->barray       = 0;
1916 
1917   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1918   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1919   a->cowners = a->rowners + a->size + 2;
1920   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1921   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1922   if (oldmat->colmap) {
1923     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1924     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1925     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1926   } else a->colmap = 0;
1927   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1928     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1929     PLogObjectMemory(mat,len*sizeof(int));
1930     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1931   } else a->garray = 0;
1932 
1933   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1934   PLogObjectParent(mat,a->lvec);
1935   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1936   PLogObjectParent(mat,a->Mvctx);
1937   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1938   PLogObjectParent(mat,a->A);
1939   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1940   PLogObjectParent(mat,a->B);
1941   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1942   if (flg) {
1943     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1944   }
1945   *newmat = mat;
1946   PetscFunctionReturn(0);
1947 }
1948 
1949 #include "sys.h"
1950 
1951 #undef __FUNC__
1952 #define __FUNC__ "MatLoad_MPIBAIJ"
1953 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1954 {
1955   Mat          A;
1956   int          i, nz, ierr, j,rstart, rend, fd;
1957   Scalar       *vals,*buf;
1958   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1959   MPI_Status   status;
1960   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1961   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1962   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1963   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1964   int          dcount,kmax,k,nzcount,tmp;
1965 
1966   PetscFunctionBegin;
1967   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1968   bs2  = bs*bs;
1969 
1970   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1971   if (!rank) {
1972     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1973     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
1974     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
1975     if (header[3] < 0) {
1976       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
1977     }
1978   }
1979 
1980   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
1981   M = header[1]; N = header[2];
1982 
1983   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
1984 
1985   /*
1986      This code adds extra rows to make sure the number of rows is
1987      divisible by the blocksize
1988   */
1989   Mbs        = M/bs;
1990   extra_rows = bs - M + bs*(Mbs);
1991   if (extra_rows == bs) extra_rows = 0;
1992   else                  Mbs++;
1993   if (extra_rows &&!rank) {
1994     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
1995   }
1996 
1997   /* determine ownership of all rows */
1998   mbs = Mbs/size + ((Mbs % size) > rank);
1999   m   = mbs * bs;
2000   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2001   browners = rowners + size + 1;
2002   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2003   rowners[0] = 0;
2004   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2005   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2006   rstart = rowners[rank];
2007   rend   = rowners[rank+1];
2008 
2009   /* distribute row lengths to all processors */
2010   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2011   if (!rank) {
2012     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2013     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2014     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2015     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2016     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2017     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2018     PetscFree(sndcounts);
2019   } else {
2020     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2021   }
2022 
2023   if (!rank) {
2024     /* calculate the number of nonzeros on each processor */
2025     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2026     PetscMemzero(procsnz,size*sizeof(int));
2027     for ( i=0; i<size; i++ ) {
2028       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2029         procsnz[i] += rowlengths[j];
2030       }
2031     }
2032     PetscFree(rowlengths);
2033 
2034     /* determine max buffer needed and allocate it */
2035     maxnz = 0;
2036     for ( i=0; i<size; i++ ) {
2037       maxnz = PetscMax(maxnz,procsnz[i]);
2038     }
2039     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2040 
2041     /* read in my part of the matrix column indices  */
2042     nz = procsnz[0];
2043     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2044     mycols = ibuf;
2045     if (size == 1)  nz -= extra_rows;
2046     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2047     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2048 
2049     /* read in every ones (except the last) and ship off */
2050     for ( i=1; i<size-1; i++ ) {
2051       nz   = procsnz[i];
2052       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2053       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2054     }
2055     /* read in the stuff for the last proc */
2056     if ( size != 1 ) {
2057       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2058       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2059       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2060       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2061     }
2062     PetscFree(cols);
2063   } else {
2064     /* determine buffer space needed for message */
2065     nz = 0;
2066     for ( i=0; i<m; i++ ) {
2067       nz += locrowlens[i];
2068     }
2069     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2070     mycols = ibuf;
2071     /* receive message of column indices*/
2072     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2073     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2074     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2075   }
2076 
2077   /* loop over local rows, determining number of off diagonal entries */
2078   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2079   odlens = dlens + (rend-rstart);
2080   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2081   PetscMemzero(mask,3*Mbs*sizeof(int));
2082   masked1 = mask    + Mbs;
2083   masked2 = masked1 + Mbs;
2084   rowcount = 0; nzcount = 0;
2085   for ( i=0; i<mbs; i++ ) {
2086     dcount  = 0;
2087     odcount = 0;
2088     for ( j=0; j<bs; j++ ) {
2089       kmax = locrowlens[rowcount];
2090       for ( k=0; k<kmax; k++ ) {
2091         tmp = mycols[nzcount++]/bs;
2092         if (!mask[tmp]) {
2093           mask[tmp] = 1;
2094           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2095           else masked1[dcount++] = tmp;
2096         }
2097       }
2098       rowcount++;
2099     }
2100 
2101     dlens[i]  = dcount;
2102     odlens[i] = odcount;
2103 
2104     /* zero out the mask elements we set */
2105     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2106     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2107   }
2108 
2109   /* create our matrix */
2110   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2111          CHKERRQ(ierr);
2112   A = *newmat;
2113   MatSetOption(A,MAT_COLUMNS_SORTED);
2114 
2115   if (!rank) {
2116     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2117     /* read in my part of the matrix numerical values  */
2118     nz = procsnz[0];
2119     vals = buf;
2120     mycols = ibuf;
2121     if (size == 1)  nz -= extra_rows;
2122     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2123     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2124 
2125     /* insert into matrix */
2126     jj      = rstart*bs;
2127     for ( i=0; i<m; i++ ) {
2128       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2129       mycols += locrowlens[i];
2130       vals   += locrowlens[i];
2131       jj++;
2132     }
2133     /* read in other processors (except the last one) and ship out */
2134     for ( i=1; i<size-1; i++ ) {
2135       nz   = procsnz[i];
2136       vals = buf;
2137       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2138       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2139     }
2140     /* the last proc */
2141     if ( size != 1 ){
2142       nz   = procsnz[i] - extra_rows;
2143       vals = buf;
2144       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2145       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2146       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2147     }
2148     PetscFree(procsnz);
2149   } else {
2150     /* receive numeric values */
2151     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2152 
2153     /* receive message of values*/
2154     vals   = buf;
2155     mycols = ibuf;
2156     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2157     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2158     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2159 
2160     /* insert into matrix */
2161     jj      = rstart*bs;
2162     for ( i=0; i<m; i++ ) {
2163       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2164       mycols += locrowlens[i];
2165       vals   += locrowlens[i];
2166       jj++;
2167     }
2168   }
2169   PetscFree(locrowlens);
2170   PetscFree(buf);
2171   PetscFree(ibuf);
2172   PetscFree(rowners);
2173   PetscFree(dlens);
2174   PetscFree(mask);
2175   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2176   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2177   PetscFunctionReturn(0);
2178 }
2179 
2180 
2181