xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision fef457264ad9a8370921e6ba2ad17020ed5d67ad)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.96 1998/01/12 18:35:34 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
18    storage of the matrix.  This is done in a non scable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
37 #define CHUNKSIZE  10
38 
39 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
40 { \
41  \
42     brow = row/bs;  \
43     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
44     rmax = aimax[brow]; nrow = ailen[brow]; \
45       bcol = col/bs; \
46       ridx = row % bs; cidx = col % bs; \
47       low = 0; high = nrow; \
48       while (high-low > 3) { \
49         t = (low+high)/2; \
50         if (rp[t] > bcol) high = t; \
51         else              low  = t; \
52       } \
53       for ( _i=low; _i<high; _i++ ) { \
54         if (rp[_i] > bcol) break; \
55         if (rp[_i] == bcol) { \
56           bap  = ap +  bs2*_i + bs*cidx + ridx; \
57           if (addv == ADD_VALUES) *bap += value;  \
58           else                    *bap  = value;  \
59           goto a_noinsert; \
60         } \
61       } \
62       if (a->nonew == 1) goto a_noinsert; \
63       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
64       if (nrow >= rmax) { \
65         /* there is no extra room in row, therefore enlarge */ \
66         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
67         Scalar *new_a; \
68  \
69         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
70  \
71         /* malloc new storage space */ \
72         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
73         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
74         new_j   = (int *) (new_a + bs2*new_nz); \
75         new_i   = new_j + new_nz; \
76  \
77         /* copy over old data into new slots */ \
78         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
79         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
80         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
81         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
82         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
83                                                            len*sizeof(int)); \
84         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
85         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
86         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
87                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
88         /* free up old matrix storage */ \
89         PetscFree(a->a);  \
90         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
91         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
92         a->singlemalloc = 1; \
93  \
94         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
95         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
96         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
97         a->maxnz += bs2*CHUNKSIZE; \
98         a->reallocs++; \
99         a->nz++; \
100       } \
101       N = nrow++ - 1;  \
102       /* shift up all the later entries in this row */ \
103       for ( ii=N; ii>=_i; ii-- ) { \
104         rp[ii+1] = rp[ii]; \
105         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
106       } \
107       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
108       rp[_i]                      = bcol;  \
109       ap[bs2*_i + bs*cidx + ridx] = value;  \
110       a_noinsert:; \
111     ailen[brow] = nrow; \
112 }
113 
114 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
115 { \
116  \
117     brow = row/bs;  \
118     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
119     rmax = bimax[brow]; nrow = bilen[brow]; \
120       bcol = col/bs; \
121       ridx = row % bs; cidx = col % bs; \
122       low = 0; high = nrow; \
123       while (high-low > 3) { \
124         t = (low+high)/2; \
125         if (rp[t] > bcol) high = t; \
126         else              low  = t; \
127       } \
128       for ( _i=low; _i<high; _i++ ) { \
129         if (rp[_i] > bcol) break; \
130         if (rp[_i] == bcol) { \
131           bap  = ap +  bs2*_i + bs*cidx + ridx; \
132           if (addv == ADD_VALUES) *bap += value;  \
133           else                    *bap  = value;  \
134           goto b_noinsert; \
135         } \
136       } \
137       if (b->nonew == 1) goto b_noinsert; \
138       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
139       if (nrow >= rmax) { \
140         /* there is no extra room in row, therefore enlarge */ \
141         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
142         Scalar *new_a; \
143  \
144         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
145  \
146         /* malloc new storage space */ \
147         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
148         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
149         new_j   = (int *) (new_a + bs2*new_nz); \
150         new_i   = new_j + new_nz; \
151  \
152         /* copy over old data into new slots */ \
153         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
154         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
155         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
156         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
157         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
158                                                            len*sizeof(int)); \
159         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
160         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
161         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
162                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
163         /* free up old matrix storage */ \
164         PetscFree(b->a);  \
165         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
166         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
167         b->singlemalloc = 1; \
168  \
169         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
170         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
171         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
172         b->maxnz += bs2*CHUNKSIZE; \
173         b->reallocs++; \
174         b->nz++; \
175       } \
176       N = nrow++ - 1;  \
177       /* shift up all the later entries in this row */ \
178       for ( ii=N; ii>=_i; ii-- ) { \
179         rp[ii+1] = rp[ii]; \
180         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
181       } \
182       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
183       rp[_i]                      = bcol;  \
184       ap[bs2*_i + bs*cidx + ridx] = value;  \
185       b_noinsert:; \
186     bilen[brow] = nrow; \
187 }
188 
189 #undef __FUNC__
190 #define __FUNC__ "MatSetValues_MPIBAIJ"
191 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
192 {
193   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
194   Scalar      value;
195   int         ierr,i,j,row,col;
196   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
197   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
198   int         cend_orig=baij->cend_bs,bs=baij->bs;
199 
200   /* Some Variables required in the macro */
201   Mat         A = baij->A;
202   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
203   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
204   Scalar      *aa=a->a;
205 
206   Mat         B = baij->B;
207   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
208   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
209   Scalar      *ba=b->a;
210 
211   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
212   int         low,high,t,ridx,cidx,bs2=a->bs2;
213   Scalar      *ap,*bap;
214 
215   PetscFunctionBegin;
216   for ( i=0; i<m; i++ ) {
217 #if defined(USE_PETSC_BOPT_g)
218     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
219     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
220 #endif
221     if (im[i] >= rstart_orig && im[i] < rend_orig) {
222       row = im[i] - rstart_orig;
223       for ( j=0; j<n; j++ ) {
224         if (in[j] >= cstart_orig && in[j] < cend_orig){
225           col = in[j] - cstart_orig;
226           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
227           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
228           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
229         }
230 #if defined(USE_PETSC_BOPT_g)
231         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
232         else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
233 #endif
234         else {
235           if (mat->was_assembled) {
236             if (!baij->colmap) {
237               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
238             }
239             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
240             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
241               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
242               col =  in[j];
243               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
244               B = baij->B;
245               b = (Mat_SeqBAIJ *) (B)->data;
246               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
247               ba=b->a;
248             }
249           } else col = in[j];
250           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
251           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
252           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
253         }
254       }
255     } else {
256       if (roworiented && !baij->donotstash) {
257         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
258       } else {
259         if (!baij->donotstash) {
260           row = im[i];
261 	  for ( j=0; j<n; j++ ) {
262 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
263           }
264         }
265       }
266     }
267   }
268   PetscFunctionReturn(0);
269 }
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1)))
377 
378 #undef __FUNC__
379 #define __FUNC__ "MatSetValues_MPIBAIJ_HT"
380 int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
381 {
382   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
383   int         ierr,i,j,row,col;
384   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
385   int         rend_orig=baij->rend_bs;
386 
387   int         h1,key,size=baij->ht_size,k,bs=baij->bs;
388   int         * HT  = baij->ht;
389   Scalar      ** HD = (Scalar **)(HT + size),value;
390 
391 
392   PetscFunctionBegin;
393 
394   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
395 
396   for ( i=0; i<m; i++ ) {
397 #if defined(USE_PETSC_BOPT_g)
398     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
399     if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
400 #endif
401     if (im[i] >= rstart_orig && im[i] < rend_orig) {
402       row = im[i];
403       for ( j=0; j<n; j++ ) {
404         col = in[j];
405         if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
406         /* Look up into the Hash Table */
407         key = (row/bs)*baij->Nbs+(col/bs)+1;
408         h1  = HASH1(size,key);
409 
410         for ( k=0; k<size; k++ ){
411           if (HT[(h1+k)%size] == key) {
412             if (addv == ADD_VALUES) *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs) += value;
413             else                    *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs)  = value;
414             break;
415           }
416         }
417         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Hash table Entry not found");
418       }
419     } else {
420       if (roworiented && !baij->donotstash) {
421         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
422       } else {
423         if (!baij->donotstash) {
424           row = im[i];
425 	  for ( j=0; j<n; j++ ) {
426 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
427           }
428         }
429       }
430     }
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNC__
436 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
437 int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
438 {
439   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
440   int         ierr,i,j,ii,jj,row,col,k,l;
441   int         roworiented = baij->roworiented,rstart=baij->rstart ;
442   int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;
443 
444   int         h1,key,size=baij->ht_size;
445   int         * HT  = baij->ht;
446   Scalar      ** HD = (Scalar **)(HT + size),*value,*baij_a;
447 
448   PetscFunctionBegin;
449 
450   if(((Mat_SeqBAIJ*)baij->A->data)->nonew !=1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix");
451 
452   if (roworiented) {
453     stepval = (n-1)*bs;
454   } else {
455     stepval = (m-1)*bs;
456   }
457   for ( i=0; i<m; i++ ) {
458 #if defined(USE_PETSC_BOPT_g)
459     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
460     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
461 #endif
462     if (im[i] >= rstart && im[i] < rend) {
463       row = im[i];
464       for ( j=0; j<n; j++ ) {
465         col = in[j];
466 
467         /* Look up into the Hash Table */
468         key = row*baij->Nbs+col+1;
469         h1  = HASH1(size,key);
470 
471         for ( k=0; k<size; k++ ){
472           if (HT[(h1+k)%size] == key) {
473             baij_a = HD[(h1+k)%size];
474 
475             if (roworiented) {
476               value = v + i*(stepval+bs)*bs + j*bs;
477               if (addv == ADD_VALUES) {
478                 for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
479                   for ( jj=0; jj<bs2; jj+=bs ) {
480                     baij_a[jj]  += *value++;
481                   }
482                 }
483               } else {
484                 for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
485                   for ( jj=0; jj<bs2; jj+=bs ) {
486                     baij_a[jj]  = *value++;
487                   }
488                 }
489               }
490             } else {
491               value = v + j*(stepval+bs)*bs + i*bs;
492               if (addv == ADD_VALUES) {
493                 for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
494                   for ( jj=0; jj<bs; jj++ ) {
495                     baij_a[jj]  += *value++;
496                   }
497                 }
498               } else {
499                 for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
500                   for ( jj=0; jj<bs; jj++ ) {
501                     baij_a[jj]  = *value++;
502                   }
503                 }
504               }
505             }
506             break;
507           }
508         }
509         if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
510       }
511     } else {
512       if (!baij->donotstash) {
513         if (roworiented ) {
514           row   = im[i]*bs;
515           value = v + i*(stepval+bs)*bs;
516           for ( j=0; j<bs; j++,row++ ) {
517             for ( k=0; k<n; k++ ) {
518               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
519                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
520               }
521             }
522           }
523         } else {
524           for ( j=0; j<n; j++ ) {
525             value = v + j*(stepval+bs)*bs + i*bs;
526             col   = in[j]*bs;
527             for ( k=0; k<bs; k++,col++,value+=stepval) {
528               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
529                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
530               }
531             }
532           }
533         }
534       }
535     }
536   }
537   PetscFunctionReturn(0);
538 }
539 #undef __FUNC__
540 #define __FUNC__ "MatGetValues_MPIBAIJ"
541 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
542 {
543   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
544   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
545   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
546 
547   for ( i=0; i<m; i++ ) {
548     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
549     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
550     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
551       row = idxm[i] - bsrstart;
552       for ( j=0; j<n; j++ ) {
553         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
554         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
555         if (idxn[j] >= bscstart && idxn[j] < bscend){
556           col = idxn[j] - bscstart;
557           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
558         } else {
559           if (!baij->colmap) {
560             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
561           }
562           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
563              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
564           else {
565             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
566             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
567           }
568         }
569       }
570     } else {
571       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
572     }
573   }
574   PetscFunctionReturn(0);
575 }
576 
577 #undef __FUNC__
578 #define __FUNC__ "MatNorm_MPIBAIJ"
579 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
580 {
581   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
582   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
583   int        ierr, i,bs2=baij->bs2;
584   double     sum = 0.0;
585   Scalar     *v;
586 
587   PetscFunctionBegin;
588   if (baij->size == 1) {
589     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
590   } else {
591     if (type == NORM_FROBENIUS) {
592       v = amat->a;
593       for (i=0; i<amat->nz*bs2; i++ ) {
594 #if defined(USE_PETSC_COMPLEX)
595         sum += real(conj(*v)*(*v)); v++;
596 #else
597         sum += (*v)*(*v); v++;
598 #endif
599       }
600       v = bmat->a;
601       for (i=0; i<bmat->nz*bs2; i++ ) {
602 #if defined(USE_PETSC_COMPLEX)
603         sum += real(conj(*v)*(*v)); v++;
604 #else
605         sum += (*v)*(*v); v++;
606 #endif
607       }
608       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
609       *norm = sqrt(*norm);
610     } else {
611       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
612     }
613   }
614   PetscFunctionReturn(0);
615 }
616 
617 #undef __FUNC__
618 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
619 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
620 {
621   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
622   MPI_Comm    comm = mat->comm;
623   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
624   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
625   MPI_Request *send_waits,*recv_waits;
626   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
627   InsertMode  addv;
628   Scalar      *rvalues,*svalues;
629 
630   PetscFunctionBegin;
631   /* make sure all processors are either in INSERTMODE or ADDMODE */
632   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
633   if (addv == (ADD_VALUES|INSERT_VALUES)) {
634     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
635   }
636   mat->insertmode = addv; /* in case this processor had no cache */
637 
638   /*  first count number of contributors to each processor */
639   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
640   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
641   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
642   for ( i=0; i<baij->stash.n; i++ ) {
643     idx = baij->stash.idx[i];
644     for ( j=0; j<size; j++ ) {
645       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
646         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
647       }
648     }
649   }
650   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
651 
652   /* inform other processors of number of messages and max length*/
653   work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
654   ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
655   nreceives = work[rank];
656   ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
657   nmax      = work[rank];
658   PetscFree(work);
659 
660   /* post receives:
661        1) each message will consist of ordered pairs
662      (global index,value) we store the global index as a double
663      to simplify the message passing.
664        2) since we don't know how long each individual message is we
665      allocate the largest needed buffer for each receive. Potentially
666      this is a lot of wasted space.
667 
668 
669        This could be done better.
670   */
671   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
672   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
673   for ( i=0; i<nreceives; i++ ) {
674     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
675   }
676 
677   /* do sends:
678       1) starts[i] gives the starting index in svalues for stuff going to
679          the ith processor
680   */
681   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
682   send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
683   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
684   starts[0] = 0;
685   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
686   for ( i=0; i<baij->stash.n; i++ ) {
687     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
688     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
689     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
690   }
691   PetscFree(owner);
692   starts[0] = 0;
693   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
694   count = 0;
695   for ( i=0; i<size; i++ ) {
696     if (procs[i]) {
697       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
698     }
699   }
700   PetscFree(starts); PetscFree(nprocs);
701 
702   /* Free cache space */
703   PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",rank,baij->stash.n);
704   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
705 
706   baij->svalues    = svalues;    baij->rvalues    = rvalues;
707   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
708   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
709   baij->rmax       = nmax;
710 
711   PetscFunctionReturn(0);
712 }
713 
714 /*
715   Creates the hash table, and sets the table
716   This table is created only once.
717   If new entried need to be added to the matrix
718   then the hash table has to be destroyed and
719   recreated.
720 */
721 #undef __FUNC__
722 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
723 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
724 {
725   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
726   Mat         A = baij->A, B=baij->B;
727   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
728   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
729   int         size,ct=0,max1=0,max2=0,bs2=baij->bs2,rstart=baij->rstart;
730   int         cstart=baij->cstart,*garray=baij->garray,row,col;
731   int         *HT,key;
732   Scalar      **HD;
733 
734   PetscFunctionBegin;
735   baij->ht_size=(int)(factor*nz);
736   size = baij->ht_size;
737 
738   if (baij->ht) {
739     PetscFunctionReturn(0);
740   }
741 
742   /* Allocate Memory for Hash Table */
743   baij->ht = (int*)PetscMalloc(size*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->ht);
744   HT = baij->ht;
745   HD = (Scalar**)(HT + size);
746   PetscMemzero(HT,size*sizeof(int)+sizeof(Scalar*));
747 
748 
749   /* Loop Over A */
750   for ( i=0; i<a->mbs; i++ ) {
751     for ( j=ai[i]; j<ai[i+1]; j++ ) {
752       row = i+rstart;
753       col = aj[j]+cstart;
754 
755       key = row*baij->Nbs + col + 1;
756       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
757       h1  = HASH1(size,key);
758 
759       for ( k=0; k<size; k++ ){
760         if (HT[(h1+k)%size] == 0.0) {
761           HT[(h1+k)%size] = key;
762           HD[(h1+k)%size] = a->a + j*bs2;
763           break;
764         } else ct++;
765       }
766       if (k> max1) max1 = k;
767     }
768   }
769   /* Loop Over B */
770   for ( i=0; i<b->mbs; i++ ) {
771     for ( j=bi[i]; j<bi[i+1]; j++ ) {
772       row = i+rstart;
773       col = garray[bj[j]];
774       key = row*baij->Nbs + col + 1;
775       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
776       h1  = HASH1(size,key);
777       for ( k=0; k<size; k++ ){
778         if (HT[(h1+k)%size] == 0.0) {
779           HT[(h1+k)%size] = key;
780           HD[(h1+k)%size] = b->a + j*bs2;
781           break;
782         } else ct++;
783       }
784       if (k> max2) max2 = k;
785     }
786   }
787 
788   /* Print Summary */
789   for ( i=0,key=0.0,j=0; i<size; i++)
790     if (HT[i]) {j++;}
791 
792   /*printf("[%d] fact = %3.2f max1 = %5d max2 = %5d Size %5d - Searches %5d Avg %5.2f Keys %5d\n",
793          PetscGlobalRank,factor,max1,max2,size,ct+j,((float)ct+j),j); */
794   fflush(stdout);
795   PetscFunctionReturn(0);
796 }
797 
798 #undef __FUNC__
799 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
800 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
801 {
802   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
803   MPI_Status  *send_status,recv_status;
804   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
805   int         bs=baij->bs,row,col,other_disassembled,flg;
806   Scalar      *values,val;
807   InsertMode  addv = mat->insertmode;
808 
809   PetscFunctionBegin;
810   /*  wait on receives */
811   while (count) {
812     ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
813     /* unpack receives into our local space */
814     values = baij->rvalues + 3*imdex*baij->rmax;
815     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
816     n = n/3;
817     for ( i=0; i<n; i++ ) {
818       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
819       col = (int) PetscReal(values[3*i+1]);
820       val = values[3*i+2];
821       if (col >= baij->cstart*bs && col < baij->cend*bs) {
822         col -= baij->cstart*bs;
823         ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
824       } else {
825         if (mat->was_assembled) {
826           if (!baij->colmap) {
827             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
828           }
829           col = (baij->colmap[col/bs]) - 1 + col%bs;
830           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
831             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
832             col = (int) PetscReal(values[3*i+1]);
833           }
834         }
835         ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
836       }
837     }
838     count--;
839   }
840   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
841 
842   /* wait on sends */
843   if (baij->nsends) {
844     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
845     ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
846     PetscFree(send_status);
847   }
848   PetscFree(baij->send_waits); PetscFree(baij->svalues);
849 
850   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
851   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
852 
853   /* determine if any processor has disassembled, if so we must
854      also disassemble ourselfs, in order that we may reassemble. */
855   ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
856   if (mat->was_assembled && !other_disassembled) {
857     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
858   }
859 
860   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
861     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
862   }
863   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
864   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
865 
866   ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
867   if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
868     double fact = 1.39;
869     ierr = OptionsGetDouble(PETSC_NULL,"-use_hash",&fact,&flg); CHKERRQ(ierr);
870     ierr = MatCreateHashTable_MPIBAIJ_Private(mat,fact); CHKERRQ(ierr);
871     mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
872     mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
873   }
874   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNC__
879 #define __FUNC__ "MatView_MPIBAIJ_Binary"
880 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
881 {
882   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
883   int          ierr;
884 
885   PetscFunctionBegin;
886   if (baij->size == 1) {
887     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
888   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
889   PetscFunctionReturn(0);
890 }
891 
892 #undef __FUNC__
893 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
894 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
895 {
896   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
897   int          ierr, format,rank,bs = baij->bs;
898   FILE         *fd;
899   ViewerType   vtype;
900 
901   PetscFunctionBegin;
902   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
903   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
904     ierr = ViewerGetFormat(viewer,&format);
905     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
906       MatInfo info;
907       MPI_Comm_rank(mat->comm,&rank);
908       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
909       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
910       PetscSequentialPhaseBegin(mat->comm,1);
911       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
912               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
913               baij->bs,(int)info.memory);
914       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
915       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
916       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
917       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
918       fflush(fd);
919       PetscSequentialPhaseEnd(mat->comm,1);
920       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
921       PetscFunctionReturn(0);
922     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
923       PetscPrintf(mat->comm,"  block size is %d\n",bs);
924       PetscFunctionReturn(0);
925     }
926   }
927 
928   if (vtype == DRAW_VIEWER) {
929     Draw       draw;
930     PetscTruth isnull;
931     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
932     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
933   }
934 
935   if (vtype == ASCII_FILE_VIEWER) {
936     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
937     PetscSequentialPhaseBegin(mat->comm,1);
938     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
939            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
940             baij->cstart*bs,baij->cend*bs);
941     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
942     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
943     fflush(fd);
944     PetscSequentialPhaseEnd(mat->comm,1);
945   } else {
946     int size = baij->size;
947     rank = baij->rank;
948     if (size == 1) {
949       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
950     } else {
951       /* assemble the entire matrix onto first processor. */
952       Mat         A;
953       Mat_SeqBAIJ *Aloc;
954       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
955       int         mbs=baij->mbs;
956       Scalar      *a;
957 
958       if (!rank) {
959         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
960       } else {
961         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
962       }
963       PLogObjectParent(mat,A);
964 
965       /* copy over the A part */
966       Aloc = (Mat_SeqBAIJ*) baij->A->data;
967       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
968       row = baij->rstart;
969       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
970 
971       for ( i=0; i<mbs; i++ ) {
972         rvals[0] = bs*(baij->rstart + i);
973         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
974         for ( j=ai[i]; j<ai[i+1]; j++ ) {
975           col = (baij->cstart+aj[j])*bs;
976           for (k=0; k<bs; k++ ) {
977             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
978             col++; a += bs;
979           }
980         }
981       }
982       /* copy over the B part */
983       Aloc = (Mat_SeqBAIJ*) baij->B->data;
984       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
985       row = baij->rstart*bs;
986       for ( i=0; i<mbs; i++ ) {
987         rvals[0] = bs*(baij->rstart + i);
988         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
989         for ( j=ai[i]; j<ai[i+1]; j++ ) {
990           col = baij->garray[aj[j]]*bs;
991           for (k=0; k<bs; k++ ) {
992             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
993             col++; a += bs;
994           }
995         }
996       }
997       PetscFree(rvals);
998       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
999       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1000       /*
1001          Everyone has to call to draw the matrix since the graphics waits are
1002          synchronized across all processors that share the Draw object
1003       */
1004       if (!rank || vtype == DRAW_VIEWER) {
1005         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1006       }
1007       ierr = MatDestroy(A); CHKERRQ(ierr);
1008     }
1009   }
1010   PetscFunctionReturn(0);
1011 }
1012 
1013 
1014 
1015 #undef __FUNC__
1016 #define __FUNC__ "MatView_MPIBAIJ"
1017 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
1018 {
1019   Mat         mat = (Mat) obj;
1020   int         ierr;
1021   ViewerType  vtype;
1022 
1023   PetscFunctionBegin;
1024   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1025   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1026       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1027     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1028   } else if (vtype == BINARY_FILE_VIEWER) {
1029     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1030   }
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 #undef __FUNC__
1035 #define __FUNC__ "MatDestroy_MPIBAIJ"
1036 int MatDestroy_MPIBAIJ(PetscObject obj)
1037 {
1038   Mat         mat = (Mat) obj;
1039   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1040   int         ierr;
1041 
1042   PetscFunctionBegin;
1043 #if defined(USE_PETSC_LOG)
1044   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
1045 #endif
1046 
1047   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
1048   PetscFree(baij->rowners);
1049   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1050   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1051   if (baij->colmap) PetscFree(baij->colmap);
1052   if (baij->garray) PetscFree(baij->garray);
1053   if (baij->lvec)   VecDestroy(baij->lvec);
1054   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
1055   if (baij->rowvalues) PetscFree(baij->rowvalues);
1056   if (baij->barray) PetscFree(baij->barray);
1057   if (baij->ht) PetscFree(baij->ht);
1058   PetscFree(baij);
1059   PLogObjectDestroy(mat);
1060   PetscHeaderDestroy(mat);
1061   PetscFunctionReturn(0);
1062 }
1063 
1064 #undef __FUNC__
1065 #define __FUNC__ "MatMult_MPIBAIJ"
1066 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1067 {
1068   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1069   int         ierr, nt;
1070 
1071   PetscFunctionBegin;
1072   VecGetLocalSize_Fast(xx,nt);
1073   if (nt != a->n) {
1074     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1075   }
1076   VecGetLocalSize_Fast(yy,nt);
1077   if (nt != a->m) {
1078     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1079   }
1080   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1081   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1082   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1083   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1084   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 #undef __FUNC__
1089 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1090 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1091 {
1092   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1093   int        ierr;
1094 
1095   PetscFunctionBegin;
1096   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1097   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1098   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1099   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1100   PetscFunctionReturn(0);
1101 }
1102 
1103 #undef __FUNC__
1104 #define __FUNC__ "MatMultTrans_MPIBAIJ"
1105 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
1106 {
1107   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1108   int         ierr;
1109 
1110   PetscFunctionBegin;
1111   /* do nondiagonal part */
1112   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1113   /* send it on its way */
1114   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1115   /* do local part */
1116   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
1117   /* receive remote parts: note this assumes the values are not actually */
1118   /* inserted in yy until the next line, which is true for my implementation*/
1119   /* but is not perhaps always true. */
1120   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 #undef __FUNC__
1125 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
1126 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1127 {
1128   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1129   int         ierr;
1130 
1131   PetscFunctionBegin;
1132   /* do nondiagonal part */
1133   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
1134   /* send it on its way */
1135   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1136   /* do local part */
1137   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1138   /* receive remote parts: note this assumes the values are not actually */
1139   /* inserted in yy until the next line, which is true for my implementation*/
1140   /* but is not perhaps always true. */
1141   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 /*
1146   This only works correctly for square matrices where the subblock A->A is the
1147    diagonal block
1148 */
1149 #undef __FUNC__
1150 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1151 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1152 {
1153   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1154   int         ierr;
1155 
1156   PetscFunctionBegin;
1157   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1158   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1159   PetscFunctionReturn(0);
1160 }
1161 
1162 #undef __FUNC__
1163 #define __FUNC__ "MatScale_MPIBAIJ"
1164 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1165 {
1166   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1167   int         ierr;
1168 
1169   PetscFunctionBegin;
1170   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1171   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 #undef __FUNC__
1176 #define __FUNC__ "MatGetSize_MPIBAIJ"
1177 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1178 {
1179   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1180 
1181   PetscFunctionBegin;
1182   if (m) *m = mat->M;
1183   if (n) *n = mat->N;
1184   PetscFunctionReturn(0);
1185 }
1186 
1187 #undef __FUNC__
1188 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1189 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1190 {
1191   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1192 
1193   PetscFunctionBegin;
1194   *m = mat->m; *n = mat->N;
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 #undef __FUNC__
1199 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1200 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1201 {
1202   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1203 
1204   PetscFunctionBegin;
1205   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1206   PetscFunctionReturn(0);
1207 }
1208 
1209 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1210 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1211 
1212 #undef __FUNC__
1213 #define __FUNC__ "MatGetRow_MPIBAIJ"
1214 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1215 {
1216   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1217   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1218   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
1219   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
1220   int        *cmap, *idx_p,cstart = mat->cstart;
1221 
1222   PetscFunctionBegin;
1223   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1224   mat->getrowactive = PETSC_TRUE;
1225 
1226   if (!mat->rowvalues && (idx || v)) {
1227     /*
1228         allocate enough space to hold information from the longest row.
1229     */
1230     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
1231     int     max = 1,mbs = mat->mbs,tmp;
1232     for ( i=0; i<mbs; i++ ) {
1233       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1234       if (max < tmp) { max = tmp; }
1235     }
1236     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1237     CHKPTRQ(mat->rowvalues);
1238     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1239   }
1240 
1241   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
1242   lrow = row - brstart;
1243 
1244   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1245   if (!v)   {pvA = 0; pvB = 0;}
1246   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1247   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1248   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1249   nztot = nzA + nzB;
1250 
1251   cmap  = mat->garray;
1252   if (v  || idx) {
1253     if (nztot) {
1254       /* Sort by increasing column numbers, assuming A and B already sorted */
1255       int imark = -1;
1256       if (v) {
1257         *v = v_p = mat->rowvalues;
1258         for ( i=0; i<nzB; i++ ) {
1259           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1260           else break;
1261         }
1262         imark = i;
1263         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1264         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1265       }
1266       if (idx) {
1267         *idx = idx_p = mat->rowindices;
1268         if (imark > -1) {
1269           for ( i=0; i<imark; i++ ) {
1270             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1271           }
1272         } else {
1273           for ( i=0; i<nzB; i++ ) {
1274             if (cmap[cworkB[i]/bs] < cstart)
1275               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1276             else break;
1277           }
1278           imark = i;
1279         }
1280         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1281         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1282       }
1283     } else {
1284       if (idx) *idx = 0;
1285       if (v)   *v   = 0;
1286     }
1287   }
1288   *nz = nztot;
1289   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1290   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1291   PetscFunctionReturn(0);
1292 }
1293 
1294 #undef __FUNC__
1295 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1296 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1297 {
1298   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1299 
1300   PetscFunctionBegin;
1301   if (baij->getrowactive == PETSC_FALSE) {
1302     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1303   }
1304   baij->getrowactive = PETSC_FALSE;
1305   PetscFunctionReturn(0);
1306 }
1307 
1308 #undef __FUNC__
1309 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1310 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1311 {
1312   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1313 
1314   PetscFunctionBegin;
1315   *bs = baij->bs;
1316   PetscFunctionReturn(0);
1317 }
1318 
1319 #undef __FUNC__
1320 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1321 int MatZeroEntries_MPIBAIJ(Mat A)
1322 {
1323   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1324   int         ierr;
1325 
1326   PetscFunctionBegin;
1327   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1328   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 #undef __FUNC__
1333 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1334 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1335 {
1336   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1337   Mat         A = a->A, B = a->B;
1338   int         ierr;
1339   double      isend[5], irecv[5];
1340 
1341   PetscFunctionBegin;
1342   info->block_size     = (double)a->bs;
1343   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1344   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1345   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1346   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1347   if (flag == MAT_LOCAL) {
1348     info->nz_used      = isend[0];
1349     info->nz_allocated = isend[1];
1350     info->nz_unneeded  = isend[2];
1351     info->memory       = isend[3];
1352     info->mallocs      = isend[4];
1353   } else if (flag == MAT_GLOBAL_MAX) {
1354     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1355     info->nz_used      = irecv[0];
1356     info->nz_allocated = irecv[1];
1357     info->nz_unneeded  = irecv[2];
1358     info->memory       = irecv[3];
1359     info->mallocs      = irecv[4];
1360   } else if (flag == MAT_GLOBAL_SUM) {
1361     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1362     info->nz_used      = irecv[0];
1363     info->nz_allocated = irecv[1];
1364     info->nz_unneeded  = irecv[2];
1365     info->memory       = irecv[3];
1366     info->mallocs      = irecv[4];
1367   }
1368   info->rows_global       = (double)a->M;
1369   info->columns_global    = (double)a->N;
1370   info->rows_local        = (double)a->m;
1371   info->columns_local     = (double)a->N;
1372   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1373   info->fill_ratio_needed = 0;
1374   info->factor_mallocs    = 0;
1375   PetscFunctionReturn(0);
1376 }
1377 
1378 #undef __FUNC__
1379 #define __FUNC__ "MatSetOption_MPIBAIJ"
1380 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1381 {
1382   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1383 
1384   PetscFunctionBegin;
1385   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1386       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1387       op == MAT_COLUMNS_UNSORTED ||
1388       op == MAT_COLUMNS_SORTED ||
1389       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1390       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1391         MatSetOption(a->A,op);
1392         MatSetOption(a->B,op);
1393   } else if (op == MAT_ROW_ORIENTED) {
1394         a->roworiented = 1;
1395         MatSetOption(a->A,op);
1396         MatSetOption(a->B,op);
1397   } else if (op == MAT_ROWS_SORTED ||
1398              op == MAT_ROWS_UNSORTED ||
1399              op == MAT_SYMMETRIC ||
1400              op == MAT_STRUCTURALLY_SYMMETRIC ||
1401              op == MAT_YES_NEW_DIAGONALS)
1402     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1403   else if (op == MAT_COLUMN_ORIENTED) {
1404     a->roworiented = 0;
1405     MatSetOption(a->A,op);
1406     MatSetOption(a->B,op);
1407   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1408     a->donotstash = 1;
1409   } else if (op == MAT_NO_NEW_DIAGONALS) {
1410     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1411   } else {
1412     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1413   }
1414   PetscFunctionReturn(0);
1415 }
1416 
1417 #undef __FUNC__
1418 #define __FUNC__ "MatTranspose_MPIBAIJ("
1419 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1420 {
1421   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1422   Mat_SeqBAIJ *Aloc;
1423   Mat        B;
1424   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1425   int        bs=baij->bs,mbs=baij->mbs;
1426   Scalar     *a;
1427 
1428   PetscFunctionBegin;
1429   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1430   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1431   CHKERRQ(ierr);
1432 
1433   /* copy over the A part */
1434   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1435   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1436   row = baij->rstart;
1437   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1438 
1439   for ( i=0; i<mbs; i++ ) {
1440     rvals[0] = bs*(baij->rstart + i);
1441     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1442     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1443       col = (baij->cstart+aj[j])*bs;
1444       for (k=0; k<bs; k++ ) {
1445         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1446         col++; a += bs;
1447       }
1448     }
1449   }
1450   /* copy over the B part */
1451   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1452   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1453   row = baij->rstart*bs;
1454   for ( i=0; i<mbs; i++ ) {
1455     rvals[0] = bs*(baij->rstart + i);
1456     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1457     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1458       col = baij->garray[aj[j]]*bs;
1459       for (k=0; k<bs; k++ ) {
1460         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1461         col++; a += bs;
1462       }
1463     }
1464   }
1465   PetscFree(rvals);
1466   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1467   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1468 
1469   if (matout != PETSC_NULL) {
1470     *matout = B;
1471   } else {
1472     /* This isn't really an in-place transpose .... but free data structures from baij */
1473     PetscFree(baij->rowners);
1474     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1475     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1476     if (baij->colmap) PetscFree(baij->colmap);
1477     if (baij->garray) PetscFree(baij->garray);
1478     if (baij->lvec) VecDestroy(baij->lvec);
1479     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1480     PetscFree(baij);
1481     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1482     PetscHeaderDestroy(B);
1483   }
1484   PetscFunctionReturn(0);
1485 }
1486 
1487 #undef __FUNC__
1488 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1489 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1490 {
1491   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1492   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1493   int ierr,s1,s2,s3;
1494 
1495   PetscFunctionBegin;
1496   if (ll)  {
1497     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1498     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1499     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1500     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1501     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1502   }
1503   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1504   PetscFunctionReturn(0);
1505 }
1506 
1507 #undef __FUNC__
1508 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1509 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1510 {
1511   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1512   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1513   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
1514   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1515   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1516   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1517   MPI_Comm       comm = A->comm;
1518   MPI_Request    *send_waits,*recv_waits;
1519   MPI_Status     recv_status,*send_status;
1520   IS             istmp;
1521 
1522   PetscFunctionBegin;
1523   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1524   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1525 
1526   /*  first count number of contributors to each processor */
1527   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1528   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1529   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1530   for ( i=0; i<N; i++ ) {
1531     idx = rows[i];
1532     found = 0;
1533     for ( j=0; j<size; j++ ) {
1534       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1535         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1536       }
1537     }
1538     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
1539   }
1540   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1541 
1542   /* inform other processors of number of messages and max length*/
1543   work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1544   ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1545   nrecvs = work[rank];
1546   ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
1547   nmax   = work[rank];
1548   PetscFree(work);
1549 
1550   /* post receives:   */
1551   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
1552   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
1553   for ( i=0; i<nrecvs; i++ ) {
1554     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1555   }
1556 
1557   /* do sends:
1558      1) starts[i] gives the starting index in svalues for stuff going to
1559      the ith processor
1560   */
1561   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1562   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
1563   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1564   starts[0] = 0;
1565   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1566   for ( i=0; i<N; i++ ) {
1567     svalues[starts[owner[i]]++] = rows[i];
1568   }
1569   ISRestoreIndices(is,&rows);
1570 
1571   starts[0] = 0;
1572   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1573   count = 0;
1574   for ( i=0; i<size; i++ ) {
1575     if (procs[i]) {
1576       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1577     }
1578   }
1579   PetscFree(starts);
1580 
1581   base = owners[rank]*bs;
1582 
1583   /*  wait on receives */
1584   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1585   source = lens + nrecvs;
1586   count  = nrecvs; slen = 0;
1587   while (count) {
1588     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1589     /* unpack receives into our local space */
1590     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1591     source[imdex]  = recv_status.MPI_SOURCE;
1592     lens[imdex]  = n;
1593     slen += n;
1594     count--;
1595   }
1596   PetscFree(recv_waits);
1597 
1598   /* move the data into the send scatter */
1599   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1600   count = 0;
1601   for ( i=0; i<nrecvs; i++ ) {
1602     values = rvalues + i*nmax;
1603     for ( j=0; j<lens[i]; j++ ) {
1604       lrows[count++] = values[j] - base;
1605     }
1606   }
1607   PetscFree(rvalues); PetscFree(lens);
1608   PetscFree(owner); PetscFree(nprocs);
1609 
1610   /* actually zap the local rows */
1611   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1612   PLogObjectParent(A,istmp);
1613 
1614   ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
1615   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1616   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1617 
1618   if (diag) {
1619     for ( i = 0; i < slen; i++ ) {
1620       row = lrows[i] + rstart_bs;
1621       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
1622     }
1623     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1624     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1625   }
1626   PetscFree(lrows);
1627 
1628   /* wait on sends */
1629   if (nsends) {
1630     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
1631     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1632     PetscFree(send_status);
1633   }
1634   PetscFree(send_waits); PetscFree(svalues);
1635 
1636   PetscFunctionReturn(0);
1637 }
1638 extern int MatPrintHelp_SeqBAIJ(Mat);
1639 #undef __FUNC__
1640 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1641 int MatPrintHelp_MPIBAIJ(Mat A)
1642 {
1643   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1644   int         ierr;
1645 
1646   PetscFunctionBegin;
1647   if (!a->rank) {
1648     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1649   }
1650   PetscFunctionReturn(0);
1651 }
1652 
1653 #undef __FUNC__
1654 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1655 int MatSetUnfactored_MPIBAIJ(Mat A)
1656 {
1657   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1658   int         ierr;
1659 
1660   PetscFunctionBegin;
1661   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1666 
1667 /* -------------------------------------------------------------------*/
1668 static struct _MatOps MatOps = {
1669   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1670   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1671   0,0,0,0,
1672   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1673   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1674   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1675   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1676   0,0,0,MatGetSize_MPIBAIJ,
1677   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1678   0,0,MatConvertSameType_MPIBAIJ,0,0,
1679   0,0,0,MatGetSubMatrices_MPIBAIJ,
1680   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1681   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1682   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1683 
1684 
1685 #undef __FUNC__
1686 #define __FUNC__ "MatCreateMPIBAIJ"
1687 /*@C
1688    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1689    (block compressed row).  For good matrix assembly performance
1690    the user should preallocate the matrix storage by setting the parameters
1691    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1692    performance can be increased by more than a factor of 50.
1693 
1694    Input Parameters:
1695 .  comm - MPI communicator
1696 .  bs   - size of blockk
1697 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1698            This value should be the same as the local size used in creating the
1699            y vector for the matrix-vector product y = Ax.
1700 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1701            This value should be the same as the local size used in creating the
1702            x vector for the matrix-vector product y = Ax.
1703 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1704 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1705 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1706            submatrix  (same for all local rows)
1707 .  d_nzz - array containing the number of block nonzeros in the various block rows
1708            of the in diagonal portion of the local (possibly different for each block
1709            row) or PETSC_NULL.  You must leave room for the diagonal entry even if
1710            it is zero.
1711 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1712            submatrix (same for all local rows).
1713 .  o_nzz - array containing the number of nonzeros in the various block rows of the
1714            off-diagonal portion of the local submatrix (possibly different for
1715            each block row) or PETSC_NULL.
1716 
1717    Output Parameter:
1718 .  A - the matrix
1719 
1720    Notes:
1721    The user MUST specify either the local or global matrix dimensions
1722    (possibly both).
1723 
1724    Storage Information:
1725    For a square global matrix we define each processor's diagonal portion
1726    to be its local rows and the corresponding columns (a square submatrix);
1727    each processor's off-diagonal portion encompasses the remainder of the
1728    local matrix (a rectangular submatrix).
1729 
1730    The user can specify preallocated storage for the diagonal part of
1731    the local submatrix with either d_nz or d_nnz (not both).  Set
1732    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1733    memory allocation.  Likewise, specify preallocated storage for the
1734    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1735 
1736    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1737    the figure below we depict these three local rows and all columns (0-11).
1738 
1739 $          0 1 2 3 4 5 6 7 8 9 10 11
1740 $         -------------------
1741 $  row 3  |  o o o d d d o o o o o o
1742 $  row 4  |  o o o d d d o o o o o o
1743 $  row 5  |  o o o d d d o o o o o o
1744 $         -------------------
1745 $
1746 
1747    Thus, any entries in the d locations are stored in the d (diagonal)
1748    submatrix, and any entries in the o locations are stored in the
1749    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1750    stored simply in the MATSEQBAIJ format for compressed row storage.
1751 
1752    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1753    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1754    In general, for PDE problems in which most nonzeros are near the diagonal,
1755    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1756    or you will get TERRIBLE performance; see the users' manual chapter on
1757    matrices.
1758 
1759 .keywords: matrix, block, aij, compressed row, sparse, parallel
1760 
1761 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1762 @*/
1763 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1764                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1765 {
1766   Mat          B;
1767   Mat_MPIBAIJ  *b;
1768   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1769 
1770   PetscFunctionBegin;
1771   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");
1772 
1773   MPI_Comm_size(comm,&size);
1774   if (size == 1) {
1775     if (M == PETSC_DECIDE) M = m;
1776     if (N == PETSC_DECIDE) N = n;
1777     ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1778     PetscFunctionReturn(0);
1779   }
1780 
1781   *A = 0;
1782   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
1783   PLogObjectCreate(B);
1784   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1785   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1786   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1787 
1788   B->destroy    = MatDestroy_MPIBAIJ;
1789   B->view       = MatView_MPIBAIJ;
1790   B->mapping    = 0;
1791   B->factor     = 0;
1792   B->assembled  = PETSC_FALSE;
1793 
1794   B->insertmode = NOT_SET_VALUES;
1795   MPI_Comm_rank(comm,&b->rank);
1796   MPI_Comm_size(comm,&b->size);
1797 
1798   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1799     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1800   }
1801   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
1802     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
1803   }
1804   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
1805     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
1806   }
1807   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
1808   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1809 
1810   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1811     work[0] = m; work[1] = n;
1812     mbs = m/bs; nbs = n/bs;
1813     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1814     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1815     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1816   }
1817   if (m == PETSC_DECIDE) {
1818     Mbs = M/bs;
1819     if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
1820     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1821     m   = mbs*bs;
1822   }
1823   if (n == PETSC_DECIDE) {
1824     Nbs = N/bs;
1825     if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
1826     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1827     n   = nbs*bs;
1828   }
1829   if (mbs*bs != m || nbs*bs != n) {
1830     SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
1831   }
1832 
1833   b->m = m; B->m = m;
1834   b->n = n; B->n = n;
1835   b->N = N; B->N = N;
1836   b->M = M; B->M = M;
1837   b->bs  = bs;
1838   b->bs2 = bs*bs;
1839   b->mbs = mbs;
1840   b->nbs = nbs;
1841   b->Mbs = Mbs;
1842   b->Nbs = Nbs;
1843 
1844   /* build local table of row and column ownerships */
1845   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1846   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1847   b->cowners = b->rowners + b->size + 2;
1848   ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1849   b->rowners[0] = 0;
1850   for ( i=2; i<=b->size; i++ ) {
1851     b->rowners[i] += b->rowners[i-1];
1852   }
1853   b->rstart    = b->rowners[b->rank];
1854   b->rend      = b->rowners[b->rank+1];
1855   b->rstart_bs = b->rstart * bs;
1856   b->rend_bs   = b->rend * bs;
1857 
1858   ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1859   b->cowners[0] = 0;
1860   for ( i=2; i<=b->size; i++ ) {
1861     b->cowners[i] += b->cowners[i-1];
1862   }
1863   b->cstart    = b->cowners[b->rank];
1864   b->cend      = b->cowners[b->rank+1];
1865   b->cstart_bs = b->cstart * bs;
1866   b->cend_bs   = b->cend * bs;
1867 
1868 
1869   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1870   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1871   PLogObjectParent(B,b->A);
1872   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1873   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1874   PLogObjectParent(B,b->B);
1875 
1876   /* build cache for off array entries formed */
1877   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1878   b->donotstash  = 0;
1879   b->colmap      = 0;
1880   b->garray      = 0;
1881   b->roworiented = 1;
1882 
1883   /* stuff used in block assembly */
1884   b->barray       = 0;
1885 
1886   /* stuff used for matrix vector multiply */
1887   b->lvec         = 0;
1888   b->Mvctx        = 0;
1889 
1890   /* stuff for MatGetRow() */
1891   b->rowindices   = 0;
1892   b->rowvalues    = 0;
1893   b->getrowactive = PETSC_FALSE;
1894 
1895   /* hash table stuff */
1896   b->ht          = 0;
1897   b->ht_size     = 0;
1898 
1899   *A = B;
1900   PetscFunctionReturn(0);
1901 }
1902 
1903 #undef __FUNC__
1904 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1905 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1906 {
1907   Mat         mat;
1908   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1909   int         ierr, len=0, flg;
1910 
1911   PetscFunctionBegin;
1912   *newmat       = 0;
1913   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1914   PLogObjectCreate(mat);
1915   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1916   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1917   mat->destroy    = MatDestroy_MPIBAIJ;
1918   mat->view       = MatView_MPIBAIJ;
1919   mat->factor     = matin->factor;
1920   mat->assembled  = PETSC_TRUE;
1921 
1922   a->m = mat->m   = oldmat->m;
1923   a->n = mat->n   = oldmat->n;
1924   a->M = mat->M   = oldmat->M;
1925   a->N = mat->N   = oldmat->N;
1926 
1927   a->bs  = oldmat->bs;
1928   a->bs2 = oldmat->bs2;
1929   a->mbs = oldmat->mbs;
1930   a->nbs = oldmat->nbs;
1931   a->Mbs = oldmat->Mbs;
1932   a->Nbs = oldmat->Nbs;
1933 
1934   a->rstart       = oldmat->rstart;
1935   a->rend         = oldmat->rend;
1936   a->cstart       = oldmat->cstart;
1937   a->cend         = oldmat->cend;
1938   a->size         = oldmat->size;
1939   a->rank         = oldmat->rank;
1940   mat->insertmode = NOT_SET_VALUES;
1941   a->rowvalues    = 0;
1942   a->getrowactive = PETSC_FALSE;
1943   a->barray       = 0;
1944 
1945   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1946   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1947   a->cowners = a->rowners + a->size + 2;
1948   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1949   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1950   if (oldmat->colmap) {
1951     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1952     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1953     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1954   } else a->colmap = 0;
1955   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1956     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1957     PLogObjectMemory(mat,len*sizeof(int));
1958     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1959   } else a->garray = 0;
1960 
1961   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1962   PLogObjectParent(mat,a->lvec);
1963   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1964   PLogObjectParent(mat,a->Mvctx);
1965   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1966   PLogObjectParent(mat,a->A);
1967   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1968   PLogObjectParent(mat,a->B);
1969   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1970   if (flg) {
1971     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1972   }
1973   *newmat = mat;
1974   PetscFunctionReturn(0);
1975 }
1976 
1977 #include "sys.h"
1978 
1979 #undef __FUNC__
1980 #define __FUNC__ "MatLoad_MPIBAIJ"
1981 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1982 {
1983   Mat          A;
1984   int          i, nz, ierr, j,rstart, rend, fd;
1985   Scalar       *vals,*buf;
1986   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1987   MPI_Status   status;
1988   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1989   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1990   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1991   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1992   int          dcount,kmax,k,nzcount,tmp;
1993 
1994   PetscFunctionBegin;
1995   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1996   bs2  = bs*bs;
1997 
1998   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1999   if (!rank) {
2000     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2001     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2002     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2003     if (header[3] < 0) {
2004       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
2005     }
2006   }
2007 
2008   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2009   M = header[1]; N = header[2];
2010 
2011   if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");
2012 
2013   /*
2014      This code adds extra rows to make sure the number of rows is
2015      divisible by the blocksize
2016   */
2017   Mbs        = M/bs;
2018   extra_rows = bs - M + bs*(Mbs);
2019   if (extra_rows == bs) extra_rows = 0;
2020   else                  Mbs++;
2021   if (extra_rows &&!rank) {
2022     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
2023   }
2024 
2025   /* determine ownership of all rows */
2026   mbs = Mbs/size + ((Mbs % size) > rank);
2027   m   = mbs * bs;
2028   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
2029   browners = rowners + size + 1;
2030   ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2031   rowners[0] = 0;
2032   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
2033   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
2034   rstart = rowners[rank];
2035   rend   = rowners[rank+1];
2036 
2037   /* distribute row lengths to all processors */
2038   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
2039   if (!rank) {
2040     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
2041     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2042     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2043     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2044     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
2045     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2046     PetscFree(sndcounts);
2047   } else {
2048     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
2049   }
2050 
2051   if (!rank) {
2052     /* calculate the number of nonzeros on each processor */
2053     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2054     PetscMemzero(procsnz,size*sizeof(int));
2055     for ( i=0; i<size; i++ ) {
2056       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
2057         procsnz[i] += rowlengths[j];
2058       }
2059     }
2060     PetscFree(rowlengths);
2061 
2062     /* determine max buffer needed and allocate it */
2063     maxnz = 0;
2064     for ( i=0; i<size; i++ ) {
2065       maxnz = PetscMax(maxnz,procsnz[i]);
2066     }
2067     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2068 
2069     /* read in my part of the matrix column indices  */
2070     nz = procsnz[0];
2071     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2072     mycols = ibuf;
2073     if (size == 1)  nz -= extra_rows;
2074     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2075     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2076 
2077     /* read in every ones (except the last) and ship off */
2078     for ( i=1; i<size-1; i++ ) {
2079       nz   = procsnz[i];
2080       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2081       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2082     }
2083     /* read in the stuff for the last proc */
2084     if ( size != 1 ) {
2085       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2086       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2087       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
2088       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2089     }
2090     PetscFree(cols);
2091   } else {
2092     /* determine buffer space needed for message */
2093     nz = 0;
2094     for ( i=0; i<m; i++ ) {
2095       nz += locrowlens[i];
2096     }
2097     ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
2098     mycols = ibuf;
2099     /* receive message of column indices*/
2100     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2101     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2102     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2103   }
2104 
2105   /* loop over local rows, determining number of off diagonal entries */
2106   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
2107   odlens = dlens + (rend-rstart);
2108   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
2109   PetscMemzero(mask,3*Mbs*sizeof(int));
2110   masked1 = mask    + Mbs;
2111   masked2 = masked1 + Mbs;
2112   rowcount = 0; nzcount = 0;
2113   for ( i=0; i<mbs; i++ ) {
2114     dcount  = 0;
2115     odcount = 0;
2116     for ( j=0; j<bs; j++ ) {
2117       kmax = locrowlens[rowcount];
2118       for ( k=0; k<kmax; k++ ) {
2119         tmp = mycols[nzcount++]/bs;
2120         if (!mask[tmp]) {
2121           mask[tmp] = 1;
2122           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
2123           else masked1[dcount++] = tmp;
2124         }
2125       }
2126       rowcount++;
2127     }
2128 
2129     dlens[i]  = dcount;
2130     odlens[i] = odcount;
2131 
2132     /* zero out the mask elements we set */
2133     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
2134     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
2135   }
2136 
2137   /* create our matrix */
2138   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
2139          CHKERRQ(ierr);
2140   A = *newmat;
2141   MatSetOption(A,MAT_COLUMNS_SORTED);
2142 
2143   if (!rank) {
2144     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
2145     /* read in my part of the matrix numerical values  */
2146     nz = procsnz[0];
2147     vals = buf;
2148     mycols = ibuf;
2149     if (size == 1)  nz -= extra_rows;
2150     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2151     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2152 
2153     /* insert into matrix */
2154     jj      = rstart*bs;
2155     for ( i=0; i<m; i++ ) {
2156       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2157       mycols += locrowlens[i];
2158       vals   += locrowlens[i];
2159       jj++;
2160     }
2161     /* read in other processors (except the last one) and ship out */
2162     for ( i=1; i<size-1; i++ ) {
2163       nz   = procsnz[i];
2164       vals = buf;
2165       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2166       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2167     }
2168     /* the last proc */
2169     if ( size != 1 ){
2170       nz   = procsnz[i] - extra_rows;
2171       vals = buf;
2172       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2173       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
2174       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2175     }
2176     PetscFree(procsnz);
2177   } else {
2178     /* receive numeric values */
2179     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
2180 
2181     /* receive message of values*/
2182     vals   = buf;
2183     mycols = ibuf;
2184     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2185     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2186     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2187 
2188     /* insert into matrix */
2189     jj      = rstart*bs;
2190     for ( i=0; i<m; i++ ) {
2191       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2192       mycols += locrowlens[i];
2193       vals   += locrowlens[i];
2194       jj++;
2195     }
2196   }
2197   PetscFree(locrowlens);
2198   PetscFree(buf);
2199   PetscFree(ibuf);
2200   PetscFree(rowners);
2201   PetscFree(dlens);
2202   PetscFree(mask);
2203   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2204   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2205   PetscFunctionReturn(0);
2206 }
2207 
2208 
2209