xref: /petsc/src/mat/impls/sbaij/mpi/mpisbaij.c (revision d643ce63cf9b1fe2cd4ab306b8154e7105d3fb3c)
1 /*$Id: mpisbaij.c,v 1.61 2001/08/10 03:31:37 bsmith Exp $*/
2 
3 #include "src/mat/impls/baij/mpi/mpibaij.h"    /*I "petscmat.h" I*/
4 #include "src/vec/vecimpl.h"
5 #include "mpisbaij.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 extern int MatSetUpMultiply_MPISBAIJ(Mat);
9 extern int DisAssemble_MPISBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPISBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPISBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
12 extern int MatGetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *);
13 extern int MatSetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *,InsertMode);
14 extern int MatSetValuesBlocked_SeqSBAIJ(Mat,int,int*,int,int*,PetscScalar*,InsertMode);
15 extern int MatGetRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
16 extern int MatRestoreRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
17 extern int MatPrintHelp_SeqSBAIJ(Mat);
18 extern int MatZeroRows_SeqSBAIJ(Mat,IS,PetscScalar*);
19 extern int MatZeroRows_SeqBAIJ(Mat,IS,PetscScalar *);
20 extern int MatGetRowMax_MPISBAIJ(Mat,Vec);
21 
22 /*  UGLY, ugly, ugly
23    When MatScalar == PetscScalar the function MatSetValuesBlocked_MPISBAIJ_MatScalar() does
24    not exist. Otherwise ..._MatScalar() takes matrix elements in single precision and
25    inserts them into the single precision data structure. The function MatSetValuesBlocked_MPISBAIJ()
26    converts the entries into single precision and then calls ..._MatScalar() to put them
27    into the single precision data structures.
28 */
29 #if defined(PETSC_USE_MAT_SINGLE)
30 extern int MatSetValuesBlocked_SeqSBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
31 extern int MatSetValues_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
32 extern int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
33 extern int MatSetValues_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
34 extern int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
35 #else
36 #define MatSetValuesBlocked_SeqSBAIJ_MatScalar      MatSetValuesBlocked_SeqSBAIJ
37 #define MatSetValues_MPISBAIJ_MatScalar             MatSetValues_MPISBAIJ
38 #define MatSetValuesBlocked_MPISBAIJ_MatScalar      MatSetValuesBlocked_MPISBAIJ
39 #define MatSetValues_MPISBAIJ_HT_MatScalar          MatSetValues_MPISBAIJ_HT
40 #define MatSetValuesBlocked_MPISBAIJ_HT_MatScalar   MatSetValuesBlocked_MPISBAIJ_HT
41 #endif
42 
43 EXTERN_C_BEGIN
44 #undef __FUNCT__
45 #define __FUNCT__ "MatStoreValues_MPISBAIJ"
46 int MatStoreValues_MPISBAIJ(Mat mat)
47 {
48   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
49   int          ierr;
50 
51   PetscFunctionBegin;
52   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
53   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
54   PetscFunctionReturn(0);
55 }
56 EXTERN_C_END
57 
58 EXTERN_C_BEGIN
59 #undef __FUNCT__
60 #define __FUNCT__ "MatRetrieveValues_MPISBAIJ"
61 int MatRetrieveValues_MPISBAIJ(Mat mat)
62 {
63   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
64   int          ierr;
65 
66   PetscFunctionBegin;
67   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
68   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
69   PetscFunctionReturn(0);
70 }
71 EXTERN_C_END
72 
73 /*
74      Local utility routine that creates a mapping from the global column
75    number to the local number in the off-diagonal part of the local
76    storage of the matrix.  This is done in a non-scalable way since the
77    length of colmap equals the global matrix length.
78 */
79 #undef __FUNCT__
80 #define __FUNCT__ "CreateColmap_MPISBAIJ_Private"
81 static int CreateColmap_MPISBAIJ_Private(Mat mat)
82 {
83   PetscFunctionBegin;
84   SETERRQ(1,"Function not yet written for SBAIJ format");
85   /* PetscFunctionReturn(0); */
86 }
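/*
   A minimal sketch of what this routine would do, modelled on the MPIBAIJ
   analogue (an assumption for the SBAIJ case, not an implementation): allocate
   colmap with one slot per global block column and record, 1-based, where each
   off-diagonal block column of B lives locally, so that 0 means "not present".

     Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
     Mat_SeqBAIJ  *B    = (Mat_SeqBAIJ*)baij->B->data;  /* Nbs, nbs, garray assumed as in the BAIJ structures */
     int          i,ierr,bs = baij->bs;

     ierr = PetscMalloc((baij->Nbs+1)*sizeof(int),&baij->colmap);CHKERRQ(ierr);
     ierr = PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));CHKERRQ(ierr);
     for (i=0; i<B->nbs; i++) baij->colmap[baij->garray[i]] = i*bs+1;
*/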
87 
88 #define CHUNKSIZE  10
89 
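/*
   The two macros below insert one scalar into the local diagonal (A) or
   off-diagonal (B) part: compute the block row (row/bs), narrow the position
   of the block column with a coarse binary search plus a short linear scan,
   and either update the existing block entry or shift the later blocks in the
   row and insert a new one, growing the row storage by CHUNKSIZE blocks when
   the row is full.
*/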
90 #define  MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv) \
91 { \
92  \
93     brow = row/bs;  \
94     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
95     rmax = aimax[brow]; nrow = ailen[brow]; \
96       bcol = col/bs; \
97       ridx = row % bs; cidx = col % bs; \
98       low = 0; high = nrow; \
99       while (high-low > 3) { \
100         t = (low+high)/2; \
101         if (rp[t] > bcol) high = t; \
102         else              low  = t; \
103       } \
104       for (_i=low; _i<high; _i++) { \
105         if (rp[_i] > bcol) break; \
106         if (rp[_i] == bcol) { \
107           bap  = ap +  bs2*_i + bs*cidx + ridx; \
108           if (addv == ADD_VALUES) *bap += value;  \
109           else                    *bap  = value;  \
110           goto a_noinsert; \
111         } \
112       } \
113       if (a->nonew == 1) goto a_noinsert; \
114       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
115       if (nrow >= rmax) { \
116         /* there is no extra room in row, therefore enlarge */ \
117         int       new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
118         MatScalar *new_a; \
119  \
120         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
121  \
122         /* malloc new storage space */ \
123         len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(a->mbs+1)*sizeof(int); \
124         ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
125         new_j = (int*)(new_a + bs2*new_nz); \
126         new_i = new_j + new_nz; \
127  \
128         /* copy over old data into new slots */ \
129         for (ii=0; ii<brow+1; ii++) {new_i[ii] = ai[ii];} \
130         for (ii=brow+1; ii<a->mbs+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
131         ierr = PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
132         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
133         ierr = PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
134         ierr = PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
135         ierr = PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
136         ierr = PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
137                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
138         /* free up old matrix storage */ \
139         ierr = PetscFree(a->a);CHKERRQ(ierr);  \
140         if (!a->singlemalloc) { \
141           ierr = PetscFree(a->i);CHKERRQ(ierr); \
142           ierr = PetscFree(a->j);CHKERRQ(ierr);\
143         } \
144         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
145         a->singlemalloc = PETSC_TRUE; \
146  \
147         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
148         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
149         PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
150         a->s_maxnz += bs2*CHUNKSIZE; \
151         a->reallocs++; \
152         a->s_nz++; \
153       } \
154       N = nrow++ - 1;  \
155       /* shift up all the later entries in this row */ \
156       for (ii=N; ii>=_i; ii--) { \
157         rp[ii+1] = rp[ii]; \
158         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
159       } \
160       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr); }  \
161       rp[_i]                      = bcol;  \
162       ap[bs2*_i + bs*cidx + ridx] = value;  \
163       a_noinsert:; \
164     ailen[brow] = nrow; \
165 }
166 #ifndef MatSetValues_SeqSBAIJ_B_Private
167 #define  MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv) \
168 { \
169     brow = row/bs;  \
170     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
171     rmax = bimax[brow]; nrow = bilen[brow]; \
172       bcol = col/bs; \
173       ridx = row % bs; cidx = col % bs; \
174       low = 0; high = nrow; \
175       while (high-low > 3) { \
176         t = (low+high)/2; \
177         if (rp[t] > bcol) high = t; \
178         else              low  = t; \
179       } \
180       for (_i=low; _i<high; _i++) { \
181         if (rp[_i] > bcol) break; \
182         if (rp[_i] == bcol) { \
183           bap  = ap +  bs2*_i + bs*cidx + ridx; \
184           if (addv == ADD_VALUES) *bap += value;  \
185           else                    *bap  = value;  \
186           goto b_noinsert; \
187         } \
188       } \
189       if (b->nonew == 1) goto b_noinsert; \
190       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
191       if (nrow >= rmax) { \
192         /* there is no extra room in row, therefore enlarge */ \
193         int       new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
194         MatScalar *new_a; \
195  \
196         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
197  \
198         /* malloc new storage space */ \
199         len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(b->mbs+1)*sizeof(int); \
200         ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
201         new_j = (int*)(new_a + bs2*new_nz); \
202         new_i = new_j + new_nz; \
203  \
204         /* copy over old data into new slots */ \
205         for (ii=0; ii<brow+1; ii++) {new_i[ii] = bi[ii];} \
206         for (ii=brow+1; ii<b->mbs+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
207         ierr = PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
208         len  = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
209         ierr = PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
210         ierr = PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
211         ierr = PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
212         ierr = PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
213                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
214         /* free up old matrix storage */ \
215         ierr = PetscFree(b->a);CHKERRQ(ierr);  \
216         if (!b->singlemalloc) { \
217           ierr = PetscFree(b->i);CHKERRQ(ierr); \
218           ierr = PetscFree(b->j);CHKERRQ(ierr); \
219         } \
220         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
221         b->singlemalloc = PETSC_TRUE; \
222  \
223         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
224         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
225         PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
226         b->maxnz += bs2*CHUNKSIZE; \
227         b->reallocs++; \
228         b->nz++; \
229       } \
230       N = nrow++ - 1;  \
231       /* shift up all the later entries in this row */ \
232       for (ii=N; ii>=_i; ii--) { \
233         rp[ii+1] = rp[ii]; \
234         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
235       } \
236       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr);}  \
237       rp[_i]                      = bcol;  \
238       ap[bs2*_i + bs*cidx + ridx] = value;  \
239       b_noinsert:; \
240     bilen[brow] = nrow; \
241 }
242 #endif
243 
244 #if defined(PETSC_USE_MAT_SINGLE)
245 #undef __FUNCT__
246 #define __FUNCT__ "MatSetValues_MPISBAIJ"
247 int MatSetValues_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
248 {
249   Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
250   int          ierr,i,N = m*n;
251   MatScalar    *vsingle;
252 
253   PetscFunctionBegin;
254   if (N > b->setvalueslen) {
255     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
256     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
257     b->setvalueslen  = N;
258   }
259   vsingle = b->setvaluescopy;
260 
261   for (i=0; i<N; i++) {
262     vsingle[i] = v[i];
263   }
264   ierr = MatSetValues_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
265   PetscFunctionReturn(0);
266 }
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
270 int MatSetValuesBlocked_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
271 {
272   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
273   int         ierr,i,N = m*n*b->bs2;
274   MatScalar   *vsingle;
275 
276   PetscFunctionBegin;
277   if (N > b->setvalueslen) {
278     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
279     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
280     b->setvalueslen  = N;
281   }
282   vsingle = b->setvaluescopy;
283   for (i=0; i<N; i++) {
284     vsingle[i] = v[i];
285   }
286   ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNCT__
291 #define __FUNCT__ "MatSetValues_MPISBAIJ_HT"
292 int MatSetValues_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
293 {
294   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
295   int         ierr,i,N = m*n;
296   MatScalar   *vsingle;
297 
298   PetscFunctionBegin;
299   SETERRQ(1,"Function not yet written for SBAIJ format");
300   /* PetscFunctionReturn(0); */
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT"
305 int MatSetValuesBlocked_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
306 {
307   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
308   int         ierr,i,N = m*n*b->bs2;
309   MatScalar   *vsingle;
310 
311   PetscFunctionBegin;
312   SETERRQ(1,"Function not yet written for SBAIJ format");
313   /* PetscFunctionReturn(0); */
314 }
315 #endif
316 
317 /* Only add/insert a(i,j) with i<=j (blocks).
318    Any a(i,j) with i>j input by the user is ignored.
319 */
320 #undef __FUNCT__
321 #define __FUNCT__ "MatSetValues_MPISBAIJ_MatScalar"
322 int MatSetValues_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
323 {
324   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
325   MatScalar    value;
326   PetscTruth   roworiented = baij->roworiented;
327   int          ierr,i,j,row,col;
328   int          rstart_orig=baij->rstart_bs;
329   int          rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
330   int          cend_orig=baij->cend_bs,bs=baij->bs;
331 
332   /* Some Variables required in the macro */
333   Mat          A = baij->A;
334   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(A)->data;
335   int          *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
336   MatScalar    *aa=a->a;
337 
338   Mat          B = baij->B;
339   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(B)->data;
340   int          *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
341   MatScalar    *ba=b->a;
342 
343   int          *rp,ii,nrow,_i,rmax,N,brow,bcol;
344   int          low,high,t,ridx,cidx,bs2=a->bs2;
345   MatScalar    *ap,*bap;
346 
347   /* for stash */
348   int          n_loc, *in_loc=0;
349   MatScalar    *v_loc=0;
350 
351   PetscFunctionBegin;
352 
353   if(!baij->donotstash){
354     ierr = PetscMalloc(n*sizeof(int),&in_loc);CHKERRQ(ierr);
355     ierr = PetscMalloc(n*sizeof(MatScalar),&v_loc);CHKERRQ(ierr);
356   }
357 
358   for (i=0; i<m; i++) {
359     if (im[i] < 0) continue;
360 #if defined(PETSC_USE_BOPT_g)
361     if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
362 #endif
363     if (im[i] >= rstart_orig && im[i] < rend_orig) { /* this processor entry */
364       row = im[i] - rstart_orig;              /* local row index */
365       for (j=0; j<n; j++) {
366         if (im[i]/bs > in[j]/bs) continue;    /* ignore lower triangular blocks */
367         if (in[j] >= cstart_orig && in[j] < cend_orig){  /* diag entry (A) */
368           col = in[j] - cstart_orig;          /* local col index */
369           brow = row/bs; bcol = col/bs;
370           if (brow > bcol) continue;  /* ignore lower triangular blocks of A */
371           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
372           MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv);
373           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
374         } else if (in[j] < 0) continue;
375 #if defined(PETSC_USE_BOPT_g)
376         else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Col too large");}
377 #endif
378         else {  /* off-diag entry (B) */
379           if (mat->was_assembled) {
380             if (!baij->colmap) {
381               ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
382             }
383 #if defined (PETSC_USE_CTABLE)
384             ierr = PetscTableFind(baij->colmap,in[j]/bs + 1,&col);CHKERRQ(ierr);
385             col  = col - 1 + in[j]%bs;
386 #else
387             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
388 #endif
389             if (col < 0 && !((Mat_SeqSBAIJ*)(baij->A->data))->nonew) {
390               ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
391               col =  in[j];
392               /* Reinitialize the variables required by MatSetValues_SeqSBAIJ_B_Private() */
393               B = baij->B;
394               b = (Mat_SeqBAIJ*)(B)->data;
395               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
396               ba=b->a;
397             }
398           } else col = in[j];
399           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
400           MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv);
401           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
402         }
403       }
404     } else {  /* off processor entry */
405       if (!baij->donotstash) {
406         n_loc = 0;
407         for (j=0; j<n; j++){
408           if (im[i]/bs > in[j]/bs) continue; /* ignore lower triangular blocks */
409           in_loc[n_loc] = in[j];
410           if (roworiented) {
411             v_loc[n_loc] = v[i*n+j];
412           } else {
413             v_loc[n_loc] = v[j*m+i];
414           }
415           n_loc++;
416         }
417         ierr = MatStashValuesRow_Private(&mat->stash,im[i],n_loc,in_loc,v_loc);CHKERRQ(ierr);
418       }
419     }
420   }
421 
422   if(!baij->donotstash){
423     ierr = PetscFree(in_loc);CHKERRQ(ierr);
424     ierr = PetscFree(v_loc);CHKERRQ(ierr);
425   }
426   PetscFunctionReturn(0);
427 }
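/*
   Usage sketch (illustrative only, not part of this file), with bs = 1: since
   entries whose block row exceeds their block column are silently dropped, a
   caller only supplies the upper triangle; the mirror entries are implied by
   symmetry.

     int         row = 3, cols[2] = {3,7};
     PetscScalar vals[2] = {4.0,-1.0};
     ierr = MatSetValues(A,1,&row,2,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/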
428 
429 #undef __FUNCT__
430 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_MatScalar"
431 int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
432 {
433   PetscFunctionBegin;
434   SETERRQ(1,"Function not yet written for SBAIJ format");
435   /* PetscFunctionReturn(0); */
436 }
437 
438 #define HASH_KEY 0.6180339887
439 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
440 /* #define HASH(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
441 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
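/*
   HASH() is a multiplicative hash: multiply the key by the golden-ratio
   constant, keep the fractional part, and scale by the table size.  For
   example, HASH(100,7,tmp): 7*0.6180339887 = 4.326..., fractional part
   0.326..., scaled by 100 gives slot 32.
*/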
442 #undef __FUNCT__
443 #define __FUNCT__ "MatSetValues_MPISBAIJ_HT_MatScalar"
444 int MatSetValues_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
445 {
446   PetscFunctionBegin;
447   SETERRQ(1,"Function not yet written for SBAIJ format");
448   /* PetscFunctionReturn(0); */
449 }
450 
451 #undef __FUNCT__
452 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT_MatScalar"
453 int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
454 {
455   PetscFunctionBegin;
456   SETERRQ(1,"Function not yet written for SBAIJ format");
457   /* PetscFunctionReturn(0); */
458 }
459 
460 #undef __FUNCT__
461 #define __FUNCT__ "MatGetValues_MPISBAIJ"
462 int MatGetValues_MPISBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,PetscScalar *v)
463 {
464   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
465   int          bs=baij->bs,ierr,i,j,bsrstart = baij->rstart*bs,bsrend = baij->rend*bs;
466   int          bscstart = baij->cstart*bs,bscend = baij->cend*bs,row,col,data;
467 
468   PetscFunctionBegin;
469   for (i=0; i<m; i++) {
470     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
471     if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
472     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
473       row = idxm[i] - bsrstart;
474       for (j=0; j<n; j++) {
475         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
476         if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
477         if (idxn[j] >= bscstart && idxn[j] < bscend){
478           col = idxn[j] - bscstart;
479           ierr = MatGetValues_SeqSBAIJ(baij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
480         } else {
481           if (!baij->colmap) {
482             ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
483           }
484 #if defined (PETSC_USE_CTABLE)
485           ierr = PetscTableFind(baij->colmap,idxn[j]/bs+1,&data);CHKERRQ(ierr);
486           data --;
487 #else
488           data = baij->colmap[idxn[j]/bs]-1;
489 #endif
490           if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
491           else {
492             col  = data + idxn[j]%bs;
493             ierr = MatGetValues_SeqSBAIJ(baij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
494           }
495         }
496       }
497     } else {
498       SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
499     }
500   }
501   PetscFunctionReturn(0);
502 }
503 
504 #undef __FUNCT__
505 #define __FUNCT__ "MatNorm_MPISBAIJ"
506 int MatNorm_MPISBAIJ(Mat mat,NormType type,PetscReal *norm)
507 {
508   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
509   /* Mat_SeqSBAIJ *amat = (Mat_SeqSBAIJ*)baij->A->data; */
510   /* Mat_SeqBAIJ  *bmat = (Mat_SeqBAIJ*)baij->B->data; */
511   int        ierr;
512   PetscReal  sum[2],*lnorm2;
513 
514   PetscFunctionBegin;
515   if (baij->size == 1) {
516     ierr =  MatNorm(baij->A,type,norm);CHKERRQ(ierr);
517   } else {
518     if (type == NORM_FROBENIUS) {
519       ierr = PetscMalloc(2*sizeof(PetscReal),&lnorm2);CHKERRQ(ierr);
520       ierr =  MatNorm(baij->A,type,lnorm2);CHKERRQ(ierr);
521       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2++;            /* square of norm(A) */
522       ierr =  MatNorm(baij->B,type,lnorm2);CHKERRQ(ierr);
523       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2--;             /* square of norm(B) */
524       /*
525       ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
526       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], lnorm2=%g, %g\n",rank,lnorm2[0],lnorm2[1]);
527       */
528       ierr = MPI_Allreduce(lnorm2,&sum,2,MPIU_REAL,MPI_SUM,mat->comm);CHKERRQ(ierr);
529       /*
530       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], sum=%g, %g\n",rank,sum[0],sum[1]);
531       PetscSynchronizedFlush(PETSC_COMM_WORLD); */
532 
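      /* sum[0] is the squared Frobenius norm of the local diagonal block A,
         sum[1] that of the local off-diagonal block B; each stored entry of B
         has an unstored symmetric counterpart, hence the factor of 2 below */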
533       *norm = sqrt(sum[0] + 2*sum[1]);
534       ierr = PetscFree(lnorm2);CHKERRQ(ierr);
535     } else {
536       SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
537     }
538   }
539   PetscFunctionReturn(0);
540 }
541 
542 /*
543   Creates and populates the hash table.
544   This table is created only once.
545   If new entries need to be added to the matrix
546   then the hash table has to be destroyed and
547   recreated.
548 */
549 #undef __FUNCT__
550 #define __FUNCT__ "MatCreateHashTable_MPISBAIJ_Private"
551 int MatCreateHashTable_MPISBAIJ_Private(Mat mat,PetscReal factor)
552 {
553   PetscFunctionBegin;
554   SETERRQ(1,"Function not yet written for SBAIJ format");
555   /* PetscFunctionReturn(0); */
556 }
557 
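/*
   Off-process entries cached by MatSetValues() sit in mat->stash (scalar
   entries) and mat->bstash (blocked entries).  MatAssemblyBegin starts the
   communication that ships them to their owning processes; MatAssemblyEnd
   below drains the incoming messages and inserts the values locally.
*/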
558 #undef __FUNCT__
559 #define __FUNCT__ "MatAssemblyBegin_MPISBAIJ"
560 int MatAssemblyBegin_MPISBAIJ(Mat mat,MatAssemblyType mode)
561 {
562   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
563   int         ierr,nstash,reallocs;
564   InsertMode  addv;
565 
566   PetscFunctionBegin;
567   if (baij->donotstash) {
568     PetscFunctionReturn(0);
569   }
570 
571   /* make sure all processors are either in INSERTMODE or ADDMODE */
572   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
573   if (addv == (ADD_VALUES|INSERT_VALUES)) {
574     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
575   }
576   mat->insertmode = addv; /* in case this processor had no cache */
577 
578   ierr = MatStashScatterBegin_Private(&mat->stash,baij->rowners_bs);CHKERRQ(ierr);
579   ierr = MatStashScatterBegin_Private(&mat->bstash,baij->rowners);CHKERRQ(ierr);
580   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
581   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
582   ierr = MatStashGetInfo_Private(&mat->bstash,&nstash,&reallocs);CHKERRQ(ierr);
583   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Block-Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
584   PetscFunctionReturn(0);
585 }
586 
587 #undef __FUNCT__
588 #define __FUNCT__ "MatAssemblyEnd_MPISBAIJ"
589 int MatAssemblyEnd_MPISBAIJ(Mat mat,MatAssemblyType mode)
590 {
591   Mat_MPISBAIJ *baij=(Mat_MPISBAIJ*)mat->data;
592   Mat_SeqSBAIJ  *a=(Mat_SeqSBAIJ*)baij->A->data;
593   Mat_SeqBAIJ  *b=(Mat_SeqBAIJ*)baij->B->data;
594   int         i,j,rstart,ncols,n,ierr,flg,bs2=baij->bs2;
595   int         *row,*col,other_disassembled;
596   PetscTruth  r1,r2,r3;
597   MatScalar   *val;
598   InsertMode  addv = mat->insertmode;
599   /* int         rank;*/
600 
601   PetscFunctionBegin;
602   /* remove the 2 lines below later */
603   /*ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr); */
604 
605   if (!baij->donotstash) {
606     while (1) {
607       ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
608       /*
609       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d]: in AssemblyEnd, stash, flg=%d\n",rank,flg);
610       PetscSynchronizedFlush(PETSC_COMM_WORLD);
611       */
612       if (!flg) break;
613 
614       for (i=0; i<n;) {
615         /* Now identify the consecutive vals belonging to the same row */
616         for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
617         if (j < n) ncols = j-i;
618         else       ncols = n-i;
619         /* Now assemble all these values with a single function call */
620         ierr = MatSetValues_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
621         i = j;
622       }
623     }
624     ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
625     /* Now process the block-stash. Since the values are stashed column-oriented,
626        temporarily clear the roworiented flags, and restore the original flags
627        after MatSetValues() */
628     r1 = baij->roworiented;
629     r2 = a->roworiented;
630     r3 = b->roworiented;
631     baij->roworiented = PETSC_FALSE;
632     a->roworiented    = PETSC_FALSE;
633     b->roworiented    = PETSC_FALSE;
634     while (1) {
635       ierr = MatStashScatterGetMesg_Private(&mat->bstash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
636       if (!flg) break;
637 
638       for (i=0; i<n;) {
639         /* Now identify the consecutive vals belonging to the same row */
640         for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
641         if (j < n) ncols = j-i;
642         else       ncols = n-i;
643         ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i*bs2,addv);CHKERRQ(ierr);
644         i = j;
645       }
646     }
647     ierr = MatStashScatterEnd_Private(&mat->bstash);CHKERRQ(ierr);
648     baij->roworiented = r1;
649     a->roworiented    = r2;
650     b->roworiented    = r3;
651   }
652 
653   ierr = MatAssemblyBegin(baij->A,mode);CHKERRQ(ierr);
654   ierr = MatAssemblyEnd(baij->A,mode);CHKERRQ(ierr);
655 
656   /* determine if any processor has disassembled, if so we must
657      also disassemble ourselves, in order that we may reassemble. */
658   /*
659      if nonzero structure of submatrix B cannot change then we know that
660      no processor disassembled thus we can skip this stuff
661   */
662   if (!((Mat_SeqBAIJ*)baij->B->data)->nonew)  {
663     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
664     if (mat->was_assembled && !other_disassembled) {
665       ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
666     }
667   }
668 
669   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
670     ierr = MatSetUpMultiply_MPISBAIJ(mat);CHKERRQ(ierr);
671   }
672   ierr = MatAssemblyBegin(baij->B,mode);CHKERRQ(ierr);
673   ierr = MatAssemblyEnd(baij->B,mode);CHKERRQ(ierr);
674 
675 #if defined(PETSC_USE_BOPT_g)
676   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
677     PetscLogInfo(0,"MatAssemblyEnd_MPISBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",((PetscReal)baij->ht_total_ct)/baij->ht_insert_ct);
678     baij->ht_total_ct  = 0;
679     baij->ht_insert_ct = 0;
680   }
681 #endif
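  /* if -mat_use_hash_table was requested, route later MatSetValues() calls
     through the _HT variants; note these are still unwritten for the SBAIJ
     format and currently raise an error */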
682   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
683     ierr = MatCreateHashTable_MPISBAIJ_Private(mat,baij->ht_fact);CHKERRQ(ierr);
684     mat->ops->setvalues        = MatSetValues_MPISBAIJ_HT;
685     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPISBAIJ_HT;
686   }
687 
688   if (baij->rowvalues) {
689     ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);
690     baij->rowvalues = 0;
691   }
692   PetscFunctionReturn(0);
693 }
694 
695 #undef __FUNCT__
696 #define __FUNCT__ "MatView_MPISBAIJ_ASCIIorDraworSocket"
697 static int MatView_MPISBAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
698 {
699   Mat_MPISBAIJ      *baij = (Mat_MPISBAIJ*)mat->data;
700   int               ierr,bs = baij->bs,size = baij->size,rank = baij->rank;
701   PetscTruth        isascii,isdraw;
702   PetscViewer       sviewer;
703   PetscViewerFormat format;
704 
705   PetscFunctionBegin;
706   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
707   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
708   if (isascii) {
709     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
710     if (format == PETSC_VIEWER_ASCII_INFO_LONG) {
711       MatInfo info;
712       ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
713       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
714       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
715               rank,mat->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
716               baij->bs,(int)info.memory);CHKERRQ(ierr);
717       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
718       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
719       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
720       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
721       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
722       ierr = VecScatterView(baij->Mvctx,viewer);CHKERRQ(ierr);
723       PetscFunctionReturn(0);
724     } else if (format == PETSC_VIEWER_ASCII_INFO) {
725       ierr = PetscViewerASCIIPrintf(viewer,"  block size is %d\n",bs);CHKERRQ(ierr);
726       PetscFunctionReturn(0);
727     }
728   }
729 
730   if (isdraw) {
731     PetscDraw       draw;
732     PetscTruth isnull;
733     ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
734     ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
735   }
736 
737   if (size == 1) {
738     ierr = PetscObjectSetName((PetscObject)baij->A,mat->name);CHKERRQ(ierr);
739     ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
740   } else {
741     /* assemble the entire matrix onto first processor. */
742     Mat         A;
743     Mat_SeqSBAIJ *Aloc;
744     Mat_SeqBAIJ *Bloc;
745     int         M = mat->M,N = mat->N,*ai,*aj,col,i,j,k,*rvals,mbs = baij->mbs;
746     MatScalar   *a;
747 
748     if (!rank) {
749       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
750     } else {
751       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
752     }
753     PetscLogObjectParent(mat,A);
754 
755     /* copy over the A part */
756     Aloc  = (Mat_SeqSBAIJ*)baij->A->data;
757     ai    = Aloc->i; aj = Aloc->j; a = Aloc->a;
758     ierr  = PetscMalloc(bs*sizeof(int),&rvals);CHKERRQ(ierr);
759 
760     for (i=0; i<mbs; i++) {
761       rvals[0] = bs*(baij->rstart + i);
762       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
763       for (j=ai[i]; j<ai[i+1]; j++) {
764         col = (baij->cstart+aj[j])*bs;
765         for (k=0; k<bs; k++) {
766           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
767           col++; a += bs;
768         }
769       }
770     }
771     /* copy over the B part */
772     Bloc = (Mat_SeqBAIJ*)baij->B->data;
773     ai = Bloc->i; aj = Bloc->j; a = Bloc->a;
774     for (i=0; i<mbs; i++) {
775       rvals[0] = bs*(baij->rstart + i);
776       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
777       for (j=ai[i]; j<ai[i+1]; j++) {
778         col = baij->garray[aj[j]]*bs;
779         for (k=0; k<bs; k++) {
780           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
781           col++; a += bs;
782         }
783       }
784     }
785     ierr = PetscFree(rvals);CHKERRQ(ierr);
786     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
787     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
788     /*
789        Everyone has to call to draw the matrix since the graphics waits are
790        synchronized across all processors that share the PetscDraw object
791     */
792     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
793     if (!rank) {
794       ierr = PetscObjectSetName((PetscObject)((Mat_MPISBAIJ*)(A->data))->A,mat->name);CHKERRQ(ierr);
795       ierr = MatView(((Mat_MPISBAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
796     }
797     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
798     ierr = MatDestroy(A);CHKERRQ(ierr);
799   }
800   PetscFunctionReturn(0);
801 }
802 
803 #undef __FUNCT__
804 #define __FUNCT__ "MatView_MPISBAIJ"
805 int MatView_MPISBAIJ(Mat mat,PetscViewer viewer)
806 {
807   int        ierr;
808   PetscTruth isascii,isdraw,issocket,isbinary;
809 
810   PetscFunctionBegin;
811   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
812   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
813   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
814   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_BINARY,&isbinary);CHKERRQ(ierr);
815   if (isascii || isdraw || issocket || isbinary) {
816     ierr = MatView_MPISBAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
817   } else {
818     SETERRQ1(1,"Viewer type %s not supported by MPISBAIJ matrices",((PetscObject)viewer)->type_name);
819   }
820   PetscFunctionReturn(0);
821 }
822 
823 #undef __FUNCT__
824 #define __FUNCT__ "MatDestroy_MPISBAIJ"
825 int MatDestroy_MPISBAIJ(Mat mat)
826 {
827   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
828   int         ierr;
829 
830   PetscFunctionBegin;
831 #if defined(PETSC_USE_LOG)
832   PetscLogObjectState((PetscObject)mat,"Rows=%d,Cols=%d",mat->M,mat->N);
833 #endif
834   ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
835   ierr = MatStashDestroy_Private(&mat->bstash);CHKERRQ(ierr);
836   ierr = PetscFree(baij->rowners);CHKERRQ(ierr);
837   ierr = MatDestroy(baij->A);CHKERRQ(ierr);
838   ierr = MatDestroy(baij->B);CHKERRQ(ierr);
839 #if defined (PETSC_USE_CTABLE)
840   if (baij->colmap) {ierr = PetscTableDelete(baij->colmap);CHKERRQ(ierr);}
841 #else
842   if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
843 #endif
844   if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
845   if (baij->lvec)   {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
846   if (baij->Mvctx)  {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
847   if (baij->rowvalues) {ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);}
848   if (baij->barray) {ierr = PetscFree(baij->barray);CHKERRQ(ierr);}
849   if (baij->hd) {ierr = PetscFree(baij->hd);CHKERRQ(ierr);}
850 #if defined(PETSC_USE_MAT_SINGLE)
851   if (baij->setvaluescopy) {ierr = PetscFree(baij->setvaluescopy);CHKERRQ(ierr);}
852 #endif
853   ierr = PetscFree(baij);CHKERRQ(ierr);
854   PetscFunctionReturn(0);
855 }
856 
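/*
   Since only the upper triangular blocks are stored, y = A*x is assembled from
   three pieces: the local diagonal block (a->A)*x, the off-diagonal block
   applied to the gathered ghost values (a->B)*lvec, and the mirrored
   contribution (a->B)^T*x scattered back and added into the rows owned by the
   other processes.
*/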
857 #undef __FUNCT__
858 #define __FUNCT__ "MatMult_MPISBAIJ"
859 int MatMult_MPISBAIJ(Mat A,Vec xx,Vec yy)
860 {
861   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
862   int         ierr,nt;
863 
864   PetscFunctionBegin;
865   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
866   if (nt != A->n) {
867     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
868   }
869   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
870   if (nt != A->m) {
871     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and yy");
872   }
873 
874   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
875   /* do diagonal part */
876   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
877   /* do superdiagonal part */
878   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
879   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
880   /* do subdiagonal part */
881   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
882   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
883   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
884 
885   PetscFunctionReturn(0);
886 }
887 
888 #undef __FUNCT__
889 #define __FUNCT__ "MatMultAdd_MPISBAIJ"
890 int MatMultAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
891 {
892   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
893   int        ierr;
894 
895   PetscFunctionBegin;
896   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
897   /* do diagonal part */
898   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
899   /* do superdiagonal part */
900   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
901   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
902 
903   /* do subdiagonal part */
904   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
905   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
906   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
907 
908   PetscFunctionReturn(0);
909 }
910 
911 #undef __FUNCT__
912 #define __FUNCT__ "MatMultTranspose_MPISBAIJ"
913 int MatMultTranspose_MPISBAIJ(Mat A,Vec xx,Vec yy)
914 {
915   PetscFunctionBegin;
916   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
917   /* PetscFunctionReturn(0); */
918 }
919 
920 #undef __FUNCT__
921 #define __FUNCT__ "MatMultTransposeAdd_MPISBAIJ"
922 int MatMultTransposeAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
923 {
924   PetscFunctionBegin;
925   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
926   /* PetscFunctionReturn(0); */
927 }
928 
929 /*
930   This only works correctly for square matrices where the subblock A->A is the
931    diagonal block
932 */
933 #undef __FUNCT__
934 #define __FUNCT__ "MatGetDiagonal_MPISBAIJ"
935 int MatGetDiagonal_MPISBAIJ(Mat A,Vec v)
936 {
937   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
938   int         ierr;
939 
940   PetscFunctionBegin;
941   /* if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block"); */
942   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
943   PetscFunctionReturn(0);
944 }
945 
946 #undef __FUNCT__
947 #define __FUNCT__ "MatScale_MPISBAIJ"
948 int MatScale_MPISBAIJ(PetscScalar *aa,Mat A)
949 {
950   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
951   int         ierr;
952 
953   PetscFunctionBegin;
954   ierr = MatScale(aa,a->A);CHKERRQ(ierr);
955   ierr = MatScale(aa,a->B);CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 
959 #undef __FUNCT__
960 #define __FUNCT__ "MatGetRow_MPISBAIJ"
961 int MatGetRow_MPISBAIJ(Mat matin,int row,int *nz,int **idx,PetscScalar **v)
962 {
963   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
964   PetscScalar    *vworkA,*vworkB,**pvA,**pvB,*v_p;
965   int            bs = mat->bs,bs2 = mat->bs2,i,ierr,*cworkA,*cworkB,**pcA,**pcB;
966   int            nztot,nzA,nzB,lrow,brstart = mat->rstart*bs,brend = mat->rend*bs;
967   int            *cmap,*idx_p,cstart = mat->cstart;
968 
969   PetscFunctionBegin;
970   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
971   mat->getrowactive = PETSC_TRUE;
972 
973   if (!mat->rowvalues && (idx || v)) {
974     /*
975         allocate enough space to hold information from the longest row.
976     */
977     Mat_SeqSBAIJ *Aa = (Mat_SeqSBAIJ*)mat->A->data;
978     Mat_SeqBAIJ  *Ba = (Mat_SeqBAIJ*)mat->B->data;
979     int     max = 1,mbs = mat->mbs,tmp;
980     for (i=0; i<mbs; i++) {
981       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i]; /* row length */
982       if (max < tmp) { max = tmp; }
983     }
984     ierr = PetscMalloc(max*bs2*(sizeof(int)+sizeof(PetscScalar)),&mat->rowvalues);CHKERRQ(ierr);
985     mat->rowindices = (int*)(mat->rowvalues + max*bs2);
986   }
987 
988   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,"Only local rows");
989   lrow = row - brstart;  /* local row index */
990 
991   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
992   if (!v)   {pvA = 0; pvB = 0;}
993   if (!idx) {pcA = 0; if (!v) pcB = 0;}
994   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
995   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
996   nztot = nzA + nzB;
997 
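  /* cmap[] (garray) translates a local block column of B into its global block column */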
998   cmap  = mat->garray;
999   if (v  || idx) {
1000     if (nztot) {
1001       /* Sort by increasing column numbers, assuming A and B already sorted */
1002       int imark = -1;
1003       if (v) {
1004         *v = v_p = mat->rowvalues;
1005         for (i=0; i<nzB; i++) {
1006           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1007           else break;
1008         }
1009         imark = i;
1010         for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
1011         for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
1012       }
1013       if (idx) {
1014         *idx = idx_p = mat->rowindices;
1015         if (imark > -1) {
1016           for (i=0; i<imark; i++) {
1017             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1018           }
1019         } else {
1020           for (i=0; i<nzB; i++) {
1021             if (cmap[cworkB[i]/bs] < cstart)
1022               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1023             else break;
1024           }
1025           imark = i;
1026         }
1027         for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart*bs + cworkA[i];
1028         for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1029       }
1030     } else {
1031       if (idx) *idx = 0;
1032       if (v)   *v   = 0;
1033     }
1034   }
1035   *nz = nztot;
1036   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1037   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 #undef __FUNCT__
1042 #define __FUNCT__ "MatRestoreRow_MPISBAIJ"
1043 int MatRestoreRow_MPISBAIJ(Mat mat,int row,int *nz,int **idx,PetscScalar **v)
1044 {
1045   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1046 
1047   PetscFunctionBegin;
1048   if (baij->getrowactive == PETSC_FALSE) {
1049     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
1050   }
1051   baij->getrowactive = PETSC_FALSE;
1052   PetscFunctionReturn(0);
1053 }
1054 
1055 #undef __FUNCT__
1056 #define __FUNCT__ "MatGetBlockSize_MPISBAIJ"
1057 int MatGetBlockSize_MPISBAIJ(Mat mat,int *bs)
1058 {
1059   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1060 
1061   PetscFunctionBegin;
1062   *bs = baij->bs;
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 #undef __FUNCT__
1067 #define __FUNCT__ "MatZeroEntries_MPISBAIJ"
1068 int MatZeroEntries_MPISBAIJ(Mat A)
1069 {
1070   Mat_MPISBAIJ *l = (Mat_MPISBAIJ*)A->data;
1071   int         ierr;
1072 
1073   PetscFunctionBegin;
1074   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
1075   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 #undef __FUNCT__
1080 #define __FUNCT__ "MatGetInfo_MPISBAIJ"
1081 int MatGetInfo_MPISBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1082 {
1083   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)matin->data;
1084   Mat         A = a->A,B = a->B;
1085   int         ierr;
1086   PetscReal   isend[5],irecv[5];
1087 
1088   PetscFunctionBegin;
1089   info->block_size     = (PetscReal)a->bs;
1090   ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
1091   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1092   isend[3] = info->memory;  isend[4] = info->mallocs;
1093   ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
1094   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1095   isend[3] += info->memory;  isend[4] += info->mallocs;
1096   if (flag == MAT_LOCAL) {
1097     info->nz_used      = isend[0];
1098     info->nz_allocated = isend[1];
1099     info->nz_unneeded  = isend[2];
1100     info->memory       = isend[3];
1101     info->mallocs      = isend[4];
1102   } else if (flag == MAT_GLOBAL_MAX) {
1103     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_MAX,matin->comm);CHKERRQ(ierr);
1104     info->nz_used      = irecv[0];
1105     info->nz_allocated = irecv[1];
1106     info->nz_unneeded  = irecv[2];
1107     info->memory       = irecv[3];
1108     info->mallocs      = irecv[4];
1109   } else if (flag == MAT_GLOBAL_SUM) {
1110     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_SUM,matin->comm);CHKERRQ(ierr);
1111     info->nz_used      = irecv[0];
1112     info->nz_allocated = irecv[1];
1113     info->nz_unneeded  = irecv[2];
1114     info->memory       = irecv[3];
1115     info->mallocs      = irecv[4];
1116   } else {
1117     SETERRQ1(1,"Unknown MatInfoType argument %d",flag);
1118   }
1119   info->rows_global       = (PetscReal)A->M;
1120   info->columns_global    = (PetscReal)A->N;
1121   info->rows_local        = (PetscReal)A->m;
1122   info->columns_local     = (PetscReal)A->N;
1123   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1124   info->fill_ratio_needed = 0;
1125   info->factor_mallocs    = 0;
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 #undef __FUNCT__
1130 #define __FUNCT__ "MatSetOption_MPISBAIJ"
1131 int MatSetOption_MPISBAIJ(Mat A,MatOption op)
1132 {
1133   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
1134   int         ierr;
1135 
1136   PetscFunctionBegin;
1137   switch (op) {
1138   case MAT_NO_NEW_NONZERO_LOCATIONS:
1139   case MAT_YES_NEW_NONZERO_LOCATIONS:
1140   case MAT_COLUMNS_UNSORTED:
1141   case MAT_COLUMNS_SORTED:
1142   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1143   case MAT_KEEP_ZEROED_ROWS:
1144   case MAT_NEW_NONZERO_LOCATION_ERR:
1145     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1146     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1147     break;
1148   case MAT_ROW_ORIENTED:
1149     a->roworiented = PETSC_TRUE;
1150     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1151     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1152     break;
1153   case MAT_ROWS_SORTED:
1154   case MAT_ROWS_UNSORTED:
1155   case MAT_YES_NEW_DIAGONALS:
1156   case MAT_USE_SINGLE_PRECISION_SOLVES:
1157     PetscLogInfo(A,"Info:MatSetOption_MPISBAIJ:Option ignored\n");
1158     break;
1159   case MAT_COLUMN_ORIENTED:
1160     a->roworiented = PETSC_FALSE;
1161     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1162     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1163     break;
1164   case MAT_IGNORE_OFF_PROC_ENTRIES:
1165     a->donotstash = PETSC_TRUE;
1166     break;
1167   case MAT_NO_NEW_DIAGONALS:
1168     SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
1169     break;
1170   case MAT_USE_HASH_TABLE:
1171     a->ht_flag = PETSC_TRUE;
1172     break;
1173   default:
1174     SETERRQ(PETSC_ERR_SUP,"unknown option");
1175     break;
1176   }
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatTranspose_MPISBAIJ"
1182 int MatTranspose_MPISBAIJ(Mat A,Mat *matout)
1183 {
1184   PetscFunctionBegin;
1185   SETERRQ(1,"Matrix is symmetric. MatTranspose() should not be called");
1186   /* PetscFunctionReturn(0); */
1187 }
1188 
1189 #undef __FUNCT__
1190 #define __FUNCT__ "MatDiagonalScale_MPISBAIJ"
1191 int MatDiagonalScale_MPISBAIJ(Mat mat,Vec ll,Vec rr)
1192 {
1193   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1194   Mat         a = baij->A,b = baij->B;
1195   int         ierr,s1,s2,s3;
1196 
1197   PetscFunctionBegin;
1198   if (ll != rr) {
1199     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be the same\n");
1200   }
1201   ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
1202   if (rr) {
1203     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1204     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
1205     /* Overlap communication with computation. */
1206     ierr = VecScatterBegin(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1207     /*} if (ll) { */
1208     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1209     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
1210     ierr = (*b->ops->diagonalscale)(b,ll,PETSC_NULL);CHKERRQ(ierr);
1211     /* } */
1212   /* scale  the diagonal block */
1213   ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);
1214 
1215   /* if (rr) { */
1216     /* Do a scatter end and then right scale the off-diagonal block */
1217     ierr = VecScatterEnd(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1218     ierr = (*b->ops->diagonalscale)(b,PETSC_NULL,baij->lvec);CHKERRQ(ierr);
1219   }
1220 
1221   PetscFunctionReturn(0);
1222 }
1223 
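/*
   The rows listed in 'is' may be owned by other processes.  Each process first
   determines the owner of every index it was handed, learns how many indices
   it will receive (MPI_Allreduce with PetscMaxSum_Op), exchanges the indices
   with nonblocking sends/receives, and finally zeroes its own local rows,
   optionally inserting 'diag' on the diagonal.
*/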
1224 #undef __FUNCT__
1225 #define __FUNCT__ "MatZeroRows_MPISBAIJ"
1226 int MatZeroRows_MPISBAIJ(Mat A,IS is,PetscScalar *diag)
1227 {
1228   Mat_MPISBAIJ   *l = (Mat_MPISBAIJ*)A->data;
1229   int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
1230   int            *procs,*nprocs,j,idx,nsends,*work,row;
1231   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1232   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1233   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1234   MPI_Comm       comm = A->comm;
1235   MPI_Request    *send_waits,*recv_waits;
1236   MPI_Status     recv_status,*send_status;
1237   IS             istmp;
1238   PetscTruth     found;
1239 
1240   PetscFunctionBegin;
1241   ierr = ISGetSize(is,&N);CHKERRQ(ierr);
1242   ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);
1243 
1244   /*  first count number of contributors to each processor */
1245   ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
1246   ierr  = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
1247   procs = nprocs + size;
1248   ierr  = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note*/
1249   for (i=0; i<N; i++) {
1250     idx   = rows[i];
1251     found = PETSC_FALSE;
1252     for (j=0; j<size; j++) {
1253       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1254         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1255       }
1256     }
1257     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
1258   }
1259   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}
1260 
1261   /* inform other processors of number of messages and max length*/
1262   ierr   = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
1263   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
1264   nmax   = work[rank];
1265   nrecvs = work[size+rank];
1266   ierr   = PetscFree(work);CHKERRQ(ierr);
1267 
1268   /* post receives:   */
1269   ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
1270   ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
1271   for (i=0; i<nrecvs; i++) {
1272     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1273   }
1274 
1275   /* do sends:
1276      1) starts[i] gives the starting index in svalues for stuff going to
1277      the ith processor
1278   */
1279   ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
1280   ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
1281   ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
1282   starts[0]  = 0;
1283   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1284   for (i=0; i<N; i++) {
1285     svalues[starts[owner[i]]++] = rows[i];
1286   }
1287   ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);
1288 
1289   starts[0] = 0;
1290   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1291   count = 0;
1292   for (i=0; i<size; i++) {
1293     if (procs[i]) {
1294       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1295     }
1296   }
1297   ierr = PetscFree(starts);CHKERRQ(ierr);
1298 
1299   base = owners[rank]*bs;
1300 
1301   /*  wait on receives */
1302   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
1303   source = lens + nrecvs;
1304   count  = nrecvs; slen = 0;
1305   while (count) {
1306     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1307     /* unpack receives into our local space */
1308     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1309     source[imdex]  = recv_status.MPI_SOURCE;
1310     lens[imdex]    = n;
1311     slen          += n;
1312     count--;
1313   }
1314   ierr = PetscFree(recv_waits);CHKERRQ(ierr);
1315 
1316   /* move the data into the send scatter */
1317   ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
1318   count = 0;
1319   for (i=0; i<nrecvs; i++) {
1320     values = rvalues + i*nmax;
1321     for (j=0; j<lens[i]; j++) {
1322       lrows[count++] = values[j] - base;
1323     }
1324   }
1325   ierr = PetscFree(rvalues);CHKERRQ(ierr);
1326   ierr = PetscFree(lens);CHKERRQ(ierr);
1327   ierr = PetscFree(owner);CHKERRQ(ierr);
1328   ierr = PetscFree(nprocs);CHKERRQ(ierr);
1329 
1330   /* actually zap the local rows */
1331   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1332   PetscLogObjectParent(A,istmp);
1333 
1334   /*
1335         Zero the required rows. If the "diagonal block" of the matrix
1336      is square and the user wishes to set the diagonal we use separate
1337      code so that MatSetValues() is not called for each diagonal entry, which
1338      would allocate new memory each time, causing many mallocs and slowing things down.
1339 
1340        Contributed by: Mathew Knepley
1341   */
1342   /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
1343   ierr = MatZeroRows_SeqBAIJ(l->B,istmp,0);CHKERRQ(ierr);
1344   if (diag && (l->A->M == l->A->N)) {
1345     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,diag);CHKERRQ(ierr);
1346   } else if (diag) {
1347     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1348     if (((Mat_SeqSBAIJ*)l->A->data)->nonew) {
1349       SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
1350 MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
1351     }
1352     for (i=0; i<slen; i++) {
1353       row  = lrows[i] + rstart_bs;
1354       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
1355     }
1356     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1357     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1358   } else {
1359     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1360   }
1361 
1362   ierr = ISDestroy(istmp);CHKERRQ(ierr);
1363   ierr = PetscFree(lrows);CHKERRQ(ierr);
1364 
1365   /* wait on sends */
1366   if (nsends) {
1367     ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
1368     ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1369     ierr = PetscFree(send_status);CHKERRQ(ierr);
1370   }
1371   ierr = PetscFree(send_waits);CHKERRQ(ierr);
1372   ierr = PetscFree(svalues);CHKERRQ(ierr);
1373 
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatPrintHelp_MPISBAIJ"
1379 int MatPrintHelp_MPISBAIJ(Mat A)
1380 {
1381   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1382   MPI_Comm    comm = A->comm;
1383   static int  called = 0;
1384   int         ierr;
1385 
1386   PetscFunctionBegin;
1387   if (!a->rank) {
1388     ierr = MatPrintHelp_SeqSBAIJ(a->A);CHKERRQ(ierr);
1389   }
1390   if (called) {PetscFunctionReturn(0);} else called = 1;
1391   ierr = (*PetscHelpPrintf)(comm," Options for MATMPISBAIJ matrix format (the defaults):\n");CHKERRQ(ierr);
1392   ierr = (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
1396 #undef __FUNCT__
1397 #define __FUNCT__ "MatSetUnfactored_MPISBAIJ"
1398 int MatSetUnfactored_MPISBAIJ(Mat A)
1399 {
1400   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1401   int         ierr;
1402 
1403   PetscFunctionBegin;
1404   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 static int MatDuplicate_MPISBAIJ(Mat,MatDuplicateOption,Mat *);
1409 
1410 #undef __FUNCT__
1411 #define __FUNCT__ "MatEqual_MPISBAIJ"
1412 int MatEqual_MPISBAIJ(Mat A,Mat B,PetscTruth *flag)
1413 {
1414   Mat_MPISBAIJ *matB = (Mat_MPISBAIJ*)B->data,*matA = (Mat_MPISBAIJ*)A->data;
1415   Mat         a,b,c,d;
1416   PetscTruth  flg;
1417   int         ierr;
1418 
1419   PetscFunctionBegin;
1420   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg);CHKERRQ(ierr);
1421   if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1422   a = matA->A; b = matA->B;
1423   c = matB->A; d = matB->B;
1424 
1425   ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
1426   if (flg == PETSC_TRUE) {
1427     ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
1428   }
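  /* both the diagonal (A) and off-diagonal (B) pieces must match locally; the local
     results are then combined across the communicator with a logical AND */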
1429   ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
1430   PetscFunctionReturn(0);
1431 }
1432 
1433 #undef __FUNCT__
1434 #define __FUNCT__ "MatSetUpPreallocation_MPISBAIJ"
1435 int MatSetUpPreallocation_MPISBAIJ(Mat A)
1436 {
1437   int        ierr;
1438 
1439   PetscFunctionBegin;
1440   ierr = MatMPISBAIJSetPreallocation(A,1,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
1441   PetscFunctionReturn(0);
1442 }
1443 /* -------------------------------------------------------------------*/
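/*
   Function dispatch table for the MPISBAIJ matrix type: MatCreate_MPISBAIJ() copies
   this table into B->ops; a 0 entry means the operation is not implemented for this format.
*/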
1444 static struct _MatOps MatOps_Values = {
1445   MatSetValues_MPISBAIJ,
1446   MatGetRow_MPISBAIJ,
1447   MatRestoreRow_MPISBAIJ,
1448   MatMult_MPISBAIJ,
1449   MatMultAdd_MPISBAIJ,
1450   MatMultTranspose_MPISBAIJ,
1451   MatMultTransposeAdd_MPISBAIJ,
1452   0,
1453   0,
1454   0,
1455   0,
1456   0,
1457   0,
1458   0,
1459   MatTranspose_MPISBAIJ,
1460   MatGetInfo_MPISBAIJ,
1461   MatEqual_MPISBAIJ,
1462   MatGetDiagonal_MPISBAIJ,
1463   MatDiagonalScale_MPISBAIJ,
1464   MatNorm_MPISBAIJ,
1465   MatAssemblyBegin_MPISBAIJ,
1466   MatAssemblyEnd_MPISBAIJ,
1467   0,
1468   MatSetOption_MPISBAIJ,
1469   MatZeroEntries_MPISBAIJ,
1470   MatZeroRows_MPISBAIJ,
1471   0,
1472   0,
1473   0,
1474   0,
1475   MatSetUpPreallocation_MPISBAIJ,
1476   0,
1477   0,
1478   0,
1479   0,
1480   MatDuplicate_MPISBAIJ,
1481   0,
1482   0,
1483   0,
1484   0,
1485   0,
1486   MatGetSubMatrices_MPISBAIJ,
1487   MatIncreaseOverlap_MPISBAIJ,
1488   MatGetValues_MPISBAIJ,
1489   0,
1490   MatPrintHelp_MPISBAIJ,
1491   MatScale_MPISBAIJ,
1492   0,
1493   0,
1494   0,
1495   MatGetBlockSize_MPISBAIJ,
1496   0,
1497   0,
1498   0,
1499   0,
1500   0,
1501   0,
1502   MatSetUnfactored_MPISBAIJ,
1503   0,
1504   MatSetValuesBlocked_MPISBAIJ,
1505   0,
1506   0,
1507   0,
1508   MatGetPetscMaps_Petsc,
1509   0,
1510   0,
1511   0,
1512   0,
1513   0,
1514   0,
1515   MatGetRowMax_MPISBAIJ};
1516 
1517 
1518 EXTERN_C_BEGIN
1519 #undef __FUNCT__
1520 #define __FUNCT__ "MatGetDiagonalBlock_MPISBAIJ"
1521 int MatGetDiagonalBlock_MPISBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1522 {
1523   PetscFunctionBegin;
1524   *a      = ((Mat_MPISBAIJ *)A->data)->A;
1525   *iscopy = PETSC_FALSE;
1526   PetscFunctionReturn(0);
1527 }
1528 EXTERN_C_END
1529 
1530 EXTERN_C_BEGIN
1531 #undef __FUNCT__
1532 #define __FUNCT__ "MatCreate_MPISBAIJ"
1533 int MatCreate_MPISBAIJ(Mat B)
1534 {
1535   Mat_MPISBAIJ *b;
1536   int          ierr;
1537   PetscTruth   flg;
1538 
1539   PetscFunctionBegin;
1540 
1541   ierr    = PetscNew(Mat_MPISBAIJ,&b);CHKERRQ(ierr);
1542   B->data = (void*)b;
1543   ierr    = PetscMemzero(b,sizeof(Mat_MPISBAIJ));CHKERRQ(ierr);
1544   ierr    = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1545 
1546   B->ops->destroy    = MatDestroy_MPISBAIJ;
1547   B->ops->view       = MatView_MPISBAIJ;
1548   B->mapping    = 0;
1549   B->factor     = 0;
1550   B->assembled  = PETSC_FALSE;
1551 
1552   B->insertmode = NOT_SET_VALUES;
1553   ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
1554   ierr = MPI_Comm_size(B->comm,&b->size);CHKERRQ(ierr);
1555 
1556   /* build local table of row and column ownerships */
1557   ierr          = PetscMalloc(3*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
1558   b->cowners    = b->rowners + b->size + 2;
1559   b->rowners_bs = b->cowners + b->size + 2;
1560   PetscLogObjectMemory(B,3*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
1561 
1562   /* build cache for off array entries formed */
1563   ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
1564   b->donotstash  = PETSC_FALSE;
1565   b->colmap      = PETSC_NULL;
1566   b->garray      = PETSC_NULL;
1567   b->roworiented = PETSC_TRUE;
1568 
1569 #if defined(PETSC_USE_MAT_SINGLE)
1570   /* stuff for MatSetValues_XXX in single precision */
1571   b->setvalueslen     = 0;
1572   b->setvaluescopy    = PETSC_NULL;
1573 #endif
1574 
1575   /* stuff used in block assembly */
1576   b->barray       = 0;
1577 
1578   /* stuff used for matrix vector multiply */
1579   b->lvec         = 0;
1580   b->Mvctx        = 0;
1581 
1582   /* stuff for MatGetRow() */
1583   b->rowindices   = 0;
1584   b->rowvalues    = 0;
1585   b->getrowactive = PETSC_FALSE;
1586 
1587   /* hash table stuff */
1588   b->ht           = 0;
1589   b->hd           = 0;
1590   b->ht_size      = 0;
1591   b->ht_flag      = PETSC_FALSE;
1592   b->ht_fact      = 0;
1593   b->ht_total_ct  = 0;
1594   b->ht_insert_ct = 0;
1595 
1596   ierr = PetscOptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg);CHKERRQ(ierr);
1597   if (flg) {
1598     PetscReal fact = 1.39;
1599     ierr = MatSetOption(B,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
1600     ierr = PetscOptionsGetReal(PETSC_NULL,"-mat_use_hash_table",&fact,PETSC_NULL);CHKERRQ(ierr);
1601     if (fact <= 1.0) fact = 1.39;
1602     ierr = MatMPIBAIJSetHashTableFactor(B,fact);CHKERRQ(ierr);
1603     PetscLogInfo(0,"MatCreate_MPISBAIJ:Hash table factor used %5.2f\n",fact);
1604   }
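  /* compose the format-specific routines so that MatStoreValues(), MatRetrieveValues()
     and MatGetDiagonalBlock() dispatch to the MPISBAIJ implementations defined above */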
1605   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
1606                                      "MatStoreValues_MPISBAIJ",
1607                                      MatStoreValues_MPISBAIJ);CHKERRQ(ierr);
1608   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
1609                                      "MatRetrieveValues_MPISBAIJ",
1610                                      MatRetrieveValues_MPISBAIJ);CHKERRQ(ierr);
1611   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
1612                                      "MatGetDiagonalBlock_MPISBAIJ",
1613                                      MatGetDiagonalBlock_MPISBAIJ);CHKERRQ(ierr);
1614   PetscFunctionReturn(0);
1615 }
1616 EXTERN_C_END
1617 
1618 #undef __FUNCT__
1619 #define __FUNCT__ "MatMPISBAIJSetPreallocation"
1620 /*@C
1621    MatMPISBAIJSetPreallocation - For good matrix assembly performance
1622    the user should preallocate the matrix storage by setting the parameters
1623    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1624    performance can be increased by more than a factor of 50.
1625 
1626    Collective on Mat
1627 
1628    Input Parameters:
1629 +  A - the matrix
1630 .  bs   - size of block
1631 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1632            submatrix  (same for all local rows)
1633 .  d_nnz - array containing the number of block nonzeros in the various block rows
1634            in the diagonal portion of the local submatrix (possibly different for each
1635            block row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1636 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1637            submatrix (same for all local rows).
1638 -  o_nnz - array containing the number of block nonzeros in the various block rows of the
1639            off-diagonal portion of the local submatrix (possibly different for
1640            each block row) or PETSC_NULL.
1641 
1642 
1643    Options Database Keys:
1644 +   -mat_no_unroll - uses code that does not unroll the loops in the
1645                      block calculations (much slower)
1646 -   -mat_block_size - size of the blocks to use
1647 
1648    Notes:
1649 
1650    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1651    then it must be used on all processors that share the object for that argument.
1652 
1653    Storage Information:
1654    For a square global matrix we define each processor's diagonal portion
1655    to be its local rows and the corresponding columns (a square submatrix);
1656    each processor's off-diagonal portion encompasses the remainder of the
1657    local matrix (a rectangular submatrix).
1658 
1659    The user can specify preallocated storage for the diagonal part of
1660    the local submatrix with either d_nz or d_nnz (not both).  Set
1661    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1662    memory allocation.  Likewise, specify preallocated storage for the
1663    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1664 
1665    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1666    the figure below we depict these three local rows and all columns (0-11).
1667 
1668 .vb
1669            0 1 2 3 4 5 6 7 8 9 10 11
1670           -------------------
1671    row 3  |  o o o d d d o o o o o o
1672    row 4  |  o o o d d d o o o o o o
1673    row 5  |  o o o d d d o o o o o o
1674           -------------------
1675 .ve
1676 
1677    Thus, any entries in the d locations are stored in the d (diagonal)
1678    submatrix, and any entries in the o locations are stored in the
1679    o (off-diagonal) submatrix.  Note that the d submatrix is stored in the
1680    MATSEQSBAIJ format and the o submatrix in the MATSEQBAIJ compressed row format.
1681 
1682    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1683    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1684    In general, for PDE problems in which most nonzeros are near the diagonal,
1685    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1686    or you will get TERRIBLE performance; see the users' manual chapter on
1687    matrices.
1688 
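   Example (a minimal, hypothetical sketch; comm and ierr are assumed to be declared
   elsewhere, and the block size, local dimensions and nonzero counts are illustrative only):

.vb
     Mat A;
     ierr = MatCreate(comm,8,8,PETSC_DETERMINE,PETSC_DETERMINE,&A);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPISBAIJ);CHKERRQ(ierr);
     /* block size 2, guess 3 diagonal and 1 off-diagonal block nonzeros per block row */
     ierr = MatMPISBAIJSetPreallocation(A,2,3,PETSC_NULL,1,PETSC_NULL);CHKERRQ(ierr);
.ve
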
1689    Level: intermediate
1690 
1691 .keywords: matrix, block, aij, compressed row, sparse, parallel
1692 
1693 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1694 @*/
1695 
1696 int MatMPISBAIJSetPreallocation(Mat B,int bs,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
1697 {
1698   Mat_MPISBAIJ *b;
1699   int          ierr,i,mbs,Mbs;
1700   PetscTruth   flg2;
1701 
1702   PetscFunctionBegin;
1703   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg2);CHKERRQ(ierr);
1704   if (!flg2) PetscFunctionReturn(0);
1705 
1706   ierr = PetscOptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1707 
1708   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Invalid block size specified, must be positive");
1709   if (d_nz == PETSC_DECIDE || d_nz == PETSC_DEFAULT) d_nz = 3;
1710   if (o_nz == PETSC_DECIDE || o_nz == PETSC_DEFAULT) o_nz = 1;
1711   if (d_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %d",d_nz);
1712   if (o_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %d",o_nz);
1713   if (d_nnz) {
1714     for (i=0; i<B->m/bs; i++) {
1715       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
1716     }
1717   }
1718   if (o_nnz) {
1719     for (i=0; i<B->m/bs; i++) {
1720       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
1721     }
1722   }
1723   B->preallocated = PETSC_TRUE;
1724   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->m,&B->M);CHKERRQ(ierr);
1725   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->n,&B->N);CHKERRQ(ierr);
1726   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1727   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->cmap);CHKERRQ(ierr);
1728 
1729   b   = (Mat_MPISBAIJ*)B->data;
1730   mbs = B->m/bs;
1731   Mbs = B->M/bs;
1732   if (mbs*bs != B->m) {
1733     SETERRQ2(PETSC_ERR_ARG_SIZ,"Number of local rows %d must be divisible by blocksize %d",B->m,bs);
1734   }
1735 
1736   b->bs  = bs;
1737   b->bs2 = bs*bs;
1738   b->mbs = mbs;
1739   b->nbs = mbs;
1740   b->Mbs = Mbs;
1741   b->Nbs = Mbs;
1742 
1743   ierr = MPI_Allgather(&b->mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1744   b->rowners[0]    = 0;
1745   for (i=2; i<=b->size; i++) {
1746     b->rowners[i] += b->rowners[i-1];
1747   }
1748   b->rstart    = b->rowners[b->rank];
1749   b->rend      = b->rowners[b->rank+1];
1750   b->cstart    = b->rstart;
1751   b->cend      = b->rend;
1752   for (i=0; i<=b->size; i++) {
1753     b->rowners_bs[i] = b->rowners[i]*bs;
1754   }
1755   b->rstart_bs = b->rstart*bs;
1756   b->rend_bs   = b->rend*bs;
1757 
1758   b->cstart_bs = b->cstart*bs;
1759   b->cend_bs   = b->cend*bs;
1760 
1761 
1762   ierr = MatCreateSeqSBAIJ(PETSC_COMM_SELF,bs,B->m,B->m,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
1763   PetscLogObjectParent(B,b->A);
1764   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,B->m,B->M,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
1765   PetscLogObjectParent(B,b->B);
1766 
1767   /* build cache for off array entries formed */
1768   ierr = MatStashCreate_Private(B->comm,bs,&B->bstash);CHKERRQ(ierr);
1769 
1770   PetscFunctionReturn(0);
1771 }
1772 
1773 #undef __FUNCT__
1774 #define __FUNCT__ "MatCreateMPISBAIJ"
1775 /*@C
1776    MatCreateMPISBAIJ - Creates a sparse parallel matrix in symmetric block AIJ format
1777    (block compressed row).  For good matrix assembly performance
1778    the user should preallocate the matrix storage by setting the parameters
1779    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1780    performance can be increased by more than a factor of 50.
1781 
1782    Collective on MPI_Comm
1783 
1784    Input Parameters:
1785 +  comm - MPI communicator
1786 .  bs   - size of block
1787 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1788            This value should be the same as the local size used in creating the
1789            y vector for the matrix-vector product y = Ax.
1790 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1791            This value should be the same as the local size used in creating the
1792            x vector for the matrix-vector product y = Ax.
1793 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1794 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1795 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1796            submatrix  (same for all local rows)
1797 .  d_nnz - array containing the number of block nonzeros in the various block rows
1798            in the diagonal portion of the local submatrix (possibly different for each
1799            block row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1800 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1801            submatrix (same for all local rows).
1802 -  o_nnz - array containing the number of block nonzeros in the various block rows of the
1803            off-diagonal portion of the local submatrix (possibly different for
1804            each block row) or PETSC_NULL.
1805 
1806    Output Parameter:
1807 .  A - the matrix
1808 
1809    Options Database Keys:
1810 +   -mat_no_unroll - uses code that does not unroll the loops in the
1811                      block calculations (much slower)
1812 .   -mat_block_size - size of the blocks to use
1813 -   -mat_mpi - use the parallel matrix data structures even on one processor
1814                (defaults to using SeqSBAIJ format on one processor)
1815 
1816    Notes:
1817    The user MUST specify either the local or global matrix dimensions
1818    (possibly both).
1819 
1820    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1821    then it must be used on all processors that share the object for that argument.
1822 
1823    Storage Information:
1824    For a square global matrix we define each processor's diagonal portion
1825    to be its local rows and the corresponding columns (a square submatrix);
1826    each processor's off-diagonal portion encompasses the remainder of the
1827    local matrix (a rectangular submatrix).
1828 
1829    The user can specify preallocated storage for the diagonal part of
1830    the local submatrix with either d_nz or d_nnz (not both).  Set
1831    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1832    memory allocation.  Likewise, specify preallocated storage for the
1833    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1834 
1835    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1836    the figure below we depict these three local rows and all columns (0-11).
1837 
1838 .vb
1839            0 1 2 3 4 5 6 7 8 9 10 11
1840           -------------------
1841    row 3  |  o o o d d d o o o o o o
1842    row 4  |  o o o d d d o o o o o o
1843    row 5  |  o o o d d d o o o o o o
1844           -------------------
1845 .ve
1846 
1847    Thus, any entries in the d locations are stored in the d (diagonal)
1848    submatrix, and any entries in the o locations are stored in the
1849    o (off-diagonal) submatrix.  Note that the d submatrix is stored in the
1850    MATSEQSBAIJ format and the o submatrix in the MATSEQBAIJ compressed row format.
1851 
1852    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1853    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1854    In general, for PDE problems in which most nonzeros are near the diagonal,
1855    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1856    or you will get TERRIBLE performance; see the users' manual chapter on
1857    matrices.
1858 
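   Example (a minimal, hypothetical sketch; ierr is assumed to be declared elsewhere,
   and the communicator, block size, global dimensions and nonzero counts are illustrative only):

.vb
     Mat A;
     ierr = MatCreateMPISBAIJ(PETSC_COMM_WORLD,2,PETSC_DECIDE,PETSC_DECIDE,100,100,
                              3,PETSC_NULL,1,PETSC_NULL,&A);CHKERRQ(ierr);
.ve
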
1859    Level: intermediate
1860 
1861 .keywords: matrix, block, aij, compressed row, sparse, parallel
1862 
1863 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1864 @*/
1865 
1866 int MatCreateMPISBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1867 {
1868   int ierr,size;
1869 
1870   PetscFunctionBegin;
1871   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
1872   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1873   if (size > 1) {
1874     ierr = MatSetType(*A,MATMPISBAIJ);CHKERRQ(ierr);
1875     ierr = MatMPISBAIJSetPreallocation(*A,bs,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1876   } else {
1877     ierr = MatSetType(*A,MATSEQSBAIJ);CHKERRQ(ierr);
1878     ierr = MatSeqSBAIJSetPreallocation(*A,bs,d_nz,d_nnz);CHKERRQ(ierr);
1879   }
1880   PetscFunctionReturn(0);
1881 }
1882 
1883 
1884 #undef __FUNCT__
1885 #define __FUNCT__ "MatDuplicate_MPISBAIJ"
1886 static int MatDuplicate_MPISBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1887 {
1888   Mat          mat;
1889   Mat_MPISBAIJ *a,*oldmat = (Mat_MPISBAIJ*)matin->data;
1890   int          ierr,len=0;
1891 
1892   PetscFunctionBegin;
1893   *newmat       = 0;
1894   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1895   ierr = MatSetType(mat,MATMPISBAIJ);CHKERRQ(ierr);
1896   mat->preallocated = PETSC_TRUE;
1897   a = (Mat_MPISBAIJ*)mat->data;
1898   a->bs  = oldmat->bs;
1899   a->bs2 = oldmat->bs2;
1900   a->mbs = oldmat->mbs;
1901   a->nbs = oldmat->nbs;
1902   a->Mbs = oldmat->Mbs;
1903   a->Nbs = oldmat->Nbs;
1904 
1905   a->rstart       = oldmat->rstart;
1906   a->rend         = oldmat->rend;
1907   a->cstart       = oldmat->cstart;
1908   a->cend         = oldmat->cend;
1909   a->size         = oldmat->size;
1910   a->rank         = oldmat->rank;
1911   a->donotstash   = oldmat->donotstash;
1912   a->roworiented  = oldmat->roworiented;
1913   a->rowindices   = 0;
1914   a->rowvalues    = 0;
1915   a->getrowactive = PETSC_FALSE;
1916   a->barray       = 0;
1917   a->rstart_bs    = oldmat->rstart_bs;
1918   a->rend_bs      = oldmat->rend_bs;
1919   a->cstart_bs    = oldmat->cstart_bs;
1920   a->cend_bs      = oldmat->cend_bs;
1921 
1922   /* hash table stuff */
1923   a->ht           = 0;
1924   a->hd           = 0;
1925   a->ht_size      = 0;
1926   a->ht_flag      = oldmat->ht_flag;
1927   a->ht_fact      = oldmat->ht_fact;
1928   a->ht_total_ct  = 0;
1929   a->ht_insert_ct = 0;
1930 
1931   ierr = PetscMalloc(3*(a->size+2)*sizeof(int),&a->rowners);CHKERRQ(ierr);
1932   PetscLogObjectMemory(mat,3*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
1933   a->cowners    = a->rowners + a->size + 2;
1934   a->rowners_bs = a->cowners + a->size + 2;
1935   ierr = PetscMemcpy(a->rowners,oldmat->rowners,3*(a->size+2)*sizeof(int));CHKERRQ(ierr);
1936   ierr = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
1937   ierr = MatStashCreate_Private(matin->comm,oldmat->bs,&mat->bstash);CHKERRQ(ierr);
1938   if (oldmat->colmap) {
1939 #if defined (PETSC_USE_CTABLE)
1940     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
1941 #else
1942     ierr = PetscMalloc((a->Nbs)*sizeof(int),&a->colmap);CHKERRQ(ierr);
1943     PetscLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1944     ierr = PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));CHKERRQ(ierr);
1945 #endif
1946   } else a->colmap = 0;
1947   if (oldmat->garray && (len = ((Mat_SeqBAIJ*)(oldmat->B->data))->nbs)) {
1948     ierr = PetscMalloc(len*sizeof(int),&a->garray);CHKERRQ(ierr);
1949     PetscLogObjectMemory(mat,len*sizeof(int));
1950     ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr);
1951   } else a->garray = 0;
1952 
1953   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
1954   PetscLogObjectParent(mat,a->lvec);
1955   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
1956 
1957   PetscLogObjectParent(mat,a->Mvctx);
1958   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
1959   PetscLogObjectParent(mat,a->A);
1960   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
1961   PetscLogObjectParent(mat,a->B);
1962   ierr = PetscFListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
1963   *newmat = mat;
1964   PetscFunctionReturn(0);
1965 }
1966 
1967 #include "petscsys.h"
1968 
1969 EXTERN_C_BEGIN
1970 #undef __FUNCT__
1971 #define __FUNCT__ "MatLoad_MPISBAIJ"
1972 int MatLoad_MPISBAIJ(PetscViewer viewer,MatType type,Mat *newmat)
1973 {
1974   Mat          A;
1975   int          i,nz,ierr,j,rstart,rend,fd;
1976   PetscScalar  *vals,*buf;
1977   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1978   MPI_Status   status;
1979   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1980   int          *locrowlens,*sndcounts = 0,*procsnz = 0,jj,*mycols,*ibuf;
1981   int          tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
1982   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1983   int          dcount,kmax,k,nzcount,tmp;
1984 
1985   PetscFunctionBegin;
1986   ierr = PetscOptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1987 
1988   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1989   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1990   if (!rank) {
1991     ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
1992     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
1993     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not matrix object");
1994     if (header[3] < 0) {
1995       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix stored in special format, cannot load as MPISBAIJ");
1996     }
1997   }
1998 
1999   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2000   M = header[1]; N = header[2];
2001 
2002   if (M != N) SETERRQ(PETSC_ERR_SUP,"Can only do square matrices");
2003 
2004   /*
2005      This code adds extra rows to make sure the number of rows is
2006      divisible by the blocksize
2007   */
2008   Mbs        = M/bs;
2009   extra_rows = bs - M + bs*(Mbs);
2010   if (extra_rows == bs) extra_rows = 0;
2011   else                  Mbs++;
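  /* e.g. (illustrative): M=10, bs=4 gives Mbs=2, extra_rows=4-10+8=2, so Mbs becomes 3
     and the matrix is padded to 12 rows; when M is already divisible by bs, extra_rows=0 */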
2012   if (extra_rows && !rank) {
2013     PetscLogInfo(0,"MatLoad_MPISBAIJ:Padding loaded matrix to match blocksize\n");
2014   }
2015 
2016   /* determine ownership of all rows */
2017   mbs        = Mbs/size + ((Mbs % size) > rank);
2018   m          = mbs*bs;
2019   ierr       = PetscMalloc(2*(size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
2020   browners   = rowners + size + 1;
2021   ierr       = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2022   rowners[0] = 0;
2023   for (i=2; i<=size; i++) rowners[i] += rowners[i-1];
2024   for (i=0; i<=size;  i++) browners[i] = rowners[i]*bs;
2025   rstart = rowners[rank];
2026   rend   = rowners[rank+1];
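  /* e.g. (illustrative): Mbs=10 block rows on 3 processes gives mbs=4,3,3 and
     rowners={0,4,7,10}; browners holds the same prefix sums scaled by bs */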
2027 
2028   /* distribute row lengths to all processors */
2029   ierr = PetscMalloc((rend-rstart)*bs*sizeof(int),&locrowlens);CHKERRQ(ierr);
2030   if (!rank) {
2031     ierr = PetscMalloc((M+extra_rows)*sizeof(int),&rowlengths);CHKERRQ(ierr);
2032     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
2033     for (i=0; i<extra_rows; i++) rowlengths[M+i] = 1;
2034     ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
2035     for (i=0; i<size; i++) sndcounts[i] = browners[i+1] - browners[i];
2036     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2037     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
2038   } else {
2039     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2040   }
2041 
2042   if (!rank) {   /* procs[0] */
2043     /* calculate the number of nonzeros on each processor */
2044     ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
2045     ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
2046     for (i=0; i<size; i++) {
2047       for (j=rowners[i]*bs; j< rowners[i+1]*bs; j++) {
2048         procsnz[i] += rowlengths[j];
2049       }
2050     }
2051     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
2052 
2053     /* determine max buffer needed and allocate it */
2054     maxnz = 0;
2055     for (i=0; i<size; i++) {
2056       maxnz = PetscMax(maxnz,procsnz[i]);
2057     }
2058     ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);
2059 
2060     /* read in my part of the matrix column indices  */
2061     nz     = procsnz[0];
2062     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2063     mycols = ibuf;
2064     if (size == 1)  nz -= extra_rows;
2065     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
2066     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2067 
2068     /* read in each of the other processors' parts (except the last) and ship them off */
2069     for (i=1; i<size-1; i++) {
2070       nz   = procsnz[i];
2071       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2072       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2073     }
2074     /* read in the stuff for the last proc */
2075     if (size != 1) {
2076       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2077       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2078       for (i=0; i<extra_rows; i++) cols[nz+i] = M+i;
2079       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2080     }
2081     ierr = PetscFree(cols);CHKERRQ(ierr);
2082   } else {  /* procs[i], i>0 */
2083     /* determine buffer space needed for message */
2084     nz = 0;
2085     for (i=0; i<m; i++) {
2086       nz += locrowlens[i];
2087     }
2088     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2089     mycols = ibuf;
2090     /* receive message of column indices*/
2091     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2092     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2093     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
2094   }
2095 
2096   /* loop over local rows, determining number of off diagonal entries */
2097   ierr     = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&dlens);CHKERRQ(ierr);
2098   odlens   = dlens + (rend-rstart);
2099   ierr     = PetscMalloc(3*Mbs*sizeof(int),&mask);CHKERRQ(ierr);
2100   ierr     = PetscMemzero(mask,3*Mbs*sizeof(int));CHKERRQ(ierr);
2101   masked1  = mask    + Mbs;
2102   masked2  = masked1 + Mbs;
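  /* mask[] flags block columns already counted for the current block row; masked1[]
     and masked2[] remember which flags were set (diagonal and off-diagonal entries,
     respectively) so they can be cleared cheaply at the end of each block row */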
2103   rowcount = 0; nzcount = 0;
2104   for (i=0; i<mbs; i++) {
2105     dcount  = 0;
2106     odcount = 0;
2107     for (j=0; j<bs; j++) {
2108       kmax = locrowlens[rowcount];
2109       for (k=0; k<kmax; k++) {
2110         tmp = mycols[nzcount++]/bs; /* block col. index */
2111         if (!mask[tmp]) {
2112           mask[tmp] = 1;
2113           if (tmp < rstart || tmp >= rend) masked2[odcount++] = tmp; /* entry in off-diag portion */
2114           else masked1[dcount++] = tmp; /* entry in diag portion */
2115         }
2116       }
2117       rowcount++;
2118     }
2119 
2120     dlens[i]  = dcount;  /* d_nzz[i] */
2121     odlens[i] = odcount; /* o_nzz[i] */
2122 
2123     /* zero out the mask elements we set */
2124     for (j=0; j<dcount; j++) mask[masked1[j]] = 0;
2125     for (j=0; j<odcount; j++) mask[masked2[j]] = 0;
2126   }
2127 
2128   /* create our matrix */
2129   ierr = MatCreateMPISBAIJ(comm,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,0,dlens,0,odlens,newmat);
2130   CHKERRQ(ierr);
2131   A = *newmat;
2132   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
2133 
2134   if (!rank) {
2135     ierr = PetscMalloc(maxnz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2136     /* read in my part of the matrix numerical values  */
2137     nz = procsnz[0];
2138     vals = buf;
2139     mycols = ibuf;
2140     if (size == 1)  nz -= extra_rows;
2141     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2142     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2143 
2144     /* insert into matrix */
2145     jj      = rstart*bs;
2146     for (i=0; i<m; i++) {
2147       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2148       mycols += locrowlens[i];
2149       vals   += locrowlens[i];
2150       jj++;
2151     }
2152 
2153     /* read in other processors (except the last one) and ship out */
2154     for (i=1; i<size-1; i++) {
2155       nz   = procsnz[i];
2156       vals = buf;
2157       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2158       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2159     }
2160     /* the last proc */
2161     if (size != 1){
2162       nz   = procsnz[i] - extra_rows;
2163       vals = buf;
2164       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2165       for (i=0; i<extra_rows; i++) vals[nz+i] = 1.0;
2166       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2167     }
2168     ierr = PetscFree(procsnz);CHKERRQ(ierr);
2169 
2170   } else {
2171     /* receive numeric values */
2172     ierr = PetscMalloc(nz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2173 
2174     /* receive message of values*/
2175     vals   = buf;
2176     mycols = ibuf;
2177     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2178     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2179     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
2180 
2181     /* insert into matrix */
2182     jj      = rstart*bs;
2183     for (i=0; i<m; i++) {
2184       ierr    = MatSetValues_MPISBAIJ(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2185       mycols += locrowlens[i];
2186       vals   += locrowlens[i];
2187       jj++;
2188     }
2189   }
2190 
2191   ierr = PetscFree(locrowlens);CHKERRQ(ierr);
2192   ierr = PetscFree(buf);CHKERRQ(ierr);
2193   ierr = PetscFree(ibuf);CHKERRQ(ierr);
2194   ierr = PetscFree(rowners);CHKERRQ(ierr);
2195   ierr = PetscFree(dlens);CHKERRQ(ierr);
2196   ierr = PetscFree(mask);CHKERRQ(ierr);
2197   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2198   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2199   PetscFunctionReturn(0);
2200 }
2201 EXTERN_C_END
2202 
2203 #undef __FUNCT__
2204 #define __FUNCT__ "MatMPISBAIJSetHashTableFactor"
2205 /*@
2206    MatMPISBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2207 
2208    Input Parameters:
2209 .  mat  - the matrix
2210 .  fact - factor
2211 
2212    Collective on Mat
2213 
2214    Level: advanced
2215 
2216   Notes:
2217    This can also be set by the command line option: -mat_use_hash_table fact
2218 
2219 .keywords: matrix, hashtable, factor, HT
2220 
2221 .seealso: MatSetOption()
2222 @*/
2223 int MatMPISBAIJSetHashTableFactor(Mat mat,PetscReal fact)
2224 {
2225   PetscFunctionBegin;
2226   SETERRQ(1,"Function not yet written for SBAIJ format");
2227   /* PetscFunctionReturn(0); */
2228 }
2229 
2230 #undef __FUNCT__
2231 #define __FUNCT__ "MatGetRowMax_MPISBAIJ"
2232 int MatGetRowMax_MPISBAIJ(Mat A,Vec v)
2233 {
2234   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
2235   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(a->B)->data;
2236   PetscReal    atmp;
2237   PetscReal    *work,*svalues,*rvalues;
2238   int          ierr,i,bs,mbs,*bi,*bj,brow,j,ncols,krow,kcol,col,row,Mbs,bcol;
2239   int          rank,size,*rowners_bs,dest,count,source;
2240   PetscScalar  *va;
2241   MatScalar    *ba;
2242   MPI_Status   stat;
2243 
2244   PetscFunctionBegin;
2245   ierr = MatGetRowMax(a->A,v);CHKERRQ(ierr);
2246   ierr = VecGetArray(v,&va);CHKERRQ(ierr);
2247 
2248   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
2249   ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
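  /* note: this routine calls MPI_Comm_size()/MPI_Comm_rank() and sends/receives on
     PETSC_COMM_WORLD directly, so it assumes the matrix lives on the world communicator */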
2250 
2251   bs   = a->bs;
2252   mbs  = a->mbs;
2253   Mbs  = a->Mbs;
2254   ba   = b->a;
2255   bi   = b->i;
2256   bj   = b->j;
2257   /*
2258   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] M: %d, bs: %d, mbs: %d \n",rank,bs*Mbs,bs,mbs);
2259   PetscSynchronizedFlush(PETSC_COMM_WORLD);
2260   */
2261 
2262   /* find ownerships */
2263   rowners_bs = a->rowners_bs;
2264   /*
2265   if (!rank){
2266     for (i=0; i<size+1; i++) PetscPrintf(PETSC_COMM_SELF," rowners_bs[%d]: %d\n",i,rowners_bs[i]);
2267   }
2268   */
2269 
2270   /* each proc creates an array to be distributed */
2271   ierr = PetscMalloc(bs*Mbs*sizeof(PetscReal),&work);CHKERRQ(ierr);
2272   ierr = PetscMemzero(work,bs*Mbs*sizeof(PetscReal));CHKERRQ(ierr);
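  /* B holds only the off-diagonal blocks of the stored upper triangle, so by symmetry each
     entry (row,col) also bounds the row maximum of global row col, which is owned by a
     higher-ranked process; those contributions are accumulated in work[] and shipped
     to their owners below */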
2273 
2274   /* row_max for B */
2275   if (rank != size-1){
2276     for (i=0; i<mbs; i++) {
2277       ncols = bi[1] - bi[0]; bi++;
2278       brow  = bs*i;
2279       for (j=0; j<ncols; j++){
2280         bcol = bs*(*bj);
2281         for (kcol=0; kcol<bs; kcol++){
2282           col = bcol + kcol;                 /* local col index */
2283           col += rowners_bs[rank+1];      /* global col index */
2284           /* PetscPrintf(PETSC_COMM_SELF,"[%d], col: %d\n",rank,col); */
2285           for (krow=0; krow<bs; krow++){
2286             atmp = PetscAbsScalar(*ba); ba++;
2287             row = brow + krow;    /* local row index */
2288             /* printf("val[%d,%d]: %g\n",row,col,atmp); */
2289             if (PetscRealPart(va[row]) < atmp) va[row] = atmp;
2290             if (work[col] < atmp) work[col] = atmp;
2291           }
2292         }
2293         bj++;
2294       }
2295     }
2296     /*
2297       PetscPrintf(PETSC_COMM_SELF,"[%d], work: ",rank);
2298       for (i=0; i<bs*Mbs; i++) PetscPrintf(PETSC_COMM_SELF,"%g ",work[i]);
2299       PetscPrintf(PETSC_COMM_SELF,"[%d]: \n");
2300       */
2301 
2302     /* send values to its owners */
2303     for (dest=rank+1; dest<size; dest++){
2304       svalues = work + rowners_bs[dest];
2305       count   = rowners_bs[dest+1]-rowners_bs[dest];
2306       ierr    = MPI_Send(svalues,count,MPIU_REAL,dest,rank,PETSC_COMM_WORLD);CHKERRQ(ierr);
2307       /*
2308       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] sends %d values to [%d]: %g, %g, %g, %g\n",rank,count,dest,svalues[0],svalues[1],svalues[2],svalues[3]);
2309       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2310       */
2311     }
2312   }
2313 
2314   /* receive values */
2315   if (rank){
2316     rvalues = work;
2317     count   = rowners_bs[rank+1]-rowners_bs[rank];
2318     for (source=0; source<rank; source++){
2319       ierr = MPI_Recv(rvalues,count,MPIU_REAL,MPI_ANY_SOURCE,MPI_ANY_TAG,PETSC_COMM_WORLD,&stat);CHKERRQ(ierr);
2320       /* process values */
2321       for (i=0; i<count; i++){
2322         if (PetscRealPart(va[i]) < rvalues[i]) va[i] = rvalues[i];
2323       }
2324       /*
2325       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] received %d values from [%d]: %g, %g, %g, %g \n",rank,count,stat.MPI_SOURCE,rvalues[0],rvalues[1],rvalues[2],rvalues[3]);
2326       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2327       */
2328     }
2329   }
2330 
2331   ierr = VecRestoreArray(v,&va);CHKERRQ(ierr);
2332   ierr = PetscFree(work);CHKERRQ(ierr);
2333   PetscFunctionReturn(0);
2334 }
2335