xref: /petsc/src/mat/impls/sbaij/mpi/mpisbaij.c (revision fa3b22f8a6da1ce63e02ce485ee96787cb246aa2)
1 /*$Id: mpisbaij.c,v 1.61 2001/08/10 03:31:37 bsmith Exp $*/
2 
3 #include "src/mat/impls/baij/mpi/mpibaij.h"    /*I "petscmat.h" I*/
4 #include "src/vec/vecimpl.h"
5 #include "mpisbaij.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 extern int MatSetUpMultiply_MPISBAIJ(Mat);
9 extern int DisAssemble_MPISBAIJ(Mat);
10 extern int MatIncreaseOverlap_MPISBAIJ(Mat,int,IS *,int);
11 extern int MatGetSubMatrices_MPISBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
12 extern int MatGetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *);
13 extern int MatSetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *,InsertMode);
14 extern int MatSetValuesBlocked_SeqSBAIJ(Mat,int,int*,int,int*,PetscScalar*,InsertMode);
15 extern int MatGetRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
16 extern int MatRestoreRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
17 extern int MatPrintHelp_SeqSBAIJ(Mat);
18 extern int MatZeroRows_SeqSBAIJ(Mat,IS,PetscScalar*);
19 extern int MatZeroRows_SeqBAIJ(Mat,IS,PetscScalar *);
20 extern int MatGetRowMax_MPISBAIJ(Mat,Vec);
21 
/*  UGLY, ugly, ugly
   When MatScalar == PetscScalar the function MatSetValuesBlocked_MPISBAIJ_MatScalar() does
   not exist. Otherwise ..._MatScalar() takes matrix elements in single precision and
   inserts them into the single precision data structure. The function MatSetValuesBlocked_MPISBAIJ()
   converts the entries into single precision and then calls ..._MatScalar() to put them
   into the single precision data structures.
*/
29 #if defined(PETSC_USE_MAT_SINGLE)
30 extern int MatSetValuesBlocked_SeqSBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
31 extern int MatSetValues_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
32 extern int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
33 extern int MatSetValues_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
34 extern int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
35 #else
36 #define MatSetValuesBlocked_SeqSBAIJ_MatScalar      MatSetValuesBlocked_SeqSBAIJ
37 #define MatSetValues_MPISBAIJ_MatScalar             MatSetValues_MPISBAIJ
38 #define MatSetValuesBlocked_MPISBAIJ_MatScalar      MatSetValuesBlocked_MPISBAIJ
39 #define MatSetValues_MPISBAIJ_HT_MatScalar          MatSetValues_MPISBAIJ_HT
40 #define MatSetValuesBlocked_MPISBAIJ_HT_MatScalar   MatSetValuesBlocked_MPISBAIJ_HT
41 #endif
42 
43 EXTERN_C_BEGIN
44 #undef __FUNCT__
45 #define __FUNCT__ "MatStoreValues_MPISBAIJ"
46 int MatStoreValues_MPISBAIJ(Mat mat)
47 {
48   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
49   int          ierr;
50 
51   PetscFunctionBegin;
52   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
53   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
54   PetscFunctionReturn(0);
55 }
56 EXTERN_C_END
57 
58 EXTERN_C_BEGIN
59 #undef __FUNCT__
60 #define __FUNCT__ "MatRetrieveValues_MPISBAIJ"
61 int MatRetrieveValues_MPISBAIJ(Mat mat)
62 {
63   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
64   int          ierr;
65 
66   PetscFunctionBegin;
67   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
68   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
69   PetscFunctionReturn(0);
70 }
71 EXTERN_C_END
72 
/*
     Local utility routine that creates a mapping from the global column
   number to the local number in the off-diagonal part of the local
   storage of the matrix.  This is done in a non-scalable way since the
   length of colmap equals the global matrix length.
*/
#undef __FUNCT__
#define __FUNCT__ "CreateColmap_MPISBAIJ_Private"
/* Placeholder: the colmap construction has not been implemented for the
   SBAIJ format yet, so this always raises an error.  Callers reach this
   only when an off-diagonal entry is set after a previous assembly. */
static int CreateColmap_MPISBAIJ_Private(Mat mat)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
87 
/* Number of extra block entries to allocate when a row overflows. */
#define CHUNKSIZE  10

/*
   Inserts or adds a single scalar (row,col,value) into the local diagonal
   submatrix A (Mat_SeqSBAIJ *a).  Performs a short binary search over the
   block row, then a linear scan; if the block column is absent and
   a->nonew permits, shifts later entries up (reallocating the whole
   i/j/a storage by CHUNKSIZE block entries when the row is full).

   This is a macro, not a function: it expects the caller to have the
   following in scope: bs, bs2, a, A, ai, aj, aa, aimax, ailen, plus the
   scratch variables rp, ap, bap, brow, bcol, ridx, cidx, nrow, rmax,
   low, high, t, _i, ii, N, ierr.  See MatSetValues_MPISBAIJ_MatScalar().
*/
#define  MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { /* binary search narrows to a short range */ \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int       new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        MatScalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(a->mbs+1)*sizeof(int); \
        ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j = (int*)(new_a + bs2*new_nz); \
        new_i = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<brow+1; ii++) {new_i[ii] = ai[ii];} \
        for (ii=brow+1; ii<a->mbs+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        ierr = PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(PetscScalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
        /* free up old matrix storage */ \
        ierr = PetscFree(a->a);CHKERRQ(ierr);  \
        if (!a->singlemalloc) { \
          ierr = PetscFree(a->i);CHKERRQ(ierr); \
          ierr = PetscFree(a->j);CHKERRQ(ierr);\
        } \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = PETSC_TRUE; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
        a->s_maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->s_nz++; /* NOTE(review): s_nz is bumped only on reallocation here -- verify against SeqSBAIJ conventions */ \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
      } \
      if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr); }  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
#ifndef MatSetValues_SeqBAIJ_B_Private
/* NOTE(review): the guard above tests MatSetValues_SeqBAIJ_B_Private while the
   macro defined below is MatSetValues_SeqSBAIJ_B_Private -- looks like a typo;
   confirm whether the guard was meant to match the definition. */
/*
   Same insertion logic as MatSetValues_SeqSBAIJ_A_Private but for the
   off-diagonal submatrix B (Mat_SeqBAIJ *b).  Expects b, B, bi, bj, ba,
   bimax, bilen and the same scratch variables (rp, ap, bap, brow, bcol,
   ridx, cidx, nrow, rmax, low, high, t, _i, ii, N, ierr) in scope.
*/
#define  MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv) \
{ \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { /* binary search narrows to a short range */ \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int       new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        MatScalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(b->mbs+1)*sizeof(int); \
        ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j = (int*)(new_a + bs2*new_nz); \
        new_i = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<brow+1; ii++) {new_i[ii] = bi[ii];} \
        for (ii=brow+1; ii<b->mbs+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
        len  = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        ierr = PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
        /* free up old matrix storage */ \
        ierr = PetscFree(b->a);CHKERRQ(ierr);  \
        if (!b->singlemalloc) { \
          ierr = PetscFree(b->i);CHKERRQ(ierr); \
          ierr = PetscFree(b->j);CHKERRQ(ierr); \
        } \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = PETSC_TRUE; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; /* NOTE(review): nz is bumped only on reallocation here -- verify against SeqBAIJ conventions */ \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
      } \
      if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr);}  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
#endif
243 
244 #if defined(PETSC_USE_MAT_SINGLE)
245 #undef __FUNCT__
246 #define __FUNCT__ "MatSetValues_MPISBAIJ"
247 int MatSetValues_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
248 {
249   Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
250   int          ierr,i,N = m*n;
251   MatScalar    *vsingle;
252 
253   PetscFunctionBegin;
254   if (N > b->setvalueslen) {
255     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
256     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
257     b->setvalueslen  = N;
258   }
259   vsingle = b->setvaluescopy;
260 
261   for (i=0; i<N; i++) {
262     vsingle[i] = v[i];
263   }
264   ierr = MatSetValues_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
265   PetscFunctionReturn(0);
266 }
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
270 int MatSetValuesBlocked_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
271 {
272   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
273   int         ierr,i,N = m*n*b->bs2;
274   MatScalar   *vsingle;
275 
276   PetscFunctionBegin;
277   if (N > b->setvalueslen) {
278     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
279     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
280     b->setvalueslen  = N;
281   }
282   vsingle = b->setvaluescopy;
283   for (i=0; i<N; i++) {
284     vsingle[i] = v[i];
285   }
286   ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNCT__
291 #define __FUNCT__ "MatSetValues_MPISBAIJ_HT"
292 int MatSetValues_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
293 {
294   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
295   int         ierr,i,N = m*n;
296   MatScalar   *vsingle;
297 
298   PetscFunctionBegin;
299   SETERRQ(1,"Function not yet written for SBAIJ format");
300   /* PetscFunctionReturn(0); */
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT"
305 int MatSetValuesBlocked_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
306 {
307   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
308   int         ierr,i,N = m*n*b->bs2;
309   MatScalar   *vsingle;
310 
311   PetscFunctionBegin;
312   SETERRQ(1,"Function not yet written for SBAIJ format");
313   /* PetscFunctionReturn(0); */
314 }
315 #endif
/* Only add/insert a(i,j) with i<=j (blocks).
   Any a(i,j) with i>j input by the user is ignored.
*/
320 #undef __FUNCT__
321 #define __FUNCT__ "MatSetValues_MPIBAIJ"
322 int MatSetValues_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
323 {
324   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
325   MatScalar    value;
326   PetscTruth   roworiented = baij->roworiented;
327   int          ierr,i,j,row,col;
328   int          rstart_orig=baij->rstart_bs;
329   int          rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
330   int          cend_orig=baij->cend_bs,bs=baij->bs;
331 
332   /* Some Variables required in the macro */
333   Mat          A = baij->A;
334   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(A)->data;
335   int          *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
336   MatScalar    *aa=a->a;
337 
338   Mat          B = baij->B;
339   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(B)->data;
340   int          *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
341   MatScalar    *ba=b->a;
342 
343   int          *rp,ii,nrow,_i,rmax,N,brow,bcol;
344   int          low,high,t,ridx,cidx,bs2=a->bs2;
345   MatScalar    *ap,*bap;
346 
347   /* for stash */
348   int          n_loc, *in_loc=0;
349   MatScalar    *v_loc=0;
350 
351   PetscFunctionBegin;
352 
353   if(!baij->donotstash){
354     ierr = PetscMalloc(n*sizeof(int),&in_loc);CHKERRQ(ierr);
355     ierr = PetscMalloc(n*sizeof(MatScalar),&v_loc);CHKERRQ(ierr);
356   }
357 
358   for (i=0; i<m; i++) {
359     if (im[i] < 0) continue;
360 #if defined(PETSC_USE_BOPT_g)
361     if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
362 #endif
363     if (im[i] >= rstart_orig && im[i] < rend_orig) { /* this processor entry */
364       row = im[i] - rstart_orig;              /* local row index */
365       for (j=0; j<n; j++) {
366         if (im[i]/bs > in[j]/bs) continue;    /* ignore lower triangular blocks */
367         if (in[j] >= cstart_orig && in[j] < cend_orig){  /* diag entry (A) */
368           col = in[j] - cstart_orig;          /* local col index */
369           brow = row/bs; bcol = col/bs;
370           if (brow > bcol) continue;  /* ignore lower triangular blocks of A */
371           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
372           MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv);
373           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
374         } else if (in[j] < 0) continue;
375 #if defined(PETSC_USE_BOPT_g)
376         else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Col too large");}
377 #endif
378         else {  /* off-diag entry (B) */
379           if (mat->was_assembled) {
380             if (!baij->colmap) {
381               ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
382             }
383 #if defined (PETSC_USE_CTABLE)
384             ierr = PetscTableFind(baij->colmap,in[j]/bs + 1,&col);CHKERRQ(ierr);
385             col  = col - 1 + in[j]%bs;
386 #else
387             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
388 #endif
389             if (col < 0 && !((Mat_SeqSBAIJ*)(baij->A->data))->nonew) {
390               ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
391               col =  in[j];
392               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
393               B = baij->B;
394               b = (Mat_SeqBAIJ*)(B)->data;
395               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
396               ba=b->a;
397             }
398           } else col = in[j];
399           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
400           MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv);
401           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
402         }
403       }
404     } else {  /* off processor entry */
405       if (!baij->donotstash) {
406         n_loc = 0;
407         for (j=0; j<n; j++){
408           if (im[i]/bs > in[j]/bs) continue; /* ignore lower triangular blocks */
409           in_loc[n_loc] = in[j];
410           if (roworiented) {
411             v_loc[n_loc] = v[i*n+j];
412           } else {
413             v_loc[n_loc] = v[j*m+i];
414           }
415           n_loc++;
416         }
417         ierr = MatStashValuesRow_Private(&mat->stash,im[i],n_loc,in_loc,v_loc);CHKERRQ(ierr);
418       }
419     }
420   }
421 
422   if(!baij->donotstash){
423     ierr = PetscFree(in_loc);CHKERRQ(ierr);
424     ierr = PetscFree(v_loc);CHKERRQ(ierr);
425   }
426   PetscFunctionReturn(0);
427 }
428 
429 #undef __FUNCT__
430 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
431 int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
432 {
433   PetscFunctionBegin;
434   SETERRQ(1,"Function not yet written for SBAIJ format");
435   /* PetscFunctionReturn(0); */
436 }
437 
/* Multiplicative hashing constant (golden ratio) and hash macro used by the
   (unimplemented) hash-table insertion path. */
#define HASH_KEY 0.6180339887
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPISBAIJ_HT_MatScalar"
/* Placeholder: hash-table based MatScalar insertion has not been
   implemented for the SBAIJ format yet, so this always raises an error. */
int MatSetValues_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
450 
#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT_MatScalar"
/* Placeholder: hash-table based blocked MatScalar insertion has not been
   implemented for the SBAIJ format yet, so this always raises an error. */
int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
459 
#undef __FUNCT__
#define __FUNCT__ "MatGetValues_MPISBAIJ"
/*
   Retrieves an m x n dense sub-block of values into v (row-major).  Only
   rows owned by this process are supported; rows outside the local
   ownership range raise PETSC_ERR_SUP.  Diagonal-block columns come from
   baij->A; other columns are translated through colmap/garray into the
   compacted column space of baij->B (entries absent from B read as 0.0).
*/
int MatGetValues_MPISBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,PetscScalar *v)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  /* ownership ranges converted from block indices to point indices */
  int          bs=baij->bs,ierr,i,j,bsrstart = baij->rstart*bs,bsrend = baij->rend*bs;
  int          bscstart = baij->cstart*bs,bscend = baij->cend*bs,row,col,data;

  PetscFunctionBegin;
  for (i=0; i<m; i++) {
    if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
    if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
    if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
      row = idxm[i] - bsrstart;                  /* local row index */
      for (j=0; j<n; j++) {
        if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
        if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
        if (idxn[j] >= bscstart && idxn[j] < bscend){
          /* diagonal block: read from A with local column index */
          col = idxn[j] - bscstart;
          ierr = MatGetValues_SeqSBAIJ(baij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
        } else {
          if (!baij->colmap) {
            ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
          }
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(baij->colmap,idxn[j]/bs+1,&data);CHKERRQ(ierr);
          data --;  /* colmap stores index+1 so that 0 means "absent" */
#else
          data = baij->colmap[idxn[j]/bs]-1;
#endif
          /* block column not present in B on this process => value is zero */
          if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
          else {
            col  = data + idxn[j]%bs;           /* compacted column index within B */
            ierr = MatGetValues_SeqSBAIJ(baij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
          }
        }
      }
    } else {
      SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
    }
  }
 PetscFunctionReturn(0);
}
503 
504 #undef __FUNCT__
505 #define __FUNCT__ "MatNorm_MPISBAIJ"
506 int MatNorm_MPISBAIJ(Mat mat,NormType type,PetscReal *norm)
507 {
508   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
509   /* Mat_SeqSBAIJ *amat = (Mat_SeqSBAIJ*)baij->A->data; */
510   /* Mat_SeqBAIJ  *bmat = (Mat_SeqBAIJ*)baij->B->data; */
511   int        ierr;
512   PetscReal  sum[2],*lnorm2;
513 
514   PetscFunctionBegin;
515   if (baij->size == 1) {
516     ierr =  MatNorm(baij->A,type,norm);CHKERRQ(ierr);
517   } else {
518     if (type == NORM_FROBENIUS) {
519       ierr = PetscMalloc(2*sizeof(PetscReal),&lnorm2);CHKERRQ(ierr);
520       ierr =  MatNorm(baij->A,type,lnorm2);CHKERRQ(ierr);
521       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2++;            /* squar power of norm(A) */
522       ierr =  MatNorm(baij->B,type,lnorm2);CHKERRQ(ierr);
523       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2--;             /* squar power of norm(B) */
524       /*
525       ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
526       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], lnorm2=%g, %g\n",rank,lnorm2[0],lnorm2[1]);
527       */
528       ierr = MPI_Allreduce(lnorm2,&sum,2,MPIU_REAL,MPI_SUM,mat->comm);CHKERRQ(ierr);
529       /*
530       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], sum=%g, %g\n",rank,sum[0],sum[1]);
531       PetscSynchronizedFlush(PETSC_COMM_WORLD); */
532 
533       *norm = sqrt(sum[0] + 2*sum[1]);
534       ierr = PetscFree(lnorm2);CHKERRQ(ierr);
535     } else {
536       SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
537     }
538   }
539   PetscFunctionReturn(0);
540 }
541 
/*
  Creates the hash table and fills it in.
  This table is created only once.
  If new entries need to be added to the matrix
  then the hash table has to be destroyed and
  recreated.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateHashTable_MPISBAIJ_Private"
/* Placeholder: hash-table construction has not been implemented for the
   SBAIJ format yet, so this always raises an error.  Reached from
   MatAssemblyEnd_MPISBAIJ when baij->ht_flag is set. */
int MatCreateHashTable_MPISBAIJ_Private(Mat mat,PetscReal factor)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
557 
558 #undef __FUNCT__
559 #define __FUNCT__ "MatAssemblyBegin_MPISBAIJ"
560 int MatAssemblyBegin_MPISBAIJ(Mat mat,MatAssemblyType mode)
561 {
562   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
563   int         ierr,nstash,reallocs;
564   InsertMode  addv;
565 
566   PetscFunctionBegin;
567   if (baij->donotstash) {
568     PetscFunctionReturn(0);
569   }
570 
571   /* make sure all processors are either in INSERTMODE or ADDMODE */
572   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
573   if (addv == (ADD_VALUES|INSERT_VALUES)) {
574     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
575   }
576   mat->insertmode = addv; /* in case this processor had no cache */
577 
578   ierr = MatStashScatterBegin_Private(&mat->stash,baij->rowners_bs);CHKERRQ(ierr);
579   ierr = MatStashScatterBegin_Private(&mat->bstash,baij->rowners);CHKERRQ(ierr);
580   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
581   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Stash has %d entries,uses %d mallocs.\n",nstash,reallocs);
582   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
583   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Block-Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
584   PetscFunctionReturn(0);
585 }
586 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_MPISBAIJ"
/*
   Completes assembly: drains the point and block stashes (inserting the
   received off-process values locally), assembles the local submatrices
   A and B, coordinates disassembly across processes when the nonzero
   structure of B may have changed, and sets up the parallel multiply
   machinery on the first final assembly.
*/
int MatAssemblyEnd_MPISBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPISBAIJ *baij=(Mat_MPISBAIJ*)mat->data;
  Mat_SeqSBAIJ  *a=(Mat_SeqSBAIJ*)baij->A->data;
  Mat_SeqBAIJ  *b=(Mat_SeqBAIJ*)baij->B->data;
  int         i,j,rstart,ncols,n,ierr,flg,bs2=baij->bs2;
  int         *row,*col,other_disassembled;
  PetscTruth  r1,r2,r3;
  MatScalar   *val;
  InsertMode  addv = mat->insertmode;
  /* int         rank;*/

  PetscFunctionBegin;
  /* remove 2 line below later */
  /*ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr); */

  if (!baij->donotstash) {
    /* drain the point stash: each message holds rows sorted by row index */
    while (1) {
      ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
      /*
      PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d]: in AssemblyEnd, stash, flg=%d\n",rank,flg);
      PetscSynchronizedFlush(PETSC_COMM_WORLD);
      */
      if (!flg) break;

      for (i=0; i<n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
        if (j < n) ncols = j-i;
        else       ncols = n-i;
        /* Now assemble all these values with a single function call */
        ierr = MatSetValues_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
        i = j;
      }
    }
    ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
    /* Now process the block-stash. Since the values are stashed column-oriented,
       set the roworiented flag to column oriented, and after MatSetValues()
       restore the original flags */
    r1 = baij->roworiented;
    r2 = a->roworiented;
    r3 = b->roworiented;
    baij->roworiented = PETSC_FALSE;
    a->roworiented    = PETSC_FALSE;
    b->roworiented    = PETSC_FALSE;
    while (1) {
      ierr = MatStashScatterGetMesg_Private(&mat->bstash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
      if (!flg) break;

      for (i=0; i<n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
        if (j < n) ncols = j-i;
        else       ncols = n-i;
        ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i*bs2,addv);CHKERRQ(ierr);
        i = j;
      }
    }
    ierr = MatStashScatterEnd_Private(&mat->bstash);CHKERRQ(ierr);
    baij->roworiented = r1;
    a->roworiented    = r2;
    b->roworiented    = r3;
  }

  ierr = MatAssemblyBegin(baij->A,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode);CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselfs, in order that we may reassemble. */
  /*
     if nonzero structure of submatrix B cannot change then we know that
     no processor disassembled thus we can skip this stuff
  */
  if (!((Mat_SeqBAIJ*)baij->B->data)->nonew)  {
    /* MPI_PROD of was_assembled: zero iff at least one process disassembled */
    ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
    if (mat->was_assembled && !other_disassembled) {
      ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
    }
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: build the scatter context for parallel multiplies */
    ierr = MatSetUpMultiply_MPISBAIJ(mat);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode);CHKERRQ(ierr);

#if defined(PETSC_USE_BOPT_g)
  if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    PetscLogInfo(0,"MatAssemblyEnd_MPISBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",((PetscReal)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  /* NOTE(review): the HT path below calls MatCreateHashTable_MPISBAIJ_Private,
     which currently always errors for SBAIJ -- verify ht_flag is never set */
  if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatCreateHashTable_MPISBAIJ_Private(mat,baij->ht_fact);CHKERRQ(ierr);
    mat->ops->setvalues        = MatSetValues_MPISBAIJ_HT;
    mat->ops->setvaluesblocked = MatSetValuesBlocked_MPISBAIJ_HT;
  }

  /* the cached row workspace (used by MatGetRow) is invalid after assembly */
  if (baij->rowvalues) {
    ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);
    baij->rowvalues = 0;
  }
  PetscFunctionReturn(0);
}
694 
695 #undef __FUNCT__
696 #define __FUNCT__ "MatView_MPISBAIJ_ASCIIorDraworSocket"
697 static int MatView_MPISBAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
698 {
699   Mat_MPISBAIJ      *baij = (Mat_MPISBAIJ*)mat->data;
700   int               ierr,bs = baij->bs,size = baij->size,rank = baij->rank;
701   PetscTruth        isascii,isdraw;
702   PetscViewer       sviewer;
703   PetscViewerFormat format;
704 
705   PetscFunctionBegin;
706   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
707   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
708   if (isascii) {
709     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
710     if (format == PETSC_VIEWER_ASCII_INFO_LONG) {
711       MatInfo info;
712       ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
713       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
714       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
715               rank,mat->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
716               baij->bs,(int)info.memory);CHKERRQ(ierr);
717       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
718       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
719       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
720       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
721       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
722       ierr = VecScatterView(baij->Mvctx,viewer);CHKERRQ(ierr);
723       PetscFunctionReturn(0);
724     } else if (format == PETSC_VIEWER_ASCII_INFO) {
725       ierr = PetscViewerASCIIPrintf(viewer,"  block size is %d\n",bs);CHKERRQ(ierr);
726       PetscFunctionReturn(0);
727     }
728   }
729 
730   if (isdraw) {
731     PetscDraw       draw;
732     PetscTruth isnull;
733     ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
734     ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
735   }
736 
737   if (size == 1) {
738     ierr = PetscObjectSetName((PetscObject)baij->A,mat->name);CHKERRQ(ierr);
739     ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
740   } else {
741     /* assemble the entire matrix onto first processor. */
742     Mat         A;
743     Mat_SeqSBAIJ *Aloc;
744     Mat_SeqBAIJ *Bloc;
745     int         M = mat->M,N = mat->N,*ai,*aj,col,i,j,k,*rvals,mbs = baij->mbs;
746     MatScalar   *a;
747 
748     if (!rank) {
749       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
750     } else {
751       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
752     }
753     PetscLogObjectParent(mat,A);
754 
755     /* copy over the A part */
756     Aloc  = (Mat_SeqSBAIJ*)baij->A->data;
757     ai    = Aloc->i; aj = Aloc->j; a = Aloc->a;
758     ierr  = PetscMalloc(bs*sizeof(int),&rvals);CHKERRQ(ierr);
759 
760     for (i=0; i<mbs; i++) {
761       rvals[0] = bs*(baij->rstart + i);
762       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
763       for (j=ai[i]; j<ai[i+1]; j++) {
764         col = (baij->cstart+aj[j])*bs;
765         for (k=0; k<bs; k++) {
766           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
767           col++; a += bs;
768         }
769       }
770     }
771     /* copy over the B part */
772     Bloc = (Mat_SeqBAIJ*)baij->B->data;
773     ai = Bloc->i; aj = Bloc->j; a = Bloc->a;
774     for (i=0; i<mbs; i++) {
775       rvals[0] = bs*(baij->rstart + i);
776       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
777       for (j=ai[i]; j<ai[i+1]; j++) {
778         col = baij->garray[aj[j]]*bs;
779         for (k=0; k<bs; k++) {
780           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
781           col++; a += bs;
782         }
783       }
784     }
785     ierr = PetscFree(rvals);CHKERRQ(ierr);
786     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
787     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
788     /*
789        Everyone has to call to draw the matrix since the graphics waits are
790        synchronized across all processors that share the PetscDraw object
791     */
792     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
793     if (!rank) {
794       ierr = PetscObjectSetName((PetscObject)((Mat_MPISBAIJ*)(A->data))->A,mat->name);CHKERRQ(ierr);
795       ierr = MatView(((Mat_MPISBAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
796     }
797     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
798     ierr = MatDestroy(A);CHKERRQ(ierr);
799   }
800   PetscFunctionReturn(0);
801 }
802 
803 #undef __FUNCT__
804 #define __FUNCT__ "MatView_MPISBAIJ"
805 int MatView_MPISBAIJ(Mat mat,PetscViewer viewer)
806 {
807   int        ierr;
808   PetscTruth isascii,isdraw,issocket,isbinary;
809 
810   PetscFunctionBegin;
811   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
812   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
813   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
814   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_BINARY,&isbinary);CHKERRQ(ierr);
815   if (isascii || isdraw || issocket || isbinary) {
816     ierr = MatView_MPISBAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
817   } else {
818     SETERRQ1(1,"Viewer type %s not supported by MPISBAIJ matrices",((PetscObject)viewer)->type_name);
819   }
820   PetscFunctionReturn(0);
821 }
822 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPISBAIJ"
/*
   MatDestroy_MPISBAIJ - Frees all storage owned by an MPISBAIJ matrix:
   the two assembly stashes, the ownership table, the sequential diagonal
   (A) and off-diagonal (B) blocks, the column map, the scatter context,
   and all optional work arrays.  Optional members are guarded because
   they are only allocated on demand.
*/
int MatDestroy_MPISBAIJ(Mat mat)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  int         ierr;

  PetscFunctionBegin;
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)mat,"Rows=%d,Cols=%d",mat->M,mat->N);
#endif
  /* stashes hold off-process entries cached during assembly */
  ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
  ierr = MatStashDestroy_Private(&mat->bstash);CHKERRQ(ierr);
  /* rowners is one allocation shared with cowners/rowners_bs (see MatCreate_MPISBAIJ) */
  ierr = PetscFree(baij->rowners);CHKERRQ(ierr);
  ierr = MatDestroy(baij->A);CHKERRQ(ierr);
  ierr = MatDestroy(baij->B);CHKERRQ(ierr);
  /* colmap is a hash table or a plain array depending on build configuration */
#if defined (PETSC_USE_CTABLE)
  if (baij->colmap) {ierr = PetscTableDelete(baij->colmap);CHKERRQ(ierr);}
#else
  if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
#endif
  if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
  /* lvec/Mvctx support matrix-vector products; allocated by MatSetUpMultiply */
  if (baij->lvec)   {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
  if (baij->Mvctx)  {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
  /* rowvalues is the lazily-allocated MatGetRow() workspace (also holds rowindices) */
  if (baij->rowvalues) {ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);}
  if (baij->barray) {ierr = PetscFree(baij->barray);CHKERRQ(ierr);}
  if (baij->hd) {ierr = PetscFree(baij->hd);CHKERRQ(ierr);}
#if defined(PETSC_USE_MAT_SINGLE)
  if (baij->setvaluescopy) {ierr = PetscFree(baij->setvaluescopy);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(baij);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
856 
857 #undef __FUNCT__
858 #define __FUNCT__ "MatMult_MPISBAIJ"
859 int MatMult_MPISBAIJ(Mat A,Vec xx,Vec yy)
860 {
861   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
862   int         ierr,nt;
863 
864   PetscFunctionBegin;
865   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
866   if (nt != A->n) {
867     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
868   }
869   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
870   if (nt != A->m) {
871     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible parition of A and yy");
872   }
873 
874   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
875   /* do diagonal part */
876   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
877   /* do supperdiagonal part */
878   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
879   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
880   /* do subdiagonal part */
881   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
882   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
883   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
884 
885   PetscFunctionReturn(0);
886 }
887 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_MPISBAIJ"
/*
   MatMultAdd_MPISBAIJ - Computes zz = yy + mat * xx.

   Same three-phase structure as MatMult_MPISBAIJ: local diagonal block,
   stored superdiagonal part of B, then B^T for the subdiagonal part with a
   reverse additive scatter.  The forward scatter is started before the
   local multiply so communication overlaps computation.
*/
int MatMultAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  /* do diagonal part */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  /* do superdiagonal part */
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);

  /* do subdiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
910 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPISBAIJ"
/*
   MatMultTranspose_MPISBAIJ - Intentionally an error stub: a symmetric
   matrix satisfies A^T = A, so callers should use MatMult() instead.
*/
int MatMultTranspose_MPISBAIJ(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  SETERRQ(1,"Matrix is symmetric. Call MatMult().");
  /* PetscFunctionReturn(0); */
}
919 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_MPISBAIJ"
/*
   MatMultTransposeAdd_MPISBAIJ - Intentionally an error stub: a symmetric
   matrix satisfies A^T = A, so callers should use MatMultAdd() instead.
*/
int MatMultTransposeAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
  /* PetscFunctionReturn(0); */
}
928 
929 /*
930   This only works correctly for square matrices where the subblock A->A is the
931    diagonal block
932 */
933 #undef __FUNCT__
934 #define __FUNCT__ "MatGetDiagonal_MPISBAIJ"
935 int MatGetDiagonal_MPISBAIJ(Mat A,Vec v)
936 {
937   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
938   int         ierr;
939 
940   PetscFunctionBegin;
941   /* if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block"); */
942   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
943   PetscFunctionReturn(0);
944 }
945 
946 #undef __FUNCT__
947 #define __FUNCT__ "MatScale_MPISBAIJ"
948 int MatScale_MPISBAIJ(PetscScalar *aa,Mat A)
949 {
950   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
951   int         ierr;
952 
953   PetscFunctionBegin;
954   ierr = MatScale(aa,a->A);CHKERRQ(ierr);
955   ierr = MatScale(aa,a->B);CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 
#undef __FUNCT__
#define __FUNCT__ "MatGetRow_MPISBAIJ"
/*
   MatGetRow_MPISBAIJ - Returns the nonzeros of one locally owned (global)
   row, merging the diagonal block (A) and off-diagonal block (B) entries
   into a single list ordered by increasing global column number.

   Workspace for the merged row is allocated lazily on the first call and
   reused thereafter; MatRestoreRow_MPISBAIJ() only clears the active flag.
   idx and/or v may be PETSC_NULL if the caller does not want them.
*/
int MatGetRow_MPISBAIJ(Mat matin,int row,int *nz,int **idx,PetscScalar **v)
{
  Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
  PetscScalar    *vworkA,*vworkB,**pvA,**pvB,*v_p;
  int            bs = mat->bs,bs2 = mat->bs2,i,ierr,*cworkA,*cworkB,**pcA,**pcB;
  int            nztot,nzA,nzB,lrow,brstart = mat->rstart*bs,brend = mat->rend*bs;
  int            *cmap,*idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  /* only one row may be checked out at a time */
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqSBAIJ *Aa = (Mat_SeqSBAIJ*)mat->A->data;
    Mat_SeqBAIJ  *Ba = (Mat_SeqBAIJ*)mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for (i=0; i<mbs; i++) {
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i]; /* row length */
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds both values and indices; indices start after the values */
    ierr = PetscMalloc(max*bs2*(sizeof(int)+sizeof(PetscScalar)),&mat->rowvalues);CHKERRQ(ierr);
    mat->rowindices = (int*)(mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,"Only local rows")
  lrow = row - brstart;  /* local row index */

  /* pass NULL for any output the caller did not request */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        /* imark = number of leading B entries whose global column lies left of the diagonal block */
        for (i=0; i<nzB; i++) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
        for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          for (i=0; i<imark; i++) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          /* v was not requested, so the split point has not been found yet */
          for (i=0; i<nzB; i++) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        /* A-block columns are local; shift by cstart*bs to make them global */
        for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart*bs + cworkA[i];
        for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1040 
1041 #undef __FUNCT__
1042 #define __FUNCT__ "MatRestoreRow_MPISBAIJ"
1043 int MatRestoreRow_MPISBAIJ(Mat mat,int row,int *nz,int **idx,PetscScalar **v)
1044 {
1045   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1046 
1047   PetscFunctionBegin;
1048   if (baij->getrowactive == PETSC_FALSE) {
1049     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
1050   }
1051   baij->getrowactive = PETSC_FALSE;
1052   PetscFunctionReturn(0);
1053 }
1054 
1055 #undef __FUNCT__
1056 #define __FUNCT__ "MatGetBlockSize_MPISBAIJ"
1057 int MatGetBlockSize_MPISBAIJ(Mat mat,int *bs)
1058 {
1059   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1060 
1061   PetscFunctionBegin;
1062   *bs = baij->bs;
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 #undef __FUNCT__
1067 #define __FUNCT__ "MatZeroEntries_MPISBAIJ"
1068 int MatZeroEntries_MPISBAIJ(Mat A)
1069 {
1070   Mat_MPISBAIJ *l = (Mat_MPISBAIJ*)A->data;
1071   int         ierr;
1072 
1073   PetscFunctionBegin;
1074   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
1075   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 #undef __FUNCT__
1080 #define __FUNCT__ "MatGetInfo_MPISBAIJ"
1081 int MatGetInfo_MPISBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1082 {
1083   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)matin->data;
1084   Mat         A = a->A,B = a->B;
1085   int         ierr;
1086   PetscReal   isend[5],irecv[5];
1087 
1088   PetscFunctionBegin;
1089   info->block_size     = (PetscReal)a->bs;
1090   ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
1091   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1092   isend[3] = info->memory;  isend[4] = info->mallocs;
1093   ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
1094   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1095   isend[3] += info->memory;  isend[4] += info->mallocs;
1096   if (flag == MAT_LOCAL) {
1097     info->nz_used      = isend[0];
1098     info->nz_allocated = isend[1];
1099     info->nz_unneeded  = isend[2];
1100     info->memory       = isend[3];
1101     info->mallocs      = isend[4];
1102   } else if (flag == MAT_GLOBAL_MAX) {
1103     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_MAX,matin->comm);CHKERRQ(ierr);
1104     info->nz_used      = irecv[0];
1105     info->nz_allocated = irecv[1];
1106     info->nz_unneeded  = irecv[2];
1107     info->memory       = irecv[3];
1108     info->mallocs      = irecv[4];
1109   } else if (flag == MAT_GLOBAL_SUM) {
1110     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_SUM,matin->comm);CHKERRQ(ierr);
1111     info->nz_used      = irecv[0];
1112     info->nz_allocated = irecv[1];
1113     info->nz_unneeded  = irecv[2];
1114     info->memory       = irecv[3];
1115     info->mallocs      = irecv[4];
1116   } else {
1117     SETERRQ1(1,"Unknown MatInfoType argument %d",flag);
1118   }
1119   info->rows_global       = (PetscReal)A->M;
1120   info->columns_global    = (PetscReal)A->N;
1121   info->rows_local        = (PetscReal)A->m;
1122   info->columns_local     = (PetscReal)A->N;
1123   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1124   info->fill_ratio_needed = 0;
1125   info->factor_mallocs    = 0;
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 #undef __FUNCT__
1130 #define __FUNCT__ "MatSetOption_MPISBAIJ"
1131 int MatSetOption_MPISBAIJ(Mat A,MatOption op)
1132 {
1133   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
1134   int         ierr;
1135 
1136   PetscFunctionBegin;
1137   switch (op) {
1138   case MAT_NO_NEW_NONZERO_LOCATIONS:
1139   case MAT_YES_NEW_NONZERO_LOCATIONS:
1140   case MAT_COLUMNS_UNSORTED:
1141   case MAT_COLUMNS_SORTED:
1142   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1143   case MAT_KEEP_ZEROED_ROWS:
1144   case MAT_NEW_NONZERO_LOCATION_ERR:
1145     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1146     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1147     break;
1148   case MAT_ROW_ORIENTED:
1149     a->roworiented = PETSC_TRUE;
1150     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1151     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1152     break;
1153   case MAT_ROWS_SORTED:
1154   case MAT_ROWS_UNSORTED:
1155   case MAT_YES_NEW_DIAGONALS:
1156   case MAT_USE_SINGLE_PRECISION_SOLVES:
1157     PetscLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1158     break;
1159   case MAT_COLUMN_ORIENTED:
1160     a->roworiented = PETSC_FALSE;
1161     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1162     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1163     break;
1164   case MAT_IGNORE_OFF_PROC_ENTRIES:
1165     a->donotstash = PETSC_TRUE;
1166     break;
1167   case MAT_NO_NEW_DIAGONALS:
1168     SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
1169     break;
1170   case MAT_USE_HASH_TABLE:
1171     a->ht_flag = PETSC_TRUE;
1172     break;
1173   default:
1174     SETERRQ(PETSC_ERR_SUP,"unknown option");
1175     break;
1176   }
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatTranspose_MPISBAIJ("
1182 int MatTranspose_MPISBAIJ(Mat A,Mat *matout)
1183 {
1184   PetscFunctionBegin;
1185   SETERRQ(1,"Matrix is symmetric. MatTranspose() should not be called");
1186   /* PetscFunctionReturn(0); */
1187 }
1188 
#undef __FUNCT__
#define __FUNCT__ "MatDiagonalScale_MPISBAIJ"
/*
   MatDiagonalScale_MPISBAIJ - Computes mat = diag(ll) * mat * diag(rr).

   For the symmetric format the left and right scaling vectors must be the
   same object (otherwise symmetry would be destroyed), which is enforced
   up front.  The off-diagonal block B is left-scaled locally while the
   scatter of rr is in flight, then right-scaled with the gathered values.

   NOTE(review): the commented-out braces (the former "if (ll)" / "if (rr)"
   structure) mean everything below runs only when rr is non-null; since
   ll == rr is required, a single NULL/non-NULL check covers both.  Verify
   that callers never pass ll == rr == PETSC_NULL expecting a no-op error.
*/
int MatDiagonalScale_MPISBAIJ(Mat mat,Vec ll,Vec rr)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  Mat         a = baij->A,b = baij->B;
  int         ierr,s1,s2,s3;

  PetscFunctionBegin;
  if (ll != rr) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
  }
  /* s2 x s3 = local row x column sizes of the parallel matrix */
  ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
  if (rr) {
    ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
    if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
    /* Overlap communication with computation. */
    ierr = VecScatterBegin(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
    /*} if (ll) { */
    ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
    if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
    ierr = (*b->ops->diagonalscale)(b,ll,PETSC_NULL);CHKERRQ(ierr);
    /* } */
  /* scale  the diagonal block */
  ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);

  /* if (rr) { */
    /* Do a scatter end and then right scale the off-diagonal block */
    ierr = VecScatterEnd(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
    ierr = (*b->ops->diagonalscale)(b,PETSC_NULL,baij->lvec);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}
1223 
#undef __FUNCT__
#define __FUNCT__ "MatZeroRows_MPISBAIJ"
/*
   MatZeroRows_MPISBAIJ - Zeros the (global) rows listed in is, optionally
   placing *diag on the diagonal of each zeroed row.

   The row indices in is may refer to rows owned by any process, so the
   routine first runs an all-to-all rendezvous: each process determines the
   owner of every index it holds, sends the indices to their owners
   (MPI_Isend), and receives the indices it owns from everyone else
   (MPI_Irecv/MPI_Waitany).  Each process then zeros its own rows locally.
*/
int MatZeroRows_MPISBAIJ(Mat A,IS is,PetscScalar *diag)
{
  Mat_MPISBAIJ   *l = (Mat_MPISBAIJ*)A->data;
  int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;
  PetscTruth     found;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N);CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
  ierr  = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
  procs = nprocs + size;
  ierr  = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note*/
  for (i=0; i<N; i++) {
    idx   = rows[i];
    found = PETSC_FALSE;
    /* owners[] stores block-row boundaries, so scale by bs to compare point rows */
    for (j=0; j<size; j++) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
  }
  nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  ierr   = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
  ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  nrecvs = work[size+rank];
  ierr   = PetscFree(work);CHKERRQ(ierr);

  /* post receives:   */
  ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
  ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
  starts[0]  = 0;
  for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* bucket the indices by destination process */
  for (i=0; i<N; i++) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);

  /* rebuild starts[] (the bucketing loop above consumed it) */
  starts[0] = 0;
  for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for (i=0; i<size; i++) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(starts);CHKERRQ(ierr);

  base = owners[rank]*bs;

  /*  wait on receives */
  ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]    = n;
    slen          += n;
    count--;
  }
  ierr = PetscFree(recv_waits);CHKERRQ(ierr);

  /* move the data into the send scatter */
  ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
  count = 0;
  for (i=0; i<nrecvs; i++) {
    values = rvalues + i*nmax;
    for (j=0; j<lens[i]; j++) {
      lrows[count++] = values[j] - base;  /* convert to local row numbering */
    }
  }
  ierr = PetscFree(rvalues);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PetscLogObjectParent(A,istmp);

  /*
        Zero the required rows. If the "diagonal block" of the matrix
     is square and the user wishes to set the diagonal we use separate
     code so that MatSetValues() is not called for each diagonal allocating
     new memory, thus calling lots of mallocs and slowing things down.

       Contributed by: Mathew Knepley
  */
  /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
  ierr = MatZeroRows_SeqBAIJ(l->B,istmp,0);CHKERRQ(ierr);
  if (diag && (l->A->M == l->A->N)) {
    ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,diag);CHKERRQ(ierr);
  } else if (diag) {
    ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
    if (((Mat_SeqSBAIJ*)l->A->data)->nonew) {
      SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
    }
    /* insert the diagonal entries one at a time; requires a reassembly */
    for (i=0; i<slen; i++) {
      row  = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
  }

  ierr = ISDestroy(istmp);CHKERRQ(ierr);
  ierr = PetscFree(lrows);CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    ierr        = PetscFree(send_status);CHKERRQ(ierr);
  }
  ierr = PetscFree(send_waits);CHKERRQ(ierr);
  ierr = PetscFree(svalues);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatPrintHelp_MPISBAIJ"
1379 int MatPrintHelp_MPISBAIJ(Mat A)
1380 {
1381   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1382   MPI_Comm    comm = A->comm;
1383   static int  called = 0;
1384   int         ierr;
1385 
1386   PetscFunctionBegin;
1387   if (!a->rank) {
1388     ierr = MatPrintHelp_SeqSBAIJ(a->A);CHKERRQ(ierr);
1389   }
1390   if (called) {PetscFunctionReturn(0);} else called = 1;
1391   ierr = (*PetscHelpPrintf)(comm," Options for MATMPISBAIJ matrix format (the defaults):\n");CHKERRQ(ierr);
1392   ierr = (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
1396 #undef __FUNCT__
1397 #define __FUNCT__ "MatSetUnfactored_MPISBAIJ"
1398 int MatSetUnfactored_MPISBAIJ(Mat A)
1399 {
1400   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1401   int         ierr;
1402 
1403   PetscFunctionBegin;
1404   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 static int MatDuplicate_MPISBAIJ(Mat,MatDuplicateOption,Mat *);
1409 
1410 #undef __FUNCT__
1411 #define __FUNCT__ "MatEqual_MPISBAIJ"
1412 int MatEqual_MPISBAIJ(Mat A,Mat B,PetscTruth *flag)
1413 {
1414   Mat_MPISBAIJ *matB = (Mat_MPISBAIJ*)B->data,*matA = (Mat_MPISBAIJ*)A->data;
1415   Mat         a,b,c,d;
1416   PetscTruth  flg;
1417   int         ierr;
1418 
1419   PetscFunctionBegin;
1420   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg);CHKERRQ(ierr);
1421   if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1422   a = matA->A; b = matA->B;
1423   c = matB->A; d = matB->B;
1424 
1425   ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
1426   if (flg == PETSC_TRUE) {
1427     ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
1428   }
1429   ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
1430   PetscFunctionReturn(0);
1431 }
1432 
#undef __FUNCT__
#define __FUNCT__ "MatSetUpPreallocation_MPISBAIJ"
/*
   MatSetUpPreallocation_MPISBAIJ - Fallback used when the caller never
   preallocated: block size 1, default per-row nonzero estimates.
*/
int MatSetUpPreallocation_MPISBAIJ(Mat A)
{
  int        ierr;

  PetscFunctionBegin;
  ierr = MatMPISBAIJSetPreallocation(A,1,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------*/
/*
   MatOps_Values - Positional function table copied into every MPISBAIJ
   matrix by MatCreate_MPISBAIJ().  Slot order is fixed by struct _MatOps;
   a 0 entry means the operation is unsupported for this format.
   Do NOT reorder entries.
*/
static struct _MatOps MatOps_Values = {
  MatSetValues_MPISBAIJ,
  MatGetRow_MPISBAIJ,
  MatRestoreRow_MPISBAIJ,
  MatMult_MPISBAIJ,
  MatMultAdd_MPISBAIJ,
  MatMultTranspose_MPISBAIJ,
  MatMultTransposeAdd_MPISBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  MatTranspose_MPISBAIJ,
  MatGetInfo_MPISBAIJ,
  MatEqual_MPISBAIJ,
  MatGetDiagonal_MPISBAIJ,
  MatDiagonalScale_MPISBAIJ,
  MatNorm_MPISBAIJ,
  MatAssemblyBegin_MPISBAIJ,
  MatAssemblyEnd_MPISBAIJ,
  0,
  MatSetOption_MPISBAIJ,
  MatZeroEntries_MPISBAIJ,
  MatZeroRows_MPISBAIJ,
  0,
  0,
  0,
  0,
  MatSetUpPreallocation_MPISBAIJ,
  0,
  0,
  0,
  0,
  0,
  MatDuplicate_MPISBAIJ,
  0,
  0,
  0,
  0,
  0,
  MatGetSubMatrices_MPISBAIJ,
  MatIncreaseOverlap_MPISBAIJ,
  MatGetValues_MPISBAIJ,
  0,
  MatPrintHelp_MPISBAIJ,
  MatScale_MPISBAIJ,
  0,
  0,
  0,
  MatGetBlockSize_MPISBAIJ,
  0,
  0,
  0,
  0,
  0,
  0,
  MatSetUnfactored_MPISBAIJ,
  0,
  MatSetValuesBlocked_MPISBAIJ,
  0,
  0,
  0,
  MatGetPetscMaps_Petsc,
  0,
  0,
  0,
  0,
  0,
  0,
  MatGetRowMax_MPISBAIJ};
1517 
1518 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetDiagonalBlock_MPISBAIJ"
/*
   MatGetDiagonalBlock_MPISBAIJ - Returns a reference (not a copy, hence
   *iscopy = PETSC_FALSE) to the local sequential diagonal block.
   The reuse argument is intentionally unused since no copy is made.
*/
int MatGetDiagonalBlock_MPISBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
{
  PetscFunctionBegin;
  *a      = ((Mat_MPISBAIJ *)A->data)->A;
  *iscopy = PETSC_FALSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
1530 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPISBAIJ"
/*
   MatCreate_MPISBAIJ - Type constructor registered for MATMPISBAIJ.

   Allocates and zeros the Mat_MPISBAIJ data structure, installs the
   function table, sets up the ownership arrays and assembly stash, and
   initializes all cached/lazy members to empty.  Actual storage is
   allocated later by MatMPISBAIJSetPreallocation().
*/
int MatCreate_MPISBAIJ(Mat B)
{
  Mat_MPISBAIJ *b;
  int          ierr;
  PetscTruth   flg;

  PetscFunctionBegin;

  ierr    = PetscNew(Mat_MPISBAIJ,&b);CHKERRQ(ierr);
  B->data = (void*)b;
  ierr    = PetscMemzero(b,sizeof(Mat_MPISBAIJ));CHKERRQ(ierr);
  ierr    = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);

  B->ops->destroy    = MatDestroy_MPISBAIJ;
  B->ops->view       = MatView_MPISBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(B->comm,&b->size);CHKERRQ(ierr);

  /* build local table of row and column ownerships; one allocation is
     shared by rowners, cowners and rowners_bs (freed together in destroy) */
  ierr          = PetscMalloc(3*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
  b->cowners    = b->rowners + b->size + 2;
  b->rowners_bs = b->cowners + b->size + 2;
  PetscLogObjectMemory(B,3*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));

  /* build cache for off array entries formed */
  ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
  b->donotstash  = PETSC_FALSE;
  b->colmap      = PETSC_NULL;
  b->garray      = PETSC_NULL;
  b->roworiented = PETSC_TRUE;

#if defined(PETSC_USE_MAT_SINGLE)
  /* stuff for MatSetValues_XXX in single precision */
  b->setvalueslen     = 0;
  b->setvaluescopy    = PETSC_NULL;
#endif

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht           = 0;
  b->hd           = 0;
  b->ht_size      = 0;
  b->ht_flag      = PETSC_FALSE;
  b->ht_fact      = 0;
  b->ht_total_ct  = 0;
  b->ht_insert_ct = 0;

  ierr = PetscOptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg);CHKERRQ(ierr);
  if (flg) {
    PetscReal fact = 1.39;
    ierr = MatSetOption(B,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
    ierr = PetscOptionsGetReal(PETSC_NULL,"-mat_use_hash_table",&fact,PETSC_NULL);CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    /* NOTE(review): this calls the BAIJ (not SBAIJ) setter -- presumably the
       hash-table factor is shared between the two formats; confirm */
    ierr = MatMPIBAIJSetHashTableFactor(B,fact);CHKERRQ(ierr);
    PetscLogInfo(0,"MatCreateMPISBAIJ:Hash table Factor used %5.2f\n",fact);
  }
  /* register the composed operations queried via PetscObjectQueryFunction() */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
                                     "MatStoreValues_MPISBAIJ",
                                     MatStoreValues_MPISBAIJ);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
                                     "MatRetrieveValues_MPISBAIJ",
                                     MatRetrieveValues_MPISBAIJ);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
                                     "MatGetDiagonalBlock_MPISBAIJ",
                                     MatGetDiagonalBlock_MPISBAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
1618 
1619 #undef __FUNCT__
1620 #define __FUNCT__ "MatMPISBAIJSetPreallocation"
1621 /*@C
1622    MatMPISBAIJSetPreallocation - For good matrix assembly performance
1623    the user should preallocate the matrix storage by setting the parameters
1624    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1625    performance can be increased by more than a factor of 50.
1626 
1627    Collective on Mat
1628 
1629    Input Parameters:
1630 +  A - the matrix
.  bs   - size of block
1632 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1633            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local submatrix (possibly different for each
           block row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1637 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1638            submatrix (same for all local rows).
1639 -  o_nnz - array containing the number of nonzeros in the various block rows of the
1640            off-diagonal portion of the local submatrix (possibly different for
1641            each block row) or PETSC_NULL.
1642 
1643 
1644    Options Database Keys:
1645 .   -mat_no_unroll - uses code that does not unroll the loops in the
1646                      block calculations (much slower)
1647 .   -mat_block_size - size of the blocks to use
1648 
1649    Notes:
1650 
   If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
   then it must be used on all processors that share the object for that argument.
1653 
1654    Storage Information:
1655    For a square global matrix we define each processor's diagonal portion
1656    to be its local rows and the corresponding columns (a square submatrix);
1657    each processor's off-diagonal portion encompasses the remainder of the
1658    local matrix (a rectangular submatrix).
1659 
1660    The user can specify preallocated storage for the diagonal part of
1661    the local submatrix with either d_nz or d_nnz (not both).  Set
1662    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1663    memory allocation.  Likewise, specify preallocated storage for the
1664    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1665 
1666    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1667    the figure below we depict these three local rows and all columns (0-11).
1668 
1669 .vb
1670            0 1 2 3 4 5 6 7 8 9 10 11
1671           -------------------
1672    row 3  |  o o o d d d o o o o o o
1673    row 4  |  o o o d d d o o o o o o
1674    row 5  |  o o o d d d o o o o o o
1675           -------------------
1676 .ve
1677 
1678    Thus, any entries in the d locations are stored in the d (diagonal)
1679    submatrix, and any entries in the o locations are stored in the
1680    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1681    stored simply in the MATSEQBAIJ format for compressed row storage.
1682 
1683    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1684    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1685    In general, for PDE problems in which most nonzeros are near the diagonal,
1686    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1687    or you will get TERRIBLE performance; see the users' manual chapter on
1688    matrices.
1689 
1690    Level: intermediate
1691 
1692 .keywords: matrix, block, aij, compressed row, sparse, parallel
1693 
1694 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1695 @*/
1696 
1697 int MatMPISBAIJSetPreallocation(Mat B,int bs,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
1698 {
1699   Mat_MPISBAIJ *b;
1700   int          ierr,i,mbs,Mbs;
1701   PetscTruth   flg2;
1702 
1703   PetscFunctionBegin;
1704   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg2);CHKERRQ(ierr);
1705   if (!flg2) PetscFunctionReturn(0);
1706 
1707   ierr = PetscOptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1708 
1709   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Invalid block size specified, must be positive");
1710   if (d_nz == PETSC_DECIDE || d_nz == PETSC_DEFAULT) d_nz = 3;
1711   if (o_nz == PETSC_DECIDE || o_nz == PETSC_DEFAULT) o_nz = 1;
1712   if (d_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %d",d_nz);
1713   if (o_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %d",o_nz);
1714   if (d_nnz) {
1715     for (i=0; i<B->m/bs; i++) {
1716       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than -1: local row %d value %d",i,d_nnz[i]);
1717     }
1718   }
1719   if (o_nnz) {
1720     for (i=0; i<B->m/bs; i++) {
1721       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than -1: local row %d value %d",i,o_nnz[i]);
1722     }
1723   }
1724   B->preallocated = PETSC_TRUE;
1725   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->m,&B->M);CHKERRQ(ierr);
1726   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->n,&B->N);CHKERRQ(ierr);
1727   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1728   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->cmap);CHKERRQ(ierr);
1729 
1730   b   = (Mat_MPISBAIJ*)B->data;
1731   mbs = B->m/bs;
1732   Mbs = B->M/bs;
1733   if (mbs*bs != B->m) {
1734     SETERRQ2(PETSC_ERR_ARG_SIZ,"No of local rows %d must be divisible by blocksize %d",B->m,bs);
1735   }
1736 
1737   b->bs  = bs;
1738   b->bs2 = bs*bs;
1739   b->mbs = mbs;
1740   b->nbs = mbs;
1741   b->Mbs = Mbs;
1742   b->Nbs = Mbs;
1743 
1744   ierr = MPI_Allgather(&b->mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1745   b->rowners[0]    = 0;
1746   for (i=2; i<=b->size; i++) {
1747     b->rowners[i] += b->rowners[i-1];
1748   }
1749   b->rstart    = b->rowners[b->rank];
1750   b->rend      = b->rowners[b->rank+1];
1751   b->cstart    = b->rstart;
1752   b->cend      = b->rend;
1753   for (i=0; i<=b->size; i++) {
1754     b->rowners_bs[i] = b->rowners[i]*bs;
1755   }
1756   b->rstart_bs = b-> rstart*bs;
1757   b->rend_bs   = b->rend*bs;
1758 
1759   b->cstart_bs = b->cstart*bs;
1760   b->cend_bs   = b->cend*bs;
1761 
1762 
1763   ierr = MatCreateSeqSBAIJ(PETSC_COMM_SELF,bs,B->m,B->m,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
1764   PetscLogObjectParent(B,b->A);
1765   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,B->m,B->M,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
1766   PetscLogObjectParent(B,b->B);
1767 
1768   /* build cache for off array entries formed */
1769   ierr = MatStashCreate_Private(B->comm,bs,&B->bstash);CHKERRQ(ierr);
1770 
1771   PetscFunctionReturn(0);
1772 }
1773 
1774 #undef __FUNCT__
1775 #define __FUNCT__ "MatCreateMPISBAIJ"
1776 /*@C
1777    MatCreateMPISBAIJ - Creates a sparse parallel matrix in symmetric block AIJ format
1778    (block compressed row).  For good matrix assembly performance
1779    the user should preallocate the matrix storage by setting the parameters
1780    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1781    performance can be increased by more than a factor of 50.
1782 
1783    Collective on MPI_Comm
1784 
1785    Input Parameters:
1786 +  comm - MPI communicator
.  bs   - size of block
1788 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1789            This value should be the same as the local size used in creating the
1790            y vector for the matrix-vector product y = Ax.
1791 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1792            This value should be the same as the local size used in creating the
1793            x vector for the matrix-vector product y = Ax.
1794 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1795 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1796 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1797            submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           in the diagonal portion of the local submatrix (possibly different for each
           block row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1801 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1802            submatrix (same for all local rows).
1803 -  o_nnz - array containing the number of nonzeros in the various block rows of the
1804            off-diagonal portion of the local submatrix (possibly different for
1805            each block row) or PETSC_NULL.
1806 
1807    Output Parameter:
1808 .  A - the matrix
1809 
1810    Options Database Keys:
1811 .   -mat_no_unroll - uses code that does not unroll the loops in the
1812                      block calculations (much slower)
1813 .   -mat_block_size - size of the blocks to use
1814 .   -mat_mpi - use the parallel matrix data structures even on one processor
1815                (defaults to using SeqBAIJ format on one processor)
1816 
1817    Notes:
1818    The user MUST specify either the local or global matrix dimensions
1819    (possibly both).
1820 
   If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
   then it must be used on all processors that share the object for that argument.
1823 
1824    Storage Information:
1825    For a square global matrix we define each processor's diagonal portion
1826    to be its local rows and the corresponding columns (a square submatrix);
1827    each processor's off-diagonal portion encompasses the remainder of the
1828    local matrix (a rectangular submatrix).
1829 
1830    The user can specify preallocated storage for the diagonal part of
1831    the local submatrix with either d_nz or d_nnz (not both).  Set
1832    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1833    memory allocation.  Likewise, specify preallocated storage for the
1834    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1835 
1836    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1837    the figure below we depict these three local rows and all columns (0-11).
1838 
1839 .vb
1840            0 1 2 3 4 5 6 7 8 9 10 11
1841           -------------------
1842    row 3  |  o o o d d d o o o o o o
1843    row 4  |  o o o d d d o o o o o o
1844    row 5  |  o o o d d d o o o o o o
1845           -------------------
1846 .ve
1847 
1848    Thus, any entries in the d locations are stored in the d (diagonal)
1849    submatrix, and any entries in the o locations are stored in the
1850    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1851    stored simply in the MATSEQBAIJ format for compressed row storage.
1852 
1853    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1854    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1855    In general, for PDE problems in which most nonzeros are near the diagonal,
1856    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1857    or you will get TERRIBLE performance; see the users' manual chapter on
1858    matrices.
1859 
1860    Level: intermediate
1861 
1862 .keywords: matrix, block, aij, compressed row, sparse, parallel
1863 
1864 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1865 @*/
1866 
1867 int MatCreateMPISBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1868 {
1869   int ierr,size;
1870 
1871   PetscFunctionBegin;
1872   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
1873   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1874   if (size > 1) {
1875     ierr = MatSetType(*A,MATMPISBAIJ);CHKERRQ(ierr);
1876     ierr = MatMPISBAIJSetPreallocation(*A,bs,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1877   } else {
1878     ierr = MatSetType(*A,MATSEQSBAIJ);CHKERRQ(ierr);
1879     ierr = MatSeqSBAIJSetPreallocation(*A,bs,d_nz,d_nnz);CHKERRQ(ierr);
1880   }
1881   PetscFunctionReturn(0);
1882 }
1883 
1884 
1885 #undef __FUNCT__
1886 #define __FUNCT__ "MatDuplicate_MPISBAIJ"
1887 static int MatDuplicate_MPISBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1888 {
1889   Mat          mat;
1890   Mat_MPISBAIJ *a,*oldmat = (Mat_MPISBAIJ*)matin->data;
1891   int          ierr,len=0;
1892 
1893   PetscFunctionBegin;
1894   *newmat       = 0;
1895   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1896   ierr = MatSetType(mat,MATMPISBAIJ);CHKERRQ(ierr);
1897   mat->preallocated = PETSC_TRUE;
1898   a = (Mat_MPISBAIJ*)mat->data;
1899   a->bs  = oldmat->bs;
1900   a->bs2 = oldmat->bs2;
1901   a->mbs = oldmat->mbs;
1902   a->nbs = oldmat->nbs;
1903   a->Mbs = oldmat->Mbs;
1904   a->Nbs = oldmat->Nbs;
1905 
1906   a->rstart       = oldmat->rstart;
1907   a->rend         = oldmat->rend;
1908   a->cstart       = oldmat->cstart;
1909   a->cend         = oldmat->cend;
1910   a->size         = oldmat->size;
1911   a->rank         = oldmat->rank;
1912   a->donotstash   = oldmat->donotstash;
1913   a->roworiented  = oldmat->roworiented;
1914   a->rowindices   = 0;
1915   a->rowvalues    = 0;
1916   a->getrowactive = PETSC_FALSE;
1917   a->barray       = 0;
1918   a->rstart_bs    = oldmat->rstart_bs;
1919   a->rend_bs      = oldmat->rend_bs;
1920   a->cstart_bs    = oldmat->cstart_bs;
1921   a->cend_bs      = oldmat->cend_bs;
1922 
1923   /* hash table stuff */
1924   a->ht           = 0;
1925   a->hd           = 0;
1926   a->ht_size      = 0;
1927   a->ht_flag      = oldmat->ht_flag;
1928   a->ht_fact      = oldmat->ht_fact;
1929   a->ht_total_ct  = 0;
1930   a->ht_insert_ct = 0;
1931 
1932   ierr = PetscMalloc(3*(a->size+2)*sizeof(int),&a->rowners);CHKERRQ(ierr);
1933   PetscLogObjectMemory(mat,3*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
1934   a->cowners    = a->rowners + a->size + 2;
1935   a->rowners_bs = a->cowners + a->size + 2;
1936   ierr = PetscMemcpy(a->rowners,oldmat->rowners,3*(a->size+2)*sizeof(int));CHKERRQ(ierr);
1937   ierr = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
1938   ierr = MatStashCreate_Private(matin->comm,oldmat->bs,&mat->bstash);CHKERRQ(ierr);
1939   if (oldmat->colmap) {
1940 #if defined (PETSC_USE_CTABLE)
1941     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
1942 #else
1943     ierr = PetscMalloc((a->Nbs)*sizeof(int),&a->colmap);CHKERRQ(ierr);
1944     PetscLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1945     ierr = PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));CHKERRQ(ierr);
1946 #endif
1947   } else a->colmap = 0;
1948   if (oldmat->garray && (len = ((Mat_SeqBAIJ*)(oldmat->B->data))->nbs)) {
1949     ierr = PetscMalloc(len*sizeof(int),&a->garray);CHKERRQ(ierr);
1950     PetscLogObjectMemory(mat,len*sizeof(int));
1951     ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr);
1952   } else a->garray = 0;
1953 
1954   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
1955   PetscLogObjectParent(mat,a->lvec);
1956   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
1957 
1958   PetscLogObjectParent(mat,a->Mvctx);
1959   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
1960   PetscLogObjectParent(mat,a->A);
1961   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
1962   PetscLogObjectParent(mat,a->B);
1963   ierr = PetscFListDuplicate(mat->qlist,&matin->qlist);CHKERRQ(ierr);
1964   *newmat = mat;
1965   PetscFunctionReturn(0);
1966 }
1967 
1968 #include "petscsys.h"
1969 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatLoad_MPISBAIJ"
/*
   MatLoad_MPISBAIJ - Loads a matrix in PETSc binary format from a viewer
   and assembles it as a parallel MPISBAIJ matrix.

   Only process 0 reads the file; it scatters the row lengths with
   MPI_Scatterv and ships each process its column indices and values with
   MPI_Send. If the global size is not divisible by the block size
   (-matload_block_size, default 1), extra padding rows are appended, each
   holding a single unit diagonal entry; the padding lives on the
   last process.
*/
int MatLoad_MPISBAIJ(PetscViewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i,nz,ierr,j,rstart,rend,fd;
  PetscScalar  *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0,jj,*mycols,*ibuf;
  int          tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  ierr = PetscOptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (!rank) {
    /* header: [MAT_COOKIE, M, N, total-nnz (or <0 for a special format)] */
    ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix stored in special format, cannot load as MPISBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);
  if (extra_rows == bs) extra_rows = 0;
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PetscLogInfo(0,"MatLoad_MPISBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows: block rows split as evenly as
     possible, the first (Mbs mod size) processes get one extra */
  mbs        = Mbs/size + ((Mbs % size) > rank);
  m          = mbs*bs;
  ierr       = PetscMalloc(2*(size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
  browners   = rowners + size + 1;     /* same offsets in scalar (unblocked) rows */
  ierr       = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for (i=2; i<=size; i++) rowners[i] += rowners[i-1];
  for (i=0; i<=size;  i++) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  ierr = PetscMalloc((rend-rstart)*bs*sizeof(int),&locrowlens);CHKERRQ(ierr);
  if (!rank) {
    ierr = PetscMalloc((M+extra_rows)*sizeof(int),&rowlengths);CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
    /* each padding row holds exactly one (diagonal) entry */
    for (i=0; i<extra_rows; i++) rowlengths[M+i] = 1;
    ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
    for (i=0; i<size; i++) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    ierr = PetscFree(sndcounts);CHKERRQ(ierr);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
  }

  if (!rank) {   /* procs[0] */
    /* calculate the number of nonzeros on each processor */
    ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
    ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      for (j=rowners[i]*bs; j< rowners[i+1]*bs; j++) {
        procsnz[i] += rowlengths[j];
      }
    }
    ierr = PetscFree(rowlengths);CHKERRQ(ierr);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for (i=0; i<size; i++) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);

    /* read in my part of the matrix column indices  */
    nz     = procsnz[0];
    ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
    mycols = ibuf;
    /* uniprocessor: the padding entries are not on disk, append them here */
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for (i=1; i<size-1; i++) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    if (size != 1) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
      for (i=0; i<extra_rows; i++) cols[nz+i] = M+i;
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    ierr = PetscFree(cols);CHKERRQ(ierr);
  } else {  /* procs[i], i>0 */
    /* determine buffer space needed for message */
    nz = 0;
    for (i=0; i<m; i++) {
      nz += locrowlens[i];
    }
    ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries */
  ierr     = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&dlens);CHKERRQ(ierr);
  odlens   = dlens + (rend-rstart);
  /* mask flags block columns already seen in the current block row;
     masked1/masked2 remember which flags to clear afterwards */
  ierr     = PetscMalloc(3*Mbs*sizeof(int),&mask);CHKERRQ(ierr);
  ierr     = PetscMemzero(mask,3*Mbs*sizeof(int));CHKERRQ(ierr);
  masked1  = mask    + Mbs;
  masked2  = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for (i=0; i<mbs; i++) {
    dcount  = 0;
    odcount = 0;
    for (j=0; j<bs; j++) {
      kmax = locrowlens[rowcount];
      for (k=0; k<kmax; k++) {
        tmp = mycols[nzcount++]/bs; /* block col. index */
        if (!mask[tmp]) {
          mask[tmp] = 1;
          if (tmp < rstart || tmp >= rend) masked2[odcount++] = tmp; /* entry in off-diag portion */
          else masked1[dcount++] = tmp; /* entry in diag portion */
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;  /* d_nzz[i] */
    odlens[i] = odcount; /* o_nzz[i] */

    /* zero out the mask elements we set */
    for (j=0; j<dcount; j++) mask[masked1[j]] = 0;
    for (j=0; j<odcount; j++) mask[masked2[j]] = 0;
  }

  /* create our matrix, exactly preallocated from dlens/odlens */
  ierr = MatCreateMPISBAIJ(comm,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,0,dlens,0,odlens,newmat);
  CHKERRQ(ierr);
  A = *newmat;
  ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);

  if (!rank) {
    ierr = PetscMalloc(maxnz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
    /* read in my part of the matrix numerical values  */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
    /* unit diagonal for the padding rows */
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }

    /* insert into matrix */
    jj      = rstart*bs;
    for (i=0; i<m; i++) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }

    /* read in other processors (except the last one) and ship out */
    for (i=1; i<size-1; i++) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc */
    if (size != 1){
      /* NOTE: i == size-1 here in all cases (the loop above leaves i at
         size-1 for size>2; for size==2 its init clause sets i=1) */
      nz   = procsnz[i] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
      for (i=0; i<extra_rows; i++) vals[nz+i] = 1.0;
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    ierr = PetscFree(procsnz);CHKERRQ(ierr);

  } else {
    /* receive numeric values */
    ierr = PetscMalloc(nz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for (i=0; i<m; i++) {
      ierr    = MatSetValues_MPISBAIJ(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }

  ierr = PetscFree(locrowlens);CHKERRQ(ierr);
  ierr = PetscFree(buf);CHKERRQ(ierr);
  ierr = PetscFree(ibuf);CHKERRQ(ierr);
  ierr = PetscFree(rowners);CHKERRQ(ierr);
  ierr = PetscFree(dlens);CHKERRQ(ierr);
  ierr = PetscFree(mask);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
2203 
2204 #undef __FUNCT__
2205 #define __FUNCT__ "MatMPISBAIJSetHashTableFactor"
2206 /*@
2207    MatMPISBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2208 
2209    Input Parameters:
2210 .  mat  - the matrix
2211 .  fact - factor
2212 
2213    Collective on Mat
2214 
2215    Level: advanced
2216 
2217   Notes:
2218    This can also be set by the command line option: -mat_use_hash_table fact
2219 
2220 .keywords: matrix, hashtable, factor, HT
2221 
2222 .seealso: MatSetOption()
2223 @*/
int MatMPISBAIJSetHashTableFactor(Mat mat,PetscReal fact)
{
  PetscFunctionBegin;
  /* Hash-table based assembly is not implemented for the SBAIJ format;
     this stub always raises an error and ignores both arguments. */
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
2230 
2231 #undef __FUNCT__
2232 #define __FUNCT__ "MatGetRowMax_MPISBAIJ"
2233 int MatGetRowMax_MPISBAIJ(Mat A,Vec v)
2234 {
2235   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
2236   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(a->B)->data;
2237   PetscReal    atmp;
2238   PetscReal    *work,*svalues,*rvalues;
2239   int          ierr,i,bs,mbs,*bi,*bj,brow,j,ncols,krow,kcol,col,row,Mbs,bcol;
2240   int          rank,size,*rowners_bs,dest,count,source;
2241   PetscScalar  *va;
2242   MatScalar    *ba;
2243   MPI_Status   stat;
2244 
2245   PetscFunctionBegin;
2246   ierr = MatGetRowMax(a->A,v);CHKERRQ(ierr);
2247   ierr = VecGetArray(v,&va);CHKERRQ(ierr);
2248 
2249   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
2250   ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
2251 
2252   bs   = a->bs;
2253   mbs  = a->mbs;
2254   Mbs  = a->Mbs;
2255   ba   = b->a;
2256   bi   = b->i;
2257   bj   = b->j;
2258   /*
2259   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] M: %d, bs: %d, mbs: %d \n",rank,bs*Mbs,bs,mbs);
2260   PetscSynchronizedFlush(PETSC_COMM_WORLD);
2261   */
2262 
2263   /* find ownerships */
2264   rowners_bs = a->rowners_bs;
2265   /*
2266   if (!rank){
2267     for (i=0; i<size+1; i++) PetscPrintf(PETSC_COMM_SELF," rowners_bs[%d]: %d\n",i,rowners_bs[i]);
2268   }
2269   */
2270 
2271   /* each proc creates an array to be distributed */
2272   ierr = PetscMalloc(bs*Mbs*sizeof(PetscReal),&work);CHKERRQ(ierr);
2273   ierr = PetscMemzero(work,bs*Mbs*sizeof(PetscReal));CHKERRQ(ierr);
2274 
2275   /* row_max for B */
2276   if (rank != size-1){
2277     for (i=0; i<mbs; i++) {
2278       ncols = bi[1] - bi[0]; bi++;
2279       brow  = bs*i;
2280       for (j=0; j<ncols; j++){
2281         bcol = bs*(*bj);
2282         for (kcol=0; kcol<bs; kcol++){
2283           col = bcol + kcol;                 /* local col index */
2284           col += rowners_bs[rank+1];      /* global col index */
2285           /* PetscPrintf(PETSC_COMM_SELF,"[%d], col: %d\n",rank,col); */
2286           for (krow=0; krow<bs; krow++){
2287             atmp = PetscAbsScalar(*ba); ba++;
2288             row = brow + krow;    /* local row index */
2289             /* printf("val[%d,%d]: %g\n",row,col,atmp); */
2290             if (PetscRealPart(va[row]) < atmp) va[row] = atmp;
2291             if (work[col] < atmp) work[col] = atmp;
2292           }
2293         }
2294         bj++;
2295       }
2296     }
2297     /*
2298       PetscPrintf(PETSC_COMM_SELF,"[%d], work: ",rank);
2299       for (i=0; i<bs*Mbs; i++) PetscPrintf(PETSC_COMM_SELF,"%g ",work[i]);
2300       PetscPrintf(PETSC_COMM_SELF,"[%d]: \n");
2301       */
2302 
2303     /* send values to its owners */
2304     for (dest=rank+1; dest<size; dest++){
2305       svalues = work + rowners_bs[dest];
2306       count   = rowners_bs[dest+1]-rowners_bs[dest];
2307       ierr    = MPI_Send(svalues,count,MPIU_REAL,dest,rank,PETSC_COMM_WORLD);CHKERRQ(ierr);
2308       /*
2309       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] sends %d values to [%d]: %g, %g, %g, %g\n",rank,count,dest,svalues[0],svalues[1],svalues[2],svalues[3]);
2310       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2311       */
2312     }
2313   }
2314 
2315   /* receive values */
2316   if (rank){
2317     rvalues = work;
2318     count   = rowners_bs[rank+1]-rowners_bs[rank];
2319     for (source=0; source<rank; source++){
2320       ierr = MPI_Recv(rvalues,count,MPIU_REAL,MPI_ANY_SOURCE,MPI_ANY_TAG,PETSC_COMM_WORLD,&stat);CHKERRQ(ierr);
2321       /* process values */
2322       for (i=0; i<count; i++){
2323         if (PetscRealPart(va[i]) < rvalues[i]) va[i] = rvalues[i];
2324       }
2325       /*
2326       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] received %d values from [%d]: %g, %g, %g, %g \n",rank,count,stat.MPI_SOURCE,rvalues[0],rvalues[1],rvalues[2],rvalues[3]);
2327       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2328       */
2329     }
2330   }
2331 
2332   ierr = VecRestoreArray(v,&va);CHKERRQ(ierr);
2333   ierr = PetscFree(work);CHKERRQ(ierr);
2334   PetscFunctionReturn(0);
2335 }
2336