/*$Id: mpisbaij.c,v 1.61 2001/08/10 03:31:37 bsmith Exp $*/

#include "src/mat/impls/baij/mpi/mpibaij.h"    /*I "petscmat.h" I*/
#include "src/vec/vecimpl.h"
#include "mpisbaij.h"
#include "src/mat/impls/sbaij/seq/sbaij.h"

extern int MatSetUpMultiply_MPISBAIJ(Mat);
extern int MatSetUpMultiply_MPISBAIJ_2comm(Mat);
extern int DisAssemble_MPISBAIJ(Mat);
extern int MatIncreaseOverlap_MPISBAIJ(Mat,int,IS *,int);
extern int MatGetSubMatrices_MPISBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
extern int MatGetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *);
extern int MatSetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *,InsertMode);
extern int MatSetValuesBlocked_SeqSBAIJ(Mat,int,int*,int,int*,PetscScalar*,InsertMode);
extern int MatGetRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
extern int MatRestoreRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
extern int MatPrintHelp_SeqSBAIJ(Mat);
extern int MatZeroRows_SeqSBAIJ(Mat,IS,PetscScalar*);
extern int MatZeroRows_SeqBAIJ(Mat,IS,PetscScalar *);
extern int MatGetRowMax_MPISBAIJ(Mat,Vec);
extern int MatRelax_MPISBAIJ(Mat,Vec,PetscReal,MatSORType,PetscReal,int,int,Vec);
extern int MatUseSpooles_MPISBAIJ(Mat);

/*  UGLY, ugly, ugly
   When MatScalar == PetscScalar the function MatSetValuesBlocked_MPISBAIJ_MatScalar() does
   not exist. Otherwise ..._MatScalar() takes matrix elements already in single precision and
   inserts them into the single precision data structure. The function MatSetValuesBlocked_MPISBAIJ()
   converts the entries into single precision and then calls ..._MatScalar() to put them
   into the single precision data structures.
*/
#if defined(PETSC_USE_MAT_SINGLE)
extern int MatSetValuesBlocked_SeqSBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
extern int MatSetValues_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
extern int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
extern int MatSetValues_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
extern int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
#else
#define MatSetValuesBlocked_SeqSBAIJ_MatScalar      MatSetValuesBlocked_SeqSBAIJ
#define MatSetValues_MPISBAIJ_MatScalar             MatSetValues_MPISBAIJ
#define MatSetValuesBlocked_MPISBAIJ_MatScalar      MatSetValuesBlocked_MPISBAIJ
#define MatSetValues_MPISBAIJ_HT_MatScalar          MatSetValues_MPISBAIJ_HT
#define MatSetValuesBlocked_MPISBAIJ_HT_MatScalar   MatSetValuesBlocked_MPISBAIJ_HT
#endif

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatStoreValues_MPISBAIJ"
int MatStoreValues_MPISBAIJ(Mat mat)
{
  Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
  int          ierr;

  PetscFunctionBegin;
  ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
  ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatRetrieveValues_MPISBAIJ"
int MatRetrieveValues_MPISBAIJ(Mat mat)
{
  Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
  int          ierr;

  PetscFunctionBegin;
  ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
  ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*
     Local utility routine that creates a mapping from the global column
   number to the local number in the off-diagonal part of the local
   storage of the matrix. This is done in a non-scalable way since the
   length of colmap equals the global matrix length.
*/
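/*
   As used below (and in the MPIBAIJ analogue), colmap[global block column]
   stores (local block column in B) + 1, with 0 meaning the block column is
   absent from B; hence lookups of the form col = baij->colmap[in[j]/bs] - 1.
*/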
#undef __FUNCT__
#define __FUNCT__ "CreateColmap_MPISBAIJ_Private"
static int CreateColmap_MPISBAIJ_Private(Mat mat)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#define CHUNKSIZE  10

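/*
   The two macros below insert a single scalar into the local A (SeqSBAIJ)
   or B (SeqBAIJ) part: a binary search narrows the block row to a window
   of at most three block columns and a linear scan finishes it; if the
   block column is missing and insertions are allowed, the i/j/a arrays are
   regrown in a single malloc (hence singlemalloc) with CHUNKSIZE spare
   block slots for the current row, and trailing entries are shifted up.
*/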
#define  MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int       new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        MatScalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(a->mbs+1)*sizeof(int); \
        ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j = (int*)(new_a + bs2*new_nz); \
        new_i = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<brow+1; ii++) {new_i[ii] = ai[ii];} \
        for (ii=brow+1; ii<a->mbs+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        ierr = PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
        /* free up old matrix storage */ \
        ierr = PetscFree(a->a);CHKERRQ(ierr);  \
        if (!a->singlemalloc) { \
          ierr = PetscFree(a->i);CHKERRQ(ierr); \
          ierr = PetscFree(a->j);CHKERRQ(ierr); \
        } \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = PETSC_TRUE; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
        a->s_maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->s_nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
      } \
      if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr); }  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
#ifndef MatSetValues_SeqSBAIJ_B_Private
#define  MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv) \
{ \
    brow = row/bs;  \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; \
      ridx = row % bs; cidx = col % bs; \
      low = 0; high = nrow; \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int       new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        MatScalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(b->mbs+1)*sizeof(int); \
        ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j = (int*)(new_a + bs2*new_nz); \
        new_i = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<brow+1; ii++) {new_i[ii] = bi[ii];} \
        for (ii=brow+1; ii<b->mbs+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
        len  = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        ierr = PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
        /* free up old matrix storage */ \
        ierr = PetscFree(b->a);CHKERRQ(ierr);  \
        if (!b->singlemalloc) { \
          ierr = PetscFree(b->i);CHKERRQ(ierr); \
          ierr = PetscFree(b->j);CHKERRQ(ierr); \
        } \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = PETSC_TRUE; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
      } \
      if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr);}  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
#endif

#if defined(PETSC_USE_MAT_SINGLE)
#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPISBAIJ"
int MatSetValues_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
{
  Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
  int          ierr,i,N = m*n;
  MatScalar    *vsingle;

  PetscFunctionBegin;
  if (N > b->setvalueslen) {
    if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
    ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
    b->setvalueslen  = N;
  }
  vsingle = b->setvaluescopy;

  for (i=0; i<N; i++) {
    vsingle[i] = v[i];
  }
  ierr = MatSetValues_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
int MatSetValuesBlocked_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
{
  Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
  int          ierr,i,N = m*n*b->bs2;
  MatScalar    *vsingle;

  PetscFunctionBegin;
  if (N > b->setvalueslen) {
    if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
    ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
    b->setvalueslen  = N;
  }
  vsingle = b->setvaluescopy;
  for (i=0; i<N; i++) {
    vsingle[i] = v[i];
  }
  ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPISBAIJ_HT"
int MatSetValues_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
{
  Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
  int          ierr,i,N = m*n;
  MatScalar    *vsingle;

  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT"
int MatSetValuesBlocked_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
{
  Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
  int          ierr,i,N = m*n*b->bs2;
  MatScalar    *vsingle;

  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}
#endif

/* Only add/insert a(i,j) with i<=j (blocks).
   Any a(i,j) with i>j input by the user is ignored.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPISBAIJ_MatScalar"
int MatSetValues_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  MatScalar    value;
  PetscTruth   roworiented = baij->roworiented;
  int          ierr,i,j,row,col;
  int          rstart_orig=baij->rstart_bs;
  int          rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int          cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat          A = baij->A;
  Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(A)->data;
  int          *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  MatScalar    *aa=a->a;

  Mat          B = baij->B;
  Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(B)->data;
  int          *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  MatScalar    *ba=b->a;

  int          *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int          low,high,t,ridx,cidx,bs2=a->bs2;
  MatScalar    *ap,*bap;

  /* for stash */
  int          n_loc, *in_loc=0;
  MatScalar    *v_loc=0;

  PetscFunctionBegin;

  if (!baij->donotstash) {
    ierr = PetscMalloc(n*sizeof(int),&in_loc);CHKERRQ(ierr);
    ierr = PetscMalloc(n*sizeof(MatScalar),&v_loc);CHKERRQ(ierr);
  }

  for (i=0; i<m; i++) {
    if (im[i] < 0) continue;
#if defined(PETSC_USE_BOPT_g)
    if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) { /* this processor entry */
      row = im[i] - rstart_orig;              /* local row index */
      for (j=0; j<n; j++) {
        if (im[i]/bs > in[j]/bs) continue;    /* ignore lower triangular blocks */
        if (in[j] >= cstart_orig && in[j] < cend_orig){  /* diag entry (A) */
          col = in[j] - cstart_orig;          /* local col index */
          brow = row/bs; bcol = col/bs;
          if (brow > bcol) continue;  /* ignore lower triangular blocks of A */
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        } else if (in[j] < 0) continue;
#if defined(PETSC_USE_BOPT_g)
        else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Col too large");}
#endif
        else {  /* off-diag entry (B) */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
            }
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(baij->colmap,in[j]/bs + 1,&col);CHKERRQ(ierr);
            col  = col - 1;
#else
            col = baij->colmap[in[j]/bs] - 1;
#endif
            if (col < 0 && !((Mat_SeqSBAIJ*)(baij->A->data))->nonew) {
              ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqSBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ*)(B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            } else col += in[j]%bs;
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {  /* off processor entry */
      if (!baij->donotstash) {
        n_loc = 0;
        for (j=0; j<n; j++){
          if (im[i]/bs > in[j]/bs) continue; /* ignore lower triangular blocks */
          in_loc[n_loc] = in[j];
          if (roworiented) {
            v_loc[n_loc] = v[i*n+j];
          } else {
            v_loc[n_loc] = v[j*m+i];
          }
          n_loc++;
        }
        ierr = MatStashValuesRow_Private(&mat->stash,im[i],n_loc,in_loc,v_loc);CHKERRQ(ierr);
      }
    }
  }

  if (!baij->donotstash) {
    ierr = PetscFree(in_loc);CHKERRQ(ierr);
    ierr = PetscFree(v_loc);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_MatScalar"
int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#define HASH_KEY 0.6180339887
#define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
/* #define HASH(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
/* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
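/*
   HASH() is multiplicative (Fibonacci) hashing: HASH_KEY is the golden-ratio
   conjugate, and the fractional part of key*HASH_KEY is scaled to the table
   size.  For example, HASH(100,42,tmp) sets tmp = 42*0.6180339887 = 25.9574...
   and returns (int)(100*0.9574...) = 95.
*/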
#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPISBAIJ_HT_MatScalar"
int MatSetValues_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT_MatScalar"
int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatGetValues_MPISBAIJ"
int MatGetValues_MPISBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,PetscScalar *v)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  int          bs=baij->bs,ierr,i,j,bsrstart = baij->rstart*bs,bsrend = baij->rend*bs;
  int          bscstart = baij->cstart*bs,bscend = baij->cend*bs,row,col,data;

  PetscFunctionBegin;
  for (i=0; i<m; i++) {
    if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
    if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
    if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
      row = idxm[i] - bsrstart;
      for (j=0; j<n; j++) {
        if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
        if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
        if (idxn[j] >= bscstart && idxn[j] < bscend){
          col = idxn[j] - bscstart;
          ierr = MatGetValues_SeqSBAIJ(baij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
        } else {
          if (!baij->colmap) {
            ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
          }
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(baij->colmap,idxn[j]/bs+1,&data);CHKERRQ(ierr);
          data--;
#else
          data = baij->colmap[idxn[j]/bs]-1;
#endif
          if ((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
          else {
            col  = data + idxn[j]%bs;
            ierr = MatGetValues_SeqSBAIJ(baij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
          }
        }
      }
    } else {
      SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatNorm_MPISBAIJ"
int MatNorm_MPISBAIJ(Mat mat,NormType type,PetscReal *norm)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  /* Mat_SeqSBAIJ *amat = (Mat_SeqSBAIJ*)baij->A->data; */
  /* Mat_SeqBAIJ  *bmat = (Mat_SeqBAIJ*)baij->B->data; */
  int        ierr;
  PetscReal  sum[2],*lnorm2;

  PetscFunctionBegin;
  if (baij->size == 1) {
    ierr = MatNorm(baij->A,type,norm);CHKERRQ(ierr);
  } else {
    if (type == NORM_FROBENIUS) {
      ierr = PetscMalloc(2*sizeof(PetscReal),&lnorm2);CHKERRQ(ierr);
      ierr = MatNorm(baij->A,type,lnorm2);CHKERRQ(ierr);
      *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2++;            /* square of norm(A) */
      ierr = MatNorm(baij->B,type,lnorm2);CHKERRQ(ierr);
      *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2--;            /* square of norm(B) */
      /*
      ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
      PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], lnorm2=%g, %g\n",rank,lnorm2[0],lnorm2[1]);
      */
      ierr = MPI_Allreduce(lnorm2,sum,2,MPIU_REAL,MPI_SUM,mat->comm);CHKERRQ(ierr);
      /*
      PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], sum=%g, %g\n",rank,sum[0],sum[1]);
      PetscSynchronizedFlush(PETSC_COMM_WORLD); */

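      /* the off-diagonal block B is counted twice: its entries stand for
         both b_ij and b_ji of the symmetric matrix, hence the 2*sum[1] */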
      *norm = sqrt(sum[0] + 2*sum[1]);
      ierr = PetscFree(lnorm2);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
    }
  }
  PetscFunctionReturn(0);
}

/*
  Creates the hash table, and sets the table.
  This table is created only once.
  If new entries need to be added to the matrix
  then the hash table has to be destroyed and
  recreated.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateHashTable_MPISBAIJ_Private"
int MatCreateHashTable_MPISBAIJ_Private(Mat mat,PetscReal factor)
{
  PetscFunctionBegin;
  SETERRQ(1,"Function not yet written for SBAIJ format");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyBegin_MPISBAIJ"
int MatAssemblyBegin_MPISBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  int          ierr,nstash,reallocs;
  InsertMode   addv;

  PetscFunctionBegin;
  if (baij->donotstash) {
    PetscFunctionReturn(0);
  }

  /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  ierr = MatStashScatterBegin_Private(&mat->stash,baij->rowners_bs);CHKERRQ(ierr);
  ierr = MatStashScatterBegin_Private(&mat->bstash,baij->rowners);CHKERRQ(ierr);
  ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
  PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
  ierr = MatStashGetInfo_Private(&mat->bstash,&nstash,&reallocs);CHKERRQ(ierr);
  PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Block-Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_MPISBAIJ"
int MatAssemblyEnd_MPISBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPISBAIJ *baij=(Mat_MPISBAIJ*)mat->data;
  Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)baij->A->data;
  Mat_SeqBAIJ  *b=(Mat_SeqBAIJ*)baij->B->data;
  int          i,j,rstart,ncols,n,ierr,flg,bs2=baij->bs2;
  int          *row,*col,other_disassembled;
  PetscTruth   r1,r2,r3;
  MatScalar    *val;
  InsertMode   addv = mat->insertmode;
#if defined(PETSC_HAVE_SPOOLES)
  PetscTruth   flag;
#endif

  PetscFunctionBegin;

  if (!baij->donotstash) {
    while (1) {
      ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
      /*
      PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d]: in AssemblyEnd, stash, flg=%d\n",rank,flg);
      PetscSynchronizedFlush(PETSC_COMM_WORLD);
      */
      if (!flg) break;

      for (i=0; i<n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
        if (j < n) ncols = j-i;
        else       ncols = n-i;
        /* Now assemble all these values with a single function call */
        ierr = MatSetValues_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
        i = j;
      }
    }
    ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
    /* Now process the block-stash. Since the values are stashed column-oriented,
       set the roworiented flag to column oriented, and after MatSetValues()
       restore the original flags */
    r1 = baij->roworiented;
    r2 = a->roworiented;
    r3 = b->roworiented;
    baij->roworiented = PETSC_FALSE;
    a->roworiented    = PETSC_FALSE;
    b->roworiented    = PETSC_FALSE;
    while (1) {
      ierr = MatStashScatterGetMesg_Private(&mat->bstash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
      if (!flg) break;

      for (i=0; i<n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
        if (j < n) ncols = j-i;
        else       ncols = n-i;
        ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i*bs2,addv);CHKERRQ(ierr);
        i = j;
      }
    }
    ierr = MatStashScatterEnd_Private(&mat->bstash);CHKERRQ(ierr);
    baij->roworiented = r1;
    a->roworiented    = r2;
    b->roworiented    = r3;
  }

  ierr = MatAssemblyBegin(baij->A,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode);CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble. */
  /*
     if the nonzero structure of submatrix B cannot change then we know that
     no processor disassembled, thus we can skip this stuff; the integer
     product of was_assembled over all ranks is zero exactly when some
     rank has disassembled
  */
  if (!((Mat_SeqBAIJ*)baij->B->data)->nonew)  {
    ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
    if (mat->was_assembled && !other_disassembled) {
      ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
    }
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPISBAIJ(mat);CHKERRQ(ierr); /* setup Mvctx and sMvctx */
  }
  ierr = MatAssemblyBegin(baij->B,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode);CHKERRQ(ierr);

#if defined(PETSC_USE_BOPT_g)
  if (baij->ht && mode == MAT_FINAL_ASSEMBLY) {
    PetscLogInfo(0,"MatAssemblyEnd_MPISBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",((PetscReal)baij->ht_total_ct)/baij->ht_insert_ct);
    baij->ht_total_ct  = 0;
    baij->ht_insert_ct = 0;
  }
#endif
  if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatCreateHashTable_MPISBAIJ_Private(mat,baij->ht_fact);CHKERRQ(ierr);
    mat->ops->setvalues        = MatSetValues_MPISBAIJ_HT;
    mat->ops->setvaluesblocked = MatSetValuesBlocked_MPISBAIJ_HT;
  }

  if (baij->rowvalues) {
    ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);
    baij->rowvalues = 0;
  }

#if defined(PETSC_HAVE_SPOOLES)
  ierr = PetscOptionsHasName(PETSC_NULL,"-mat_sbaij_spooles",&flag);CHKERRQ(ierr);
  if (flag) { ierr = MatUseSpooles_MPISBAIJ(mat);CHKERRQ(ierr); }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatView_MPISBAIJ_ASCIIorDraworSocket"
static int MatView_MPISBAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
{
  Mat_MPISBAIJ      *baij = (Mat_MPISBAIJ*)mat->data;
  int               ierr,bs = baij->bs,size = baij->size,rank = baij->rank;
  PetscTruth        isascii,isdraw;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
  if (isascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO_LONG) {
      MatInfo info;
      ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,mat->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
              baij->bs,(int)info.memory);CHKERRQ(ierr);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = VecScatterView(baij->Mvctx,viewer);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      ierr = PetscViewerASCIIPrintf(viewer,"  block size is %d\n",bs);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
  }

  if (isdraw) {
    PetscDraw  draw;
    PetscTruth isnull;
    ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
    ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (size == 1) {
    ierr = PetscObjectSetName((PetscObject)baij->A,mat->name);CHKERRQ(ierr);
    ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
  } else {
    /* assemble the entire matrix onto first processor. */
    Mat          A;
    Mat_SeqSBAIJ *Aloc;
    Mat_SeqBAIJ  *Bloc;
    int          M = mat->M,N = mat->N,*ai,*aj,col,i,j,k,*rvals,mbs = baij->mbs;
    MatScalar    *a;

    if (!rank) {
      ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
    } else {
      ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
    }
    PetscLogObjectParent(mat,A);

    /* copy over the A part */
    Aloc  = (Mat_SeqSBAIJ*)baij->A->data;
    ai    = Aloc->i; aj = Aloc->j; a = Aloc->a;
    ierr  = PetscMalloc(bs*sizeof(int),&rvals);CHKERRQ(ierr);

    for (i=0; i<mbs; i++) {
      rvals[0] = bs*(baij->rstart + i);
      for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
      for (j=ai[i]; j<ai[i+1]; j++) {
        col = (baij->cstart+aj[j])*bs;
        for (k=0; k<bs; k++) {
          ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
          col++; a += bs;
        }
      }
    }
    /* copy over the B part */
    Bloc = (Mat_SeqBAIJ*)baij->B->data;
    ai = Bloc->i; aj = Bloc->j; a = Bloc->a;
    for (i=0; i<mbs; i++) {
      rvals[0] = bs*(baij->rstart + i);
      for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
      for (j=ai[i]; j<ai[i+1]; j++) {
        col = baij->garray[aj[j]]*bs;
        for (k=0; k<bs; k++) {
          ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
          col++; a += bs;
        }
      }
    }
    ierr = PetscFree(rvals);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    /*
       Everyone has to call to draw the matrix since the graphics waits are
       synchronized across all processors that share the PetscDraw object
    */
    ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
    if (!rank) {
      ierr = PetscObjectSetName((PetscObject)((Mat_MPISBAIJ*)(A->data))->A,mat->name);CHKERRQ(ierr);
      ierr = MatView(((Mat_MPISBAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
    ierr = MatDestroy(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatView_MPISBAIJ"
int MatView_MPISBAIJ(Mat mat,PetscViewer viewer)
{
  int        ierr;
  PetscTruth isascii,isdraw,issocket,isbinary;

  PetscFunctionBegin;
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_BINARY,&isbinary);CHKERRQ(ierr);
  if (isascii || isdraw || issocket || isbinary) {
    ierr = MatView_MPISBAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
  } else {
    SETERRQ1(1,"Viewer type %s not supported by MPISBAIJ matrices",((PetscObject)viewer)->type_name);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPISBAIJ"
int MatDestroy_MPISBAIJ(Mat mat)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  int          ierr;

  PetscFunctionBegin;
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)mat,"Rows=%d,Cols=%d",mat->M,mat->N);
#endif
  ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
  ierr = MatStashDestroy_Private(&mat->bstash);CHKERRQ(ierr);
  ierr = PetscFree(baij->rowners);CHKERRQ(ierr);
  ierr = MatDestroy(baij->A);CHKERRQ(ierr);
  ierr = MatDestroy(baij->B);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  if (baij->colmap) {ierr = PetscTableDelete(baij->colmap);CHKERRQ(ierr);}
#else
  if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
#endif
  if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
  if (baij->lvec)   {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
  if (baij->Mvctx)  {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
  if (baij->slvec0) {
    ierr = VecDestroy(baij->slvec0);CHKERRQ(ierr);
    ierr = VecDestroy(baij->slvec0b);CHKERRQ(ierr);
  }
  if (baij->slvec1) {
    ierr = VecDestroy(baij->slvec1);CHKERRQ(ierr);
    ierr = VecDestroy(baij->slvec1a);CHKERRQ(ierr);
    ierr = VecDestroy(baij->slvec1b);CHKERRQ(ierr);
  }
  if (baij->sMvctx) {ierr = VecScatterDestroy(baij->sMvctx);CHKERRQ(ierr);}
  if (baij->rowvalues) {ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);}
  if (baij->barray) {ierr = PetscFree(baij->barray);CHKERRQ(ierr);}
  if (baij->hd) {ierr = PetscFree(baij->hd);CHKERRQ(ierr);}
#if defined(PETSC_USE_MAT_SINGLE)
  if (baij->setvaluescopy) {ierr = PetscFree(baij->setvaluescopy);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(baij);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPISBAIJ"
int MatMult_MPISBAIJ(Mat A,Vec xx,Vec yy)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr,nt,mbs=a->mbs,bs=a->bs;
  PetscScalar  *x,*from,zero=0.0;

  PetscFunctionBegin;
  /*
  PetscSynchronizedPrintf(PETSC_COMM_WORLD," _1comm is called ...\n");
  PetscSynchronizedFlush(PETSC_COMM_WORLD);
  */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
  }
  ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
  if (nt != A->m) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and yy");
  }

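  /* y = (A + B + B^T)x where A is the diagonal block and B the upper
     off-diagonal block: slvec0 packs [x, B^T x], one ADD_VALUES scatter
     both sends the B^T x pieces to their owning rows and gathers the
     ghost values of x into slvec1b, then B is applied to the gathered
     ghosts and added to A x */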
  /* diagonal part */
  ierr = (*a->A->ops->mult)(a->A,xx,a->slvec1a);CHKERRQ(ierr);
  ierr = VecSet(&zero,a->slvec1b);CHKERRQ(ierr);

  /* subdiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->slvec0b);CHKERRQ(ierr);

  /* copy x into the vec slvec0 */
  ierr = VecGetArray(a->slvec0,&from);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscMemcpy(from,x,bs*mbs*sizeof(MatScalar));CHKERRQ(ierr);
  ierr = VecRestoreArray(a->slvec0,&from);CHKERRQ(ierr);

  ierr = VecScatterBegin(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);

  /* superdiagonal part */
  ierr = (*a->B->ops->multadd)(a->B,a->slvec1b,a->slvec1a,yy);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPISBAIJ_2comm"
int MatMult_MPISBAIJ_2comm(Mat A,Vec xx,Vec yy)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr,nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
  }
  ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
  if (nt != A->m) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and yy");
  }

  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  /* do diagonal part */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* do superdiagonal part */
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  /* do subdiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_MPISBAIJ"
int MatMultAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr,mbs=a->mbs,bs=a->bs;
  PetscScalar  *x,*from,zero=0.0;

  PetscFunctionBegin;
  /*
  PetscSynchronizedPrintf(PETSC_COMM_WORLD," MatMultAdd is called ...\n");
  PetscSynchronizedFlush(PETSC_COMM_WORLD);
  */
  /* diagonal part */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,a->slvec1a);CHKERRQ(ierr);
  ierr = VecSet(&zero,a->slvec1b);CHKERRQ(ierr);

  /* subdiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->slvec0b);CHKERRQ(ierr);

  /* copy x into the vec slvec0 */
  ierr = VecGetArray(a->slvec0,&from);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscMemcpy(from,x,bs*mbs*sizeof(MatScalar));CHKERRQ(ierr);
  ierr = VecRestoreArray(a->slvec0,&from);CHKERRQ(ierr);

  ierr = VecScatterBegin(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);

  /* superdiagonal part */
  ierr = (*a->B->ops->multadd)(a->B,a->slvec1b,a->slvec1a,zz);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_MPISBAIJ_2comm"
int MatMultAdd_MPISBAIJ_2comm(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  /* do diagonal part */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  /* do superdiagonal part */
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);

  /* do subdiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPISBAIJ"
int MatMultTranspose_MPISBAIJ(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  SETERRQ(1,"Matrix is symmetric. Call MatMult().");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_MPISBAIJ"
int MatMultTransposeAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
  /* PetscFunctionReturn(0); */
}

/*
  This only works correctly for square matrices where the subblock A->A is the
  diagonal block
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetDiagonal_MPISBAIJ"
int MatGetDiagonal_MPISBAIJ(Mat A,Vec v)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr;

  PetscFunctionBegin;
  /* if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block"); */
  ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatScale_MPISBAIJ"
int MatScale_MPISBAIJ(PetscScalar *aa,Mat A)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr;

  PetscFunctionBegin;
  ierr = MatScale(aa,a->A);CHKERRQ(ierr);
  ierr = MatScale(aa,a->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetRow_MPISBAIJ"
int MatGetRow_MPISBAIJ(Mat matin,int row,int *nz,int **idx,PetscScalar **v)
{
  Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
  PetscScalar    *vworkA,*vworkB,**pvA,**pvB,*v_p;
  int            bs = mat->bs,bs2 = mat->bs2,i,ierr,*cworkA,*cworkB,**pcA,**pcB;
  int            nztot,nzA,nzB,lrow,brstart = mat->rstart*bs,brend = mat->rend*bs;
  int            *cmap,*idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqSBAIJ *Aa = (Mat_SeqSBAIJ*)mat->A->data;
    Mat_SeqBAIJ  *Ba = (Mat_SeqBAIJ*)mat->B->data;
    int          max = 1,mbs = mat->mbs,tmp;
    for (i=0; i<mbs; i++) {
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i]; /* row length */
      if (max < tmp) { max = tmp; }
    }
    ierr = PetscMalloc(max*bs2*(sizeof(int)+sizeof(PetscScalar)),&mat->rowvalues);CHKERRQ(ierr);
    mat->rowindices = (int*)(mat->rowvalues + max*bs2);
  }

  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,"Only local rows");
  lrow = row - brstart;  /* local row index */

  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for (i=0; i<nzB; i++) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
        for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          for (i=0; i<imark; i++) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          for (i=0; i<nzB; i++) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
            else break;
          }
          imark = i;
        }
        for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart*bs + cworkA[i];
        for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatRestoreRow_MPISBAIJ"
int MatRestoreRow_MPISBAIJ(Mat mat,int row,int *nz,int **idx,PetscScalar **v)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;

  PetscFunctionBegin;
  if (baij->getrowactive == PETSC_FALSE) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
  }
  baij->getrowactive = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetBlockSize_MPISBAIJ"
int MatGetBlockSize_MPISBAIJ(Mat mat,int *bs)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;

  PetscFunctionBegin;
  *bs = baij->bs;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatZeroEntries_MPISBAIJ"
int MatZeroEntries_MPISBAIJ(Mat A)
{
  Mat_MPISBAIJ *l = (Mat_MPISBAIJ*)A->data;
  int          ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetInfo_MPISBAIJ"
int MatGetInfo_MPISBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)matin->data;
  Mat          A = a->A,B = a->B;
  int          ierr;
  PetscReal    isend[5],irecv[5];

  PetscFunctionBegin;
  info->block_size     = (PetscReal)a->bs;
  ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
  isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
  isend[3] = info->memory;  isend[4] = info->mallocs;
  ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
  isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
  isend[3] += info->memory;  isend[4] += info->mallocs;
  if (flag == MAT_LOCAL) {
    info->nz_used      = isend[0];
    info->nz_allocated = isend[1];
    info->nz_unneeded  = isend[2];
    info->memory       = isend[3];
    info->mallocs      = isend[4];
  } else if (flag == MAT_GLOBAL_MAX) {
    ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_MAX,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else if (flag == MAT_GLOBAL_SUM) {
    ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_SUM,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else {
    SETERRQ1(1,"Unknown MatInfoType argument %d",flag);
  }
  info->rows_global       = (PetscReal)A->M;
  info->columns_global    = (PetscReal)A->N;
  info->rows_local        = (PetscReal)A->m;
  info->columns_local     = (PetscReal)A->n;
  info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
  info->fill_ratio_needed = 0;
  info->factor_mallocs    = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetOption_MPISBAIJ"
int MatSetOption_MPISBAIJ(Mat A,MatOption op)
{
  Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
  int          ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_NO_NEW_NONZERO_LOCATIONS:
  case MAT_YES_NEW_NONZERO_LOCATIONS:
  case MAT_COLUMNS_UNSORTED:
  case MAT_COLUMNS_SORTED:
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
  case MAT_KEEP_ZEROED_ROWS:
  case MAT_NEW_NONZERO_LOCATION_ERR:
    ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
    ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
    break;
  case MAT_ROW_ORIENTED:
    a->roworiented = PETSC_TRUE;
    ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
    ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
    break;
  case MAT_ROWS_SORTED:
  case MAT_ROWS_UNSORTED:
  case MAT_YES_NEW_DIAGONALS:
  case MAT_USE_SINGLE_PRECISION_SOLVES:
    PetscLogInfo(A,"Info:MatSetOption_MPISBAIJ:Option ignored\n");
    break;
  case MAT_COLUMN_ORIENTED:
    a->roworiented = PETSC_FALSE;
    ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
    ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
    break;
  case MAT_IGNORE_OFF_PROC_ENTRIES:
    a->donotstash = PETSC_TRUE;
    break;
  case MAT_NO_NEW_DIAGONALS:
    SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
  case MAT_USE_HASH_TABLE:
    a->ht_flag = PETSC_TRUE;
    break;
  default:
    SETERRQ(PETSC_ERR_SUP,"unknown option");
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatTranspose_MPISBAIJ"
int MatTranspose_MPISBAIJ(Mat A,Mat *matout)
{
  PetscFunctionBegin;
  SETERRQ(1,"Matrix is symmetric. MatTranspose() should not be called");
  /* PetscFunctionReturn(0); */
}

#undef __FUNCT__
#define __FUNCT__ "MatDiagonalScale_MPISBAIJ"
int MatDiagonalScale_MPISBAIJ(Mat mat,Vec ll,Vec rr)
{
  Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
  Mat          a = baij->A,b = baij->B;
  int          ierr,s1,s2,s3;

  PetscFunctionBegin;
  if (ll != rr) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be the same\n");
  }
  ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
  if (rr) {
    ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
    if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
    /* Overlap communication with computation. */
    ierr = VecScatterBegin(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
    /*} if (ll) { */
    ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
    if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
    ierr = (*b->ops->diagonalscale)(b,ll,PETSC_NULL);CHKERRQ(ierr);
    /* } */
    /* scale the diagonal block */
    ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);

    /* if (rr) { */
    /* Do a scatter end and then right scale the off-diagonal block */
    ierr = VecScatterEnd(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
    ierr = (*b->ops->diagonalscale)(b,PETSC_NULL,baij->lvec);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}
1320 
1321 #undef __FUNCT__
1322 #define __FUNCT__ "MatZeroRows_MPISBAIJ"
1323 int MatZeroRows_MPISBAIJ(Mat A,IS is,PetscScalar *diag)
1324 {
1325   Mat_MPISBAIJ   *l = (Mat_MPISBAIJ*)A->data;
1326   int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
1327   int            *procs,*nprocs,j,idx,nsends,*work,row;
1328   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1329   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1330   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1331   MPI_Comm       comm = A->comm;
1332   MPI_Request    *send_waits,*recv_waits;
1333   MPI_Status     recv_status,*send_status;
1334   IS             istmp;
1335   PetscTruth     found;
1336 
1337   PetscFunctionBegin;
1338   ierr = ISGetSize(is,&N);CHKERRQ(ierr);
1339   ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);
1340 
1341   /*  first count number of contributors to each processor */
1342   ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
1343   ierr  = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
1344   procs = nprocs + size;
1345   ierr  = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note*/
1346   for (i=0; i<N; i++) {
1347     idx   = rows[i];
1348     found = PETSC_FALSE;
1349     for (j=0; j<size; j++) {
1350       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1351         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1352       }
1353     }
1354     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
1355   }
1356   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}
1357 
1358   /* inform other processors of number of messages and max length*/
1359   ierr   = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
1360   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
1361   nmax   = work[rank];
1362   nrecvs = work[size+rank];
1363   ierr   = PetscFree(work);CHKERRQ(ierr);
1364 
1365   /* post receives:   */
1366   ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
1367   ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
1368   for (i=0; i<nrecvs; i++) {
1369     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1370   }
1371 
1372   /* do sends:
1373      1) starts[i] gives the starting index in svalues for stuff going to
1374      the ith processor
1375   */
1376   ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
1377   ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
1378   ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
1379   starts[0]  = 0;
1380   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1381   for (i=0; i<N; i++) {
1382     svalues[starts[owner[i]]++] = rows[i];
1383   }
1384   ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);
1385 
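  /* packing above advanced the starts[] cursors, so rebuild them before posting the sends */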
1386   starts[0] = 0;
1387   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1388   count = 0;
1389   for (i=0; i<size; i++) {
1390     if (procs[i]) {
1391       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1392     }
1393   }
1394   ierr = PetscFree(starts);CHKERRQ(ierr);
1395 
1396   base = owners[rank]*bs;
1397 
1398   /*  wait on receives */
1399   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
1400   source = lens + nrecvs;
1401   count  = nrecvs; slen = 0;
1402   while (count) {
1403     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1404     /* unpack receives into our local space */
1405     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1406     source[imdex]  = recv_status.MPI_SOURCE;
1407     lens[imdex]    = n;
1408     slen          += n;
1409     count--;
1410   }
1411   ierr = PetscFree(recv_waits);CHKERRQ(ierr);
1412 
1413   /* move the data into the send scatter */
1414   ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
1415   count = 0;
1416   for (i=0; i<nrecvs; i++) {
1417     values = rvalues + i*nmax;
1418     for (j=0; j<lens[i]; j++) {
1419       lrows[count++] = values[j] - base;
1420     }
1421   }
1422   ierr = PetscFree(rvalues);CHKERRQ(ierr);
1423   ierr = PetscFree(lens);CHKERRQ(ierr);
1424   ierr = PetscFree(owner);CHKERRQ(ierr);
1425   ierr = PetscFree(nprocs);CHKERRQ(ierr);
1426 
1427   /* actually zap the local rows */
1428   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1429   PetscLogObjectParent(A,istmp);
1430 
1431   /*
1432         Zero the required rows. If the "diagonal block" of the matrix
1433      is square and the user wishes to set the diagonal, we use separate
1434      code so that MatSetValues() is not called once for each diagonal entry,
1435      which would allocate new memory with many small mallocs and slow things down.
1436 
1437        Contributed by: Matthew Knepley
1438   */
1439   /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
1440   ierr = MatZeroRows_SeqBAIJ(l->B,istmp,0);CHKERRQ(ierr);
1441   if (diag && (l->A->M == l->A->N)) {
1442     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,diag);CHKERRQ(ierr);
1443   } else if (diag) {
1444     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1445     if (((Mat_SeqSBAIJ*)l->A->data)->nonew) {
1446       SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
1447 MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
1448     }
1449     for (i=0; i<slen; i++) {
1450       row  = lrows[i] + rstart_bs;
1451       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
1452     }
1453     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1454     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1455   } else {
1456     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1457   }
1458 
1459   ierr = ISDestroy(istmp);CHKERRQ(ierr);
1460   ierr = PetscFree(lrows);CHKERRQ(ierr);
1461 
1462   /* wait on sends */
1463   if (nsends) {
1464     ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
1465     ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1466     ierr = PetscFree(send_status);CHKERRQ(ierr);
1467   }
1468   ierr = PetscFree(send_waits);CHKERRQ(ierr);
1469   ierr = PetscFree(svalues);CHKERRQ(ierr);
1470 
1471   PetscFunctionReturn(0);
1472 }
1473 
1474 #undef __FUNCT__
1475 #define __FUNCT__ "MatPrintHelp_MPISBAIJ"
1476 int MatPrintHelp_MPISBAIJ(Mat A)
1477 {
1478   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1479   MPI_Comm    comm = A->comm;
1480   static int  called = 0;
1481   int         ierr;
1482 
1483   PetscFunctionBegin;
1484   if (!a->rank) {
1485     ierr = MatPrintHelp_SeqSBAIJ(a->A);CHKERRQ(ierr);
1486   }
1487   if (called) {PetscFunctionReturn(0);} else called = 1;
1488   ierr = (*PetscHelpPrintf)(comm," Options for MATMPISBAIJ matrix format (the defaults):\n");CHKERRQ(ierr);
1489   ierr = (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");CHKERRQ(ierr);
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 #undef __FUNCT__
1494 #define __FUNCT__ "MatSetUnfactored_MPISBAIJ"
1495 int MatSetUnfactored_MPISBAIJ(Mat A)
1496 {
1497   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1498   int         ierr;
1499 
1500   PetscFunctionBegin;
1501   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 static int MatDuplicate_MPISBAIJ(Mat,MatDuplicateOption,Mat *);
1506 
1507 #undef __FUNCT__
1508 #define __FUNCT__ "MatEqual_MPISBAIJ"
1509 int MatEqual_MPISBAIJ(Mat A,Mat B,PetscTruth *flag)
1510 {
1511   Mat_MPISBAIJ *matB = (Mat_MPISBAIJ*)B->data,*matA = (Mat_MPISBAIJ*)A->data;
1512   Mat         a,b,c,d;
1513   PetscTruth  flg;
1514   int         ierr;
1515 
1516   PetscFunctionBegin;
1517   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg);CHKERRQ(ierr);
1518   if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1519   a = matA->A; b = matA->B;
1520   c = matB->A; d = matB->B;
1521 
1522   ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
1523   if (flg == PETSC_TRUE) {
1524     ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
1525   }
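  /* combine the local comparisons so that every process returns the same answer */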
1526   ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 #undef __FUNCT__
1531 #define __FUNCT__ "MatSetUpPreallocation_MPISBAIJ"
1532 int MatSetUpPreallocation_MPISBAIJ(Mat A)
1533 {
1534   int        ierr;
1535 
1536   PetscFunctionBegin;
1537   ierr = MatMPISBAIJSetPreallocation(A,1,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 /* -------------------------------------------------------------------*/
1541 static struct _MatOps MatOps_Values = {
1542   MatSetValues_MPISBAIJ,
1543   MatGetRow_MPISBAIJ,
1544   MatRestoreRow_MPISBAIJ,
1545   MatMult_MPISBAIJ,
1546   MatMultAdd_MPISBAIJ,
1547   MatMultTranspose_MPISBAIJ,
1548   MatMultTransposeAdd_MPISBAIJ,
1549   0,
1550   0,
1551   0,
1552   0,
1553   0,
1554   0,
1555   MatRelax_MPISBAIJ,
1556   MatTranspose_MPISBAIJ,
1557   MatGetInfo_MPISBAIJ,
1558   MatEqual_MPISBAIJ,
1559   MatGetDiagonal_MPISBAIJ,
1560   MatDiagonalScale_MPISBAIJ,
1561   MatNorm_MPISBAIJ,
1562   MatAssemblyBegin_MPISBAIJ,
1563   MatAssemblyEnd_MPISBAIJ,
1564   0,
1565   MatSetOption_MPISBAIJ,
1566   MatZeroEntries_MPISBAIJ,
1567   MatZeroRows_MPISBAIJ,
1568   0,
1569   0,
1570   0,
1571   0,
1572   MatSetUpPreallocation_MPISBAIJ,
1573   0,
1574   0,
1575   0,
1576   0,
1577   MatDuplicate_MPISBAIJ,
1578   0,
1579   0,
1580   0,
1581   0,
1582   0,
1583   MatGetSubMatrices_MPISBAIJ,
1584   MatIncreaseOverlap_MPISBAIJ,
1585   MatGetValues_MPISBAIJ,
1586   0,
1587   MatPrintHelp_MPISBAIJ,
1588   MatScale_MPISBAIJ,
1589   0,
1590   0,
1591   0,
1592   MatGetBlockSize_MPISBAIJ,
1593   0,
1594   0,
1595   0,
1596   0,
1597   0,
1598   0,
1599   MatSetUnfactored_MPISBAIJ,
1600   0,
1601   MatSetValuesBlocked_MPISBAIJ,
1602   0,
1603   0,
1604   0,
1605   MatGetPetscMaps_Petsc,
1606   0,
1607   0,
1608   0,
1609   0,
1610   0,
1611   0,
1612   MatGetRowMax_MPISBAIJ};
1613 
1614 
1615 EXTERN_C_BEGIN
1616 #undef __FUNCT__
1617 #define __FUNCT__ "MatGetDiagonalBlock_MPISBAIJ"
1618 int MatGetDiagonalBlock_MPISBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1619 {
1620   PetscFunctionBegin;
1621   *a      = ((Mat_MPISBAIJ *)A->data)->A;
1622   *iscopy = PETSC_FALSE;
1623   PetscFunctionReturn(0);
1624 }
1625 EXTERN_C_END
1626 
1627 EXTERN_C_BEGIN
1628 #undef __FUNCT__
1629 #define __FUNCT__ "MatCreate_MPISBAIJ"
1630 int MatCreate_MPISBAIJ(Mat B)
1631 {
1632   Mat_MPISBAIJ *b;
1633   int          ierr;
1634   PetscTruth   flg;
1635 
1636   PetscFunctionBegin;
1637 
1638   ierr    = PetscNew(Mat_MPISBAIJ,&b);CHKERRQ(ierr);
1639   B->data = (void*)b;
1640   ierr    = PetscMemzero(b,sizeof(Mat_MPISBAIJ));CHKERRQ(ierr);
1641   ierr    = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1642 
1643   B->ops->destroy    = MatDestroy_MPISBAIJ;
1644   B->ops->view       = MatView_MPISBAIJ;
1645   B->mapping    = 0;
1646   B->factor     = 0;
1647   B->assembled  = PETSC_FALSE;
1648 
1649   B->insertmode = NOT_SET_VALUES;
1650   ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
1651   ierr = MPI_Comm_size(B->comm,&b->size);CHKERRQ(ierr);
1652 
1653   /* build local table of row and column ownerships */
1654   ierr          = PetscMalloc(3*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
1655   b->cowners    = b->rowners + b->size + 2;
1656   b->rowners_bs = b->cowners + b->size + 2;
1657   PetscLogObjectMemory(B,3*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
1658 
1659   /* build cache for off array entries formed */
1660   ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
1661   b->donotstash  = PETSC_FALSE;
1662   b->colmap      = PETSC_NULL;
1663   b->garray      = PETSC_NULL;
1664   b->roworiented = PETSC_TRUE;
1665 
1666 #if defined(PETSC_USE_MAT_SINGLE)
1667   /* stuff for MatSetValues_XXX in single precision */
1668   b->setvalueslen     = 0;
1669   b->setvaluescopy    = PETSC_NULL;
1670 #endif
1671 
1672   /* stuff used in block assembly */
1673   b->barray       = 0;
1674 
1675   /* stuff used for matrix vector multiply */
1676   b->lvec         = 0;
1677   b->Mvctx        = 0;
1678   b->slvec0       = 0;
1679   b->slvec0b      = 0;
1680   b->slvec1       = 0;
1681   b->slvec1a      = 0;
1682   b->slvec1b      = 0;
1683   b->sMvctx       = 0;
1684 
1685   /* stuff for MatGetRow() */
1686   b->rowindices   = 0;
1687   b->rowvalues    = 0;
1688   b->getrowactive = PETSC_FALSE;
1689 
1690   /* hash table stuff */
1691   b->ht           = 0;
1692   b->hd           = 0;
1693   b->ht_size      = 0;
1694   b->ht_flag      = PETSC_FALSE;
1695   b->ht_fact      = 0;
1696   b->ht_total_ct  = 0;
1697   b->ht_insert_ct = 0;
1698 
1699   ierr = PetscOptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg);CHKERRQ(ierr);
1700   if (flg) {
1701     PetscReal fact = 1.39;
1702     ierr = MatSetOption(B,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
1703     ierr = PetscOptionsGetReal(PETSC_NULL,"-mat_use_hash_table",&fact,PETSC_NULL);CHKERRQ(ierr);
1704     if (fact <= 1.0) fact = 1.39;
1705     ierr = MatMPIBAIJSetHashTableFactor(B,fact);CHKERRQ(ierr);
1706     PetscLogInfo(0,"MatCreate_MPISBAIJ:Hash table factor used %5.2f\n",fact);
1707   }
1708   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
1709                                      "MatStoreValues_MPISBAIJ",
1710                                      MatStoreValues_MPISBAIJ);CHKERRQ(ierr);
1711   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
1712                                      "MatRetrieveValues_MPISBAIJ",
1713                                      MatRetrieveValues_MPISBAIJ);CHKERRQ(ierr);
1714   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
1715                                      "MatGetDiagonalBlock_MPISBAIJ",
1716                                      MatGetDiagonalBlock_MPISBAIJ);CHKERRQ(ierr);
1717   PetscFunctionReturn(0);
1718 }
1719 EXTERN_C_END
1720 
1721 #undef __FUNCT__
1722 #define __FUNCT__ "MatMPISBAIJSetPreallocation"
1723 /*@C
1724    MatMPISBAIJSetPreallocation - For good matrix assembly performance
1725    the user should preallocate the matrix storage by setting the parameters
1726    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1727    performance can be increased by more than a factor of 50.
1728 
1729    Collective on Mat
1730 
1731    Input Parameters:
1732 +  A - the matrix
1733 .  bs   - size of block
1734 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1735            submatrix  (same for all local rows)
1736 .  d_nnz - array containing the number of block nonzeros in the various block rows
1737            in the diagonal portion of the local submatrix (possibly different for each block
1738            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1739 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1740            submatrix (same for all local rows).
1741 -  o_nnz - array containing the number of block nonzeros in the various block rows of the
1742            off-diagonal portion of the local submatrix (possibly different for
1743            each block row) or PETSC_NULL.
1744 
1745 
1746    Options Database Keys:
1747 .   -mat_no_unroll - uses code that does not unroll the loops in the
1748                      block calculations (much slower)
1749 .   -mat_block_size - size of the blocks to use
1750 
1751    Notes:
1752 
1753    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1754    then it must be used on all processors that share the object for that argument.
1755 
1756    Storage Information:
1757    For a square global matrix we define each processor's diagonal portion
1758    to be its local rows and the corresponding columns (a square submatrix);
1759    each processor's off-diagonal portion encompasses the remainder of the
1760    local matrix (a rectangular submatrix).
1761 
1762    The user can specify preallocated storage for the diagonal part of
1763    the local submatrix with either d_nz or d_nnz (not both).  Set
1764    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1765    memory allocation.  Likewise, specify preallocated storage for the
1766    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1767 
1768    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1769    the figure below we depict these three local rows and all columns (0-11).
1770 
1771 .vb
1772            0 1 2 3 4 5 6 7 8 9 10 11
1773           -------------------
1774    row 3  |  o o o d d d o o o o o o
1775    row 4  |  o o o d d d o o o o o o
1776    row 5  |  o o o d d d o o o o o o
1777           -------------------
1778 .ve
1779 
1780    Thus, any entries in the d locations are stored in the d (diagonal)
1781    submatrix, and any entries in the o locations are stored in the
1782    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1783    stored simply in the MATSEQBAIJ format for compressed row storage.
1784 
1785    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1786    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1787    In general, for PDE problems in which most nonzeros are near the diagonal,
1788    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1789    or you will get TERRIBLE performance; see the users' manual chapter on
1790    matrices.
1791 
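   Example:
   An illustrative sketch of typical usage; comm and the sizes m, n, M, N are
   placeholders for the caller's communicator and dimensions, and the fill
   estimates (block size 2, with 3 diagonal and 1 off-diagonal block nonzeros
   per block row) are hypothetical:

.vb
      Mat A;
      ierr = MatCreate(comm,m,n,M,N,&A);CHKERRQ(ierr);
      ierr = MatSetType(A,MATMPISBAIJ);CHKERRQ(ierr);
      ierr = MatMPISBAIJSetPreallocation(A,2,3,PETSC_NULL,1,PETSC_NULL);CHKERRQ(ierr);
.ve
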
1792    Level: intermediate
1793 
1794 .keywords: matrix, block, aij, compressed row, sparse, parallel
1795 
1796 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1797 @*/
1798 
1799 int MatMPISBAIJSetPreallocation(Mat B,int bs,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
1800 {
1801   Mat_MPISBAIJ *b;
1802   int          ierr,i,mbs,Mbs;
1803   PetscTruth   flg2;
1804 
1805   PetscFunctionBegin;
1806   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg2);CHKERRQ(ierr);
1807   if (!flg2) PetscFunctionReturn(0);
1808 
1809   ierr = PetscOptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1810 
1811   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Invalid block size specified, must be positive");
1812   if (d_nz == PETSC_DECIDE || d_nz == PETSC_DEFAULT) d_nz = 3;
1813   if (o_nz == PETSC_DECIDE || o_nz == PETSC_DEFAULT) o_nz = 1;
1814   if (d_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %d",d_nz);
1815   if (o_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %d",o_nz);
1816   if (d_nnz) {
1817     for (i=0; i<B->m/bs; i++) {
1818       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
1819     }
1820   }
1821   if (o_nnz) {
1822     for (i=0; i<B->m/bs; i++) {
1823       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
1824     }
1825   }
1826   B->preallocated = PETSC_TRUE;
1827   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->m,&B->M);CHKERRQ(ierr);
1828   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->n,&B->N);CHKERRQ(ierr);
1829   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1830   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->cmap);CHKERRQ(ierr);
1831 
1832   b   = (Mat_MPISBAIJ*)B->data;
1833   mbs = B->m/bs;
1834   Mbs = B->M/bs;
1835   if (mbs*bs != B->m) {
1836     SETERRQ2(PETSC_ERR_ARG_SIZ,"Number of local rows %d must be divisible by blocksize %d",B->m,bs);
1837   }
1838 
1839   b->bs  = bs;
1840   b->bs2 = bs*bs;
1841   b->mbs = mbs;
1842   b->nbs = mbs;
1843   b->Mbs = Mbs;
1844   b->Nbs = Mbs;
1845 
1846   ierr = MPI_Allgather(&b->mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1847   b->rowners[0]    = 0;
1848   for (i=2; i<=b->size; i++) {
1849     b->rowners[i] += b->rowners[i-1];
1850   }
1851   b->rstart    = b->rowners[b->rank];
1852   b->rend      = b->rowners[b->rank+1];
1853   b->cstart    = b->rstart;
1854   b->cend      = b->rend;
1855   for (i=0; i<=b->size; i++) {
1856     b->rowners_bs[i] = b->rowners[i]*bs;
1857   }
1858   b->rstart_bs = b->rstart*bs;
1859   b->rend_bs   = b->rend*bs;
1860 
1861   b->cstart_bs = b->cstart*bs;
1862   b->cend_bs   = b->cend*bs;
1863 
1864 
1865   ierr = MatCreateSeqSBAIJ(PETSC_COMM_SELF,bs,B->m,B->m,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
1866   PetscLogObjectParent(B,b->A);
1867   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,B->m,B->M,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
1868   PetscLogObjectParent(B,b->B);
1869 
1870   /* build cache for off array entries formed */
1871   ierr = MatStashCreate_Private(B->comm,bs,&B->bstash);CHKERRQ(ierr);
1872 
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #undef __FUNCT__
1877 #define __FUNCT__ "MatCreateMPISBAIJ"
1878 /*@C
1879    MatCreateMPISBAIJ - Creates a sparse parallel matrix in symmetric block AIJ format
1880    (block compressed row).  For good matrix assembly performance
1881    the user should preallocate the matrix storage by setting the parameters
1882    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1883    performance can be increased by more than a factor of 50.
1884 
1885    Collective on MPI_Comm
1886 
1887    Input Parameters:
1888 +  comm - MPI communicator
1889 .  bs   - size of block
1890 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1891            This value should be the same as the local size used in creating the
1892            y vector for the matrix-vector product y = Ax.
1893 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1894            This value should be the same as the local size used in creating the
1895            x vector for the matrix-vector product y = Ax.
1896 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1897 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1898 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1899            submatrix  (same for all local rows)
1900 .  d_nnz - array containing the number of block nonzeros in the various block rows
1901            in the diagonal portion of the local submatrix (possibly different for each block
1902            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1903 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1904            submatrix (same for all local rows).
1905 -  o_nnz - array containing the number of block nonzeros in the various block rows of the
1906            off-diagonal portion of the local submatrix (possibly different for
1907            each block row) or PETSC_NULL.
1908 
1909    Output Parameter:
1910 .  A - the matrix
1911 
1912    Options Database Keys:
1913 .   -mat_no_unroll - uses code that does not unroll the loops in the
1914                      block calculations (much slower)
1915 .   -mat_block_size - size of the blocks to use
1916 .   -mat_mpi - use the parallel matrix data structures even on one processor
1917                (defaults to using SeqBAIJ format on one processor)
1918 
1919    Notes:
1920    The user MUST specify either the local or global matrix dimensions
1921    (possibly both).
1922 
1923    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1924    then it must be used on all processors that share the object for that argument.
1925 
1926    Storage Information:
1927    For a square global matrix we define each processor's diagonal portion
1928    to be its local rows and the corresponding columns (a square submatrix);
1929    each processor's off-diagonal portion encompasses the remainder of the
1930    local matrix (a rectangular submatrix).
1931 
1932    The user can specify preallocated storage for the diagonal part of
1933    the local submatrix with either d_nz or d_nnz (not both).  Set
1934    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1935    memory allocation.  Likewise, specify preallocated storage for the
1936    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1937 
1938    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1939    the figure below we depict these three local rows and all columns (0-11).
1940 
1941 .vb
1942            0 1 2 3 4 5 6 7 8 9 10 11
1943           -------------------
1944    row 3  |  o o o d d d o o o o o o
1945    row 4  |  o o o d d d o o o o o o
1946    row 5  |  o o o d d d o o o o o o
1947           -------------------
1948 .ve
1949 
1950    Thus, any entries in the d locations are stored in the d (diagonal)
1951    submatrix, and any entries in the o locations are stored in the
1952    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1953    stored simply in the MATSEQBAIJ format for compressed row storage.
1954 
1955    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1956    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1957    In general, for PDE problems in which most nonzeros are near the diagonal,
1958    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1959    or you will get TERRIBLE performance; see the users' manual chapter on
1960    matrices.
1961 
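   Example:
   A minimal sketch (M is a placeholder for the global dimension; the fill
   estimates are hypothetical, and a symmetric matrix must be square):

.vb
      Mat A;
      ierr = MatCreateMPISBAIJ(PETSC_COMM_WORLD,2,PETSC_DECIDE,PETSC_DECIDE,M,M,3,PETSC_NULL,1,PETSC_NULL,&A);CHKERRQ(ierr);
.ve
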
1962    Level: intermediate
1963 
1964 .keywords: matrix, block, aij, compressed row, sparse, parallel
1965 
1966 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1967 @*/
1968 
1969 int MatCreateMPISBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1970 {
1971   int ierr,size;
1972 
1973   PetscFunctionBegin;
1974   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
1975   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1976   if (size > 1) {
1977     ierr = MatSetType(*A,MATMPISBAIJ);CHKERRQ(ierr);
1978     ierr = MatMPISBAIJSetPreallocation(*A,bs,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1979   } else {
1980     ierr = MatSetType(*A,MATSEQSBAIJ);CHKERRQ(ierr);
1981     ierr = MatSeqSBAIJSetPreallocation(*A,bs,d_nz,d_nnz);CHKERRQ(ierr);
1982   }
1983   PetscFunctionReturn(0);
1984 }
1985 
1986 
1987 #undef __FUNCT__
1988 #define __FUNCT__ "MatDuplicate_MPISBAIJ"
1989 static int MatDuplicate_MPISBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1990 {
1991   Mat          mat;
1992   Mat_MPISBAIJ *a,*oldmat = (Mat_MPISBAIJ*)matin->data;
1993   int          ierr,len=0;
1994 
1995   PetscFunctionBegin;
1996   *newmat       = 0;
1997   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1998   ierr = MatSetType(mat,MATMPISBAIJ);CHKERRQ(ierr);
1999   mat->preallocated = PETSC_TRUE;
2000   a = (Mat_MPISBAIJ*)mat->data;
2001   a->bs  = oldmat->bs;
2002   a->bs2 = oldmat->bs2;
2003   a->mbs = oldmat->mbs;
2004   a->nbs = oldmat->nbs;
2005   a->Mbs = oldmat->Mbs;
2006   a->Nbs = oldmat->Nbs;
2007 
2008   a->rstart       = oldmat->rstart;
2009   a->rend         = oldmat->rend;
2010   a->cstart       = oldmat->cstart;
2011   a->cend         = oldmat->cend;
2012   a->size         = oldmat->size;
2013   a->rank         = oldmat->rank;
2014   a->donotstash   = oldmat->donotstash;
2015   a->roworiented  = oldmat->roworiented;
2016   a->rowindices   = 0;
2017   a->rowvalues    = 0;
2018   a->getrowactive = PETSC_FALSE;
2019   a->barray       = 0;
2020   a->rstart_bs    = oldmat->rstart_bs;
2021   a->rend_bs      = oldmat->rend_bs;
2022   a->cstart_bs    = oldmat->cstart_bs;
2023   a->cend_bs      = oldmat->cend_bs;
2024 
2025   /* hash table stuff */
2026   a->ht           = 0;
2027   a->hd           = 0;
2028   a->ht_size      = 0;
2029   a->ht_flag      = oldmat->ht_flag;
2030   a->ht_fact      = oldmat->ht_fact;
2031   a->ht_total_ct  = 0;
2032   a->ht_insert_ct = 0;
2033 
2034   ierr = PetscMalloc(3*(a->size+2)*sizeof(int),&a->rowners);CHKERRQ(ierr);
2035   PetscLogObjectMemory(mat,3*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
2036   a->cowners    = a->rowners + a->size + 2;
2037   a->rowners_bs = a->cowners + a->size + 2;
2038   ierr = PetscMemcpy(a->rowners,oldmat->rowners,3*(a->size+2)*sizeof(int));CHKERRQ(ierr);
2039   ierr = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
2040   ierr = MatStashCreate_Private(matin->comm,oldmat->bs,&mat->bstash);CHKERRQ(ierr);
2041   if (oldmat->colmap) {
2042 #if defined (PETSC_USE_CTABLE)
2043     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
2044 #else
2045     ierr = PetscMalloc((a->Nbs)*sizeof(int),&a->colmap);CHKERRQ(ierr);
2046     PetscLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2047     ierr = PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));CHKERRQ(ierr);
2048 #endif
2049   } else a->colmap = 0;
2050   if (oldmat->garray && (len = ((Mat_SeqBAIJ*)(oldmat->B->data))->nbs)) {
2051     ierr = PetscMalloc(len*sizeof(int),&a->garray);CHKERRQ(ierr);
2052     PetscLogObjectMemory(mat,len*sizeof(int));
2053     ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr);
2054   } else a->garray = 0;
2055 
2056   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
2057   PetscLogObjectParent(mat,a->lvec);
2058   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
2059 
2060   PetscLogObjectParent(mat,a->Mvctx);
2061   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
2062   PetscLogObjectParent(mat,a->A);
2063   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
2064   PetscLogObjectParent(mat,a->B);
2065   ierr = PetscFListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
2066   *newmat = mat;
2067   PetscFunctionReturn(0);
2068 }
2069 
2070 #include "petscsys.h"
2071 
2072 EXTERN_C_BEGIN
2073 #undef __FUNCT__
2074 #define __FUNCT__ "MatLoad_MPISBAIJ"
2075 int MatLoad_MPISBAIJ(PetscViewer viewer,MatType type,Mat *newmat)
2076 {
2077   Mat          A;
2078   int          i,nz,ierr,j,rstart,rend,fd;
2079   PetscScalar  *vals,*buf;
2080   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2081   MPI_Status   status;
2082   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2083   int          *locrowlens,*sndcounts = 0,*procsnz = 0,jj,*mycols,*ibuf;
2084   int          tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2085   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2086   int          dcount,kmax,k,nzcount,tmp;
2087 
2088   PetscFunctionBegin;
2089   ierr = PetscOptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
2090 
2091   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2092   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
2093   if (!rank) {
2094     ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
2095     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
2096     if (header[0] != MAT_FILE_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not matrix object");
2097     if (header[3] < 0) {
2098       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix stored in special format, cannot load as MPISBAIJ");
2099     }
2100   }
2101 
2102   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2103   M = header[1]; N = header[2];
2104 
2105   if (M != N) SETERRQ(PETSC_ERR_SUP,"Can only do square matrices");
2106 
2107   /*
2108      This code adds extra rows to make sure the number of rows is
2109      divisible by the blocksize
2110   */
2111   Mbs        = M/bs;
2112   extra_rows = bs - M + bs*(Mbs);
2113   if (extra_rows == bs) extra_rows = 0;
2114   else                  Mbs++;
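  /* e.g. with bs=3 and M=10: Mbs = 10/3 = 3 and extra_rows = 3 - 10 + 9 = 2,
     so Mbs becomes 4 and two padding rows (given unit diagonal entries below) are appended */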
2115   if (extra_rows && !rank) {
2116     PetscLogInfo(0,"MatLoad_MPISBAIJ:Padding loaded matrix to match blocksize\n");
2117   }
2118 
2119   /* determine ownership of all rows */
2120   mbs        = Mbs/size + ((Mbs % size) > rank);
2121   m          = mbs*bs;
2122   ierr       = PetscMalloc(2*(size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
2123   browners   = rowners + size + 1;
2124   ierr       = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2125   rowners[0] = 0;
2126   for (i=2; i<=size; i++) rowners[i] += rowners[i-1];
2127   for (i=0; i<=size;  i++) browners[i] = rowners[i]*bs;
2128   rstart = rowners[rank];
2129   rend   = rowners[rank+1];
2130 
2131   /* distribute row lengths to all processors */
2132   ierr = PetscMalloc((rend-rstart)*bs*sizeof(int),&locrowlens);CHKERRQ(ierr);
2133   if (!rank) {
2134     ierr = PetscMalloc((M+extra_rows)*sizeof(int),&rowlengths);CHKERRQ(ierr);
2135     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
2136     for (i=0; i<extra_rows; i++) rowlengths[M+i] = 1;
2137     ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
2138     for (i=0; i<size; i++) sndcounts[i] = browners[i+1] - browners[i];
2139     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2140     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
2141   } else {
2142     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2143   }
2144 
2145   if (!rank) {   /* procs[0] */
2146     /* calculate the number of nonzeros on each processor */
2147     ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
2148     ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
2149     for (i=0; i<size; i++) {
2150       for (j=rowners[i]*bs; j< rowners[i+1]*bs; j++) {
2151         procsnz[i] += rowlengths[j];
2152       }
2153     }
2154     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
2155 
2156     /* determine max buffer needed and allocate it */
2157     maxnz = 0;
2158     for (i=0; i<size; i++) {
2159       maxnz = PetscMax(maxnz,procsnz[i]);
2160     }
2161     ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);
2162 
2163     /* read in my part of the matrix column indices  */
2164     nz     = procsnz[0];
2165     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2166     mycols = ibuf;
2167     if (size == 1)  nz -= extra_rows;
2168     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
2169     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2170 
2171     /* read in everyone else's (except the last proc's) and ship off */
2172     for (i=1; i<size-1; i++) {
2173       nz   = procsnz[i];
2174       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2175       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2176     }
2177     /* read in the stuff for the last proc */
2178     if (size != 1) {
2179       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2180       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2181       for (i=0; i<extra_rows; i++) cols[nz+i] = M+i;
2182       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2183     }
2184     ierr = PetscFree(cols);CHKERRQ(ierr);
2185   } else {  /* procs[i], i>0 */
2186     /* determine buffer space needed for message */
2187     nz = 0;
2188     for (i=0; i<m; i++) {
2189       nz += locrowlens[i];
2190     }
2191     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2192     mycols = ibuf;
2193     /* receive message of column indices*/
2194     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2195     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2196     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with file");
2197   }
2198 
2199   /* loop over local rows, determining number of off diagonal entries */
2200   ierr     = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&dlens);CHKERRQ(ierr);
2201   odlens   = dlens + (rend-rstart);
2202   ierr     = PetscMalloc(3*Mbs*sizeof(int),&mask);CHKERRQ(ierr);
2203   ierr     = PetscMemzero(mask,3*Mbs*sizeof(int));CHKERRQ(ierr);
2204   masked1  = mask    + Mbs;
2205   masked2  = masked1 + Mbs;
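  /* mask[] flags the block columns already seen in the current block row;
     masked1[]/masked2[] record which diagonal/off-diagonal flags were set so
     that only those entries are cleared before the next block row */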
2206   rowcount = 0; nzcount = 0;
2207   for (i=0; i<mbs; i++) {
2208     dcount  = 0;
2209     odcount = 0;
2210     for (j=0; j<bs; j++) {
2211       kmax = locrowlens[rowcount];
2212       for (k=0; k<kmax; k++) {
2213         tmp = mycols[nzcount++]/bs; /* block col. index */
2214         if (!mask[tmp]) {
2215           mask[tmp] = 1;
2216           if (tmp < rstart || tmp >= rend) masked2[odcount++] = tmp; /* entry in off-diag portion */
2217           else masked1[dcount++] = tmp; /* entry in diag portion */
2218         }
2219       }
2220       rowcount++;
2221     }
2222 
2223     dlens[i]  = dcount;  /* d_nzz[i] */
2224     odlens[i] = odcount; /* o_nzz[i] */
2225 
2226     /* zero out the mask elements we set */
2227     for (j=0; j<dcount; j++) mask[masked1[j]] = 0;
2228     for (j=0; j<odcount; j++) mask[masked2[j]] = 0;
2229   }
2230 
2231   /* create our matrix */
2232   ierr = MatCreateMPISBAIJ(comm,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,0,dlens,0,odlens,newmat);CHKERRQ(ierr);
2234   A = *newmat;
2235   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
2236 
2237   if (!rank) {
2238     ierr = PetscMalloc(maxnz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2239     /* read in my part of the matrix numerical values  */
2240     nz = procsnz[0];
2241     vals = buf;
2242     mycols = ibuf;
2243     if (size == 1)  nz -= extra_rows;
2244     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2245     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2246 
2247     /* insert into matrix */
2248     jj      = rstart*bs;
2249     for (i=0; i<m; i++) {
2250       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2251       mycols += locrowlens[i];
2252       vals   += locrowlens[i];
2253       jj++;
2254     }
2255 
2256     /* read in other processors (except the last one) and ship out */
2257     for (i=1; i<size-1; i++) {
2258       nz   = procsnz[i];
2259       vals = buf;
2260       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2261       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2262     }
2263     /* the last proc */
2264     if (size != 1) {
2265       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2266       vals = buf;
2267       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2268       for (i=0; i<extra_rows; i++) vals[nz+i] = 1.0;
2269       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2270     }
2271     ierr = PetscFree(procsnz);CHKERRQ(ierr);
2272 
2273   } else {
2274     /* receive numeric values */
2275     ierr = PetscMalloc(nz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2276 
2277     /* receive message of values*/
2278     vals   = buf;
2279     mycols = ibuf;
2280     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2281     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2282     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with file");
2283 
2284     /* insert into matrix */
2285     jj      = rstart*bs;
2286     for (i=0; i<m; i++) {
2287       ierr    = MatSetValues_MPISBAIJ(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2288       mycols += locrowlens[i];
2289       vals   += locrowlens[i];
2290       jj++;
2291     }
2292   }
2293 
2294   ierr = PetscFree(locrowlens);CHKERRQ(ierr);
2295   ierr = PetscFree(buf);CHKERRQ(ierr);
2296   ierr = PetscFree(ibuf);CHKERRQ(ierr);
2297   ierr = PetscFree(rowners);CHKERRQ(ierr);
2298   ierr = PetscFree(dlens);CHKERRQ(ierr);
2299   ierr = PetscFree(mask);CHKERRQ(ierr);
2300   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2301   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2302   PetscFunctionReturn(0);
2303 }
2304 EXTERN_C_END
2305 
2306 #undef __FUNCT__
2307 #define __FUNCT__ "MatMPISBAIJSetHashTableFactor"
2308 /*@
2309    MatMPISBAIJSetHashTableFactor - Sets the factor used to compute the size of the hash table.
2310 
2311    Collective on Mat
2312 
2313    Input Parameters:
2314 +  mat  - the matrix
2315 -  fact - the hash table factor
2316 
2317    Level: advanced
2318 
2319   Notes:
2320    This can also be set by the command line option: -mat_use_hash_table fact
2321 
2322 .keywords: matrix, hashtable, factor, HT
2323 
2324 .seealso: MatSetOption()
2325 @*/
2326 int MatMPISBAIJSetHashTableFactor(Mat mat,PetscReal fact)
2327 {
2328   PetscFunctionBegin;
2329   SETERRQ(1,"Function not yet written for SBAIJ format");
2330   /* PetscFunctionReturn(0); */
2331 }
2332 
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatGetRowMax_MPISBAIJ"
2335 int MatGetRowMax_MPISBAIJ(Mat A,Vec v)
2336 {
2337   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
2338   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(a->B)->data;
2339   PetscReal    atmp;
2340   PetscReal    *work,*svalues,*rvalues;
2341   int          ierr,i,bs,mbs,*bi,*bj,brow,j,ncols,krow,kcol,col,row,Mbs,bcol;
2342   int          rank,size,*rowners_bs,dest,count,source;
2343   PetscScalar  *va;
2344   MatScalar    *ba;
2345   MPI_Status   stat;
2346 
2347   PetscFunctionBegin;
2348   ierr = MatGetRowMax(a->A,v);CHKERRQ(ierr);
2349   ierr = VecGetArray(v,&va);CHKERRQ(ierr);
2350 
2351   ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
2352   ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);
2353 
2354   bs   = a->bs;
2355   mbs  = a->mbs;
2356   Mbs  = a->Mbs;
2357   ba   = b->a;
2358   bi   = b->i;
2359   bj   = b->j;
2364 
2365   /* find ownerships */
2366   rowners_bs = a->rowners_bs;
2372 
2373   /* each proc creates an array to be distributed */
2374   ierr = PetscMalloc(bs*Mbs*sizeof(PetscReal),&work);CHKERRQ(ierr);
2375   ierr = PetscMemzero(work,bs*Mbs*sizeof(PetscReal));CHKERRQ(ierr);
2376 
2377   /* row_max for B */
2378   if (rank != size-1){
2379     for (i=0; i<mbs; i++) {
2380       ncols = bi[1] - bi[0]; bi++;
2381       brow  = bs*i;
2382       for (j=0; j<ncols; j++){
2383         bcol = bs*(*bj);
2384         for (kcol=0; kcol<bs; kcol++){
2385           col = bcol + kcol;                 /* local col index */
2386           col += rowners_bs[rank+1];      /* global col index */
2388           for (krow=0; krow<bs; krow++){
2389             atmp = PetscAbsScalar(*ba); ba++;
2390             row = brow + krow;    /* local row index */
2392             if (PetscRealPart(va[row]) < atmp) va[row] = atmp;
2393             if (work[col] < atmp) work[col] = atmp;
2394           }
2395         }
2396         bj++;
2397       }
2398     }
2404 
2405     /* send values to its owners */
2406     for (dest=rank+1; dest<size; dest++){
2407       svalues = work + rowners_bs[dest];
2408       count   = rowners_bs[dest+1]-rowners_bs[dest];
2409       ierr    = MPI_Send(svalues,count,MPIU_REAL,dest,rank,A->comm);CHKERRQ(ierr);
2414     }
2415   }
2416 
2417   /* receive values */
2418   if (rank){
2419     rvalues = work;
2420     count   = rowners_bs[rank+1]-rowners_bs[rank];
2421     for (source=0; source<rank; source++){
2422       ierr = MPI_Recv(rvalues,count,MPIU_REAL,MPI_ANY_SOURCE,MPI_ANY_TAG,A->comm,&stat);CHKERRQ(ierr);
2423       /* process values */
2424       for (i=0; i<count; i++){
2425         if (PetscRealPart(va[i]) < rvalues[i]) va[i] = rvalues[i];
2426       }
2431     }
2432   }
2433 
2434   ierr = VecRestoreArray(v,&va);CHKERRQ(ierr);
2435   ierr = PetscFree(work);CHKERRQ(ierr);
2436   PetscFunctionReturn(0);
2437 }
2438 
2439 #undef __FUNCT__
2440 #define __FUNCT__ "MatRelax_MPISBAIJ"
2441 int MatRelax_MPISBAIJ(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,int lits,Vec xx)
2442 {
2443   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
2444   int            ierr,mbs=mat->mbs,bs=mat->bs;
2445   PetscScalar    mone=-1.0,*x,*b,*ptr,zero=0.0;
2446   Vec            bb1;
2447 
2448   PetscFunctionBegin;
2449   if (its <= 0 || lits <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %d and local its %d both positive",its,lits);
2450   if (bs > 1)
2451     SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented");
2452 
2453   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
2454     if ( flag & SOR_ZERO_INITIAL_GUESS ) {
2455       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,lits,lits,xx);CHKERRQ(ierr);
2456       its--;
2457     }
2458 
2459     ierr = VecDuplicate(bb,&bb1);CHKERRQ(ierr);
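    /* Each sweep forms bb1 = bb - (B + B^T)*xx with a single forward scatter:
       slvec0 is packed with [-xx; -B^T*xx] and added into slvec1 = [bb; 0];
       the multadd with B below then folds in the upper triangular contribution. */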
2460     while (its--){
2461 
2462       /* lower triangular part: slvec0b = - B^T*xx */
2463       ierr = (*mat->B->ops->multtranspose)(mat->B,xx,mat->slvec0b);CHKERRQ(ierr);
2464 
2465       /* copy xx into slvec0a */
2466       ierr = VecGetArray(mat->slvec0,&ptr);CHKERRQ(ierr);
2467       ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2468       ierr = PetscMemcpy(ptr,x,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2469       ierr = VecRestoreArray(mat->slvec0,&ptr);CHKERRQ(ierr);
2470 
2471       ierr = VecScale(&mone,mat->slvec0);CHKERRQ(ierr);
2472 
2473       /* copy bb into slvec1a */
2474       ierr = VecGetArray(mat->slvec1,&ptr);CHKERRQ(ierr);
2475       ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2476       ierr = PetscMemcpy(ptr,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2477       ierr = VecRestoreArray(mat->slvec1,&ptr);CHKERRQ(ierr);
2478 
2479       /* set slvec1b = 0 */
2480       ierr = VecSet(&zero,mat->slvec1b);CHKERRQ(ierr);
2481 
2482       ierr = VecScatterBegin(mat->slvec0,mat->slvec1,ADD_VALUES,SCATTER_FORWARD,mat->sMvctx);CHKERRQ(ierr);
2483       ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2484       ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2485       ierr = VecScatterEnd(mat->slvec0,mat->slvec1,ADD_VALUES,SCATTER_FORWARD,mat->sMvctx);CHKERRQ(ierr);
2486 
2487       /* upper triangular part: bb1 = bb1 - B*x */
2488       ierr = (*mat->B->ops->multadd)(mat->B,mat->slvec1b,mat->slvec1a,bb1);CHKERRQ(ierr);
2489 
2490       /* local diagonal sweep */
2491       ierr = (*mat->A->ops->relax)(mat->A,bb1,omega,SOR_SYMMETRIC_SWEEP,fshift,lits,lits,xx);CHKERRQ(ierr);
2492     }
2493     ierr = VecDestroy(bb1);CHKERRQ(ierr);
2494   } else {
2495     SETERRQ(PETSC_ERR_SUP,"MatSORType is not supported for SBAIJ matrix format");
2496   }
2497   PetscFunctionReturn(0);
2498 }
2499 
2500 #undef __FUNCT__
2501 #define __FUNCT__ "MatRelax_MPISBAIJ_2comm"
2502 int MatRelax_MPISBAIJ_2comm(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,int lits,Vec xx)
2503 {
2504   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
2505   int            ierr;
2506   PetscScalar    mone=-1.0;
2507   Vec            lvec1,bb1;
2508 
2509   PetscFunctionBegin;
2510   if (its <= 0 || lits <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %d and local its %d both positive",its,lits);
2511   if (mat->bs > 1)
2512     SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented");
2513 
2514   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
2515     if ( flag & SOR_ZERO_INITIAL_GUESS ) {
2516       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,lits,lits,xx);CHKERRQ(ierr);
2517       its--;
2518     }
2519 
2520     ierr = VecDuplicate(mat->lvec,&lvec1);CHKERRQ(ierr);
2521     ierr = VecDuplicate(bb,&bb1);CHKERRQ(ierr);
2522     while (its--){
2523       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
2524 
2525       /* lower diagonal part: bb1 = bb - B^T*xx */
2526       ierr = (*mat->B->ops->multtranspose)(mat->B,xx,lvec1);CHKERRQ(ierr);
2527       ierr = VecScale(&mone,lvec1);CHKERRQ(ierr);
2528 
2529       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
2530       ierr = VecCopy(bb,bb1);CHKERRQ(ierr);
2531       ierr = VecScatterBegin(lvec1,bb1,ADD_VALUES,SCATTER_REVERSE,mat->Mvctx);CHKERRQ(ierr);
2532 
2533       /* upper diagonal part: bb1 = bb1 - B*x */
2534       ierr = VecScale(&mone,mat->lvec);CHKERRQ(ierr);
2535       ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,bb1,bb1);CHKERRQ(ierr);
2536 
2537       ierr = VecScatterEnd(lvec1,bb1,ADD_VALUES,SCATTER_REVERSE,mat->Mvctx);CHKERRQ(ierr);
2538 
2539       /* diagonal sweep */
2540       ierr = (*mat->A->ops->relax)(mat->A,bb1,omega,SOR_SYMMETRIC_SWEEP,fshift,lits,lits,xx);CHKERRQ(ierr);
2541     }
2542     ierr = VecDestroy(lvec1);CHKERRQ(ierr);
2543     ierr = VecDestroy(bb1);CHKERRQ(ierr);
2544   } else {
2545     SETERRQ(PETSC_ERR_SUP,"MatSORType is not supported for SBAIJ matrix format");
2546   }
2547   PetscFunctionReturn(0);
2548 }
2549 
2550