xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision f7cf7585ec4ab5bbe01f2b7df7e992ca325859cc)
1 /*$Id: sbaij2.c,v 1.3 2000/06/25 17:01:23 balay Exp balay $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
17   int         start,end,*ai,*aj,bs,*nidx2;
18   PetscBT     table;
19 
20   PetscFunctionBegin;
21   m     = a->mbs;
22   ai    = a->i;
23   aj    = a->j;
24   bs    = a->bs;
25 
26   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
27 
28   ierr  = PetscBTCreate(m,table);CHKERRQ(ierr);
29   nidx  = (int*)PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx);
30   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2);
31 
32   for (i=0; i<is_max; i++) {
33     /* Initialise the two local arrays */
34     isz  = 0;
35     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39     ierr = ISGetSize(is[i],&n);CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for (j=0; j<n ; ++j){
43       ival = idx[j]/bs; /* convert the indices into block indices */
44       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
45       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap*/
51       n = isz;
52       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end ; l++){
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++)
65         nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 #undef __FUNC__
76 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
77 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
78 {
79   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
80   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
81   int          row,mat_i,*mat_j,tcol,*mat_ilen;
82   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
83   int          *aj = a->j,*ai = a->i;
84   MatScalar    *mat_a;
85   Mat          C;
86   PetscTruth   flag;
87 
88   PetscFunctionBegin;
89 
90   if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
91   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
92   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
96 
97   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
98   ssmap = smap;
99   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
100   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
101   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107       for (k=kstart; k<kend; k++) {
108         if (ssmap[aj[k]]) {
109           lens[i]++;
110         }
111       }
112     }
113   /* Create and fill new matrix */
114   if (scall == MAT_REUSE_MATRIX) {
115     c = (Mat_SeqSBAIJ *)((*B)->data);
116 
117     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
118     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
119     if (flag == PETSC_FALSE) {
120       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121     }
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
126   }
127   c = (Mat_SeqSBAIJ *)(C->data);
128   for (i=0; i<nrows; i++) {
129     row    = irow[i];
130     kstart = ai[row];
131     kend   = kstart + a->ilen[row];
132     mat_i  = c->i[i];
133     mat_j  = c->j + mat_i;
134     mat_a  = c->a + mat_i*bs2;
135     mat_ilen = c->ilen + i;
136     for (k=kstart; k<kend; k++) {
137       if ((tcol=ssmap[a->j[k]])) {
138         *mat_j++ = tcol - 1;
139         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
140         mat_a   += bs2;
141         (*mat_ilen)++;
142       }
143     }
144   }
145 
146   /* Free work space */
147   ierr = PetscFree(smap);CHKERRQ(ierr);
148   ierr = PetscFree(lens);CHKERRQ(ierr);
149   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
150   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151 
152   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
153   *B = C;
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNC__
158 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
159 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
160 {
161   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
162   IS          is1;
163   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
164 
165   PetscFunctionBegin;
166   if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
167 
168   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
169   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
170 
171   /* Verify if the indices corespond to each element in a block
172    and form the IS with compressed IS */
173   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
174   iary = vary + a->mbs;
175   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
176   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
177 
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
184 
185   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
186   ierr = PetscFree(vary);CHKERRQ(ierr);
187 
188   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
189   ISDestroy(is1);
190   PetscFunctionReturn(0);
191 }
192 
193 #undef __FUNC__
194 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
195 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
196 {
197   int ierr,i;
198 
199   PetscFunctionBegin;
200   if (scall == MAT_INITIAL_MATRIX) {
201     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
202   }
203 
204   for (i=0; i<n; i++) {
205     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
206   }
207   PetscFunctionReturn(0);
208 }
209 
210 /* -------------------------------------------------------*/
211 /* Should check that shapes of vectors and matrices match */
212 /* -------------------------------------------------------*/
213 #include "pinclude/blaslapack.h"
214 
215 #undef __FUNC__
216 #define __FUNC__ "MatMult_SeqSBAIJ_1"
217 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
218 {
219   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
220   Scalar          *x,*z,*xb,x1,zero=0.0;
221   MatScalar       *v;
222   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
223 
224   PetscFunctionBegin;
225   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
226   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
227   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
228 
229   v  = a->a;
230   xb = x;
231 
232   for (i=0; i<mbs; i++) {
233     n  = ai[1] - ai[0];  /* length of i_th row of A */
234     x1 = xb[0];
235     ib = aj + *ai;
236     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
237     for (j=1; j<n; j++) {
238       cval    = *ib;
239       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
240       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
241     }
242     xb++; ai++;
243   }
244 
245   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
246   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
247   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNC__
252 #define __FUNC__ "MatMult_SeqSBAIJ_2"
253 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
254 {
255   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
256   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
257   MatScalar       *v;
258   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
259 
260 
261   PetscFunctionBegin;
262   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
263   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
264   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
265 
266   v     = a->a;
267   xb = x;
268 
269   for (i=0; i<mbs; i++) {
270     n  = ai[1] - ai[0]; /* length of i_th block row of A */
271     x1 = xb[0]; x2 = xb[1];
272     ib = aj + *ai;
273     /* (diag of A)*x */
274     z[2*i]   += v[0]*x1 + v[2]*x2;
275     z[2*i+1] += v[2]*x1 + v[3]*x2;
276     v += 4;
277     for (j=1; j<n; j++) {
278       /* (strict lower triangular part of A)*x  */
279       cval       = ib[j]*2;
280       z[cval]     += v[0]*x1 + v[1]*x2;
281       z[cval+1]   += v[2]*x1 + v[3]*x2;
282       /* (strict upper triangular part of A)*x  */
283       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
284       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
285       v  += 4;
286     }
287     xb +=2; ai++;
288   }
289 
290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
292   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
293   PetscFunctionReturn(0);
294 }
295 
296 #undef __FUNC__
297 #define __FUNC__ "MatMult_SeqSBAIJ_3"
298 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
299 {
300   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
301   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
302   MatScalar       *v;
303   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
304 
305 
306   PetscFunctionBegin;
307   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
308   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
309   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
310 
311   v     = a->a;
312   xb = x;
313 
314   for (i=0; i<mbs; i++) {
315     n  = ai[1] - ai[0]; /* length of i_th block row of A */
316     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
317     ib = aj + *ai;
318     /* (diag of A)*x */
319     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
320     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
321     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
322     v += 9;
323     for (j=1; j<n; j++) {
324       /* (strict lower triangular part of A)*x  */
325       cval       = ib[j]*3;
326       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
327       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
328       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
329       /* (strict upper triangular part of A)*x  */
330       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
331       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
332       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
333       v  += 9;
334     }
335     xb +=3; ai++;
336   }
337 
338   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
339   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
340   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
341   PetscFunctionReturn(0);
342 }
343 
344 #undef __FUNC__
345 #define __FUNC__ "MatMult_SeqSBAIJ_4"
346 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
347 {
348   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
349   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
350   MatScalar       *v;
351   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
352 
353   PetscFunctionBegin;
354   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
355   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
357 
358   v     = a->a;
359   xb = x;
360 
361   for (i=0; i<mbs; i++) {
362     n  = ai[1] - ai[0]; /* length of i_th block row of A */
363     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
364     ib = aj + *ai;
365     /* (diag of A)*x */
366     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
367     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
368     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
369     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
370     v += 16;
371     for (j=1; j<n; j++) {
372       /* (strict lower triangular part of A)*x  */
373       cval       = ib[j]*4;
374       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
375       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
376       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
377       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
378       /* (strict upper triangular part of A)*x  */
379       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
380       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
381       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
382       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
383       v  += 16;
384     }
385     xb +=4; ai++;
386   }
387 
388   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
389   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
390   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
391   PetscFunctionReturn(0);
392 }
393 
394 #undef __FUNC__
395 #define __FUNC__ "MatMult_SeqSBAIJ_5"
396 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
397 {
398   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
399   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
400   MatScalar       *v;
401   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
402 
403   PetscFunctionBegin;
404   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
405   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
406   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
407 
408   v     = a->a;
409   xb = x;
410 
411   for (i=0; i<mbs; i++) {
412     n  = ai[1] - ai[0]; /* length of i_th block row of A */
413     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
414     ib = aj + *ai;
415     /* (diag of A)*x */
416     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
417     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
418     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
419     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
420     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
421     v += 25;
422     for (j=1; j<n; j++) {
423       /* (strict lower triangular part of A)*x  */
424       cval       = ib[j]*5;
425       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
426       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
427       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
428       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
429       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
430       /* (strict upper triangular part of A)*x  */
431       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
432       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
433       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
434       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
435       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
436       v  += 25;
437     }
438     xb +=5; ai++;
439   }
440 
441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
443   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
444   PetscFunctionReturn(0);
445 }
446 
447 
448 #undef __FUNC__
449 #define __FUNC__ "MatMult_SeqSBAIJ_6"
450 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
451 {
452   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
453   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
454   MatScalar       *v;
455   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
456 
457   PetscFunctionBegin;
458   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
459   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
460   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
461 
462   v     = a->a;
463   xb = x;
464 
465   for (i=0; i<mbs; i++) {
466     n  = ai[1] - ai[0]; /* length of i_th block row of A */
467     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
468     ib = aj + *ai;
469     /* (diag of A)*x */
470     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
471     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
472     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
473     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
474     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
475     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
476     v += 36;
477     for (j=1; j<n; j++) {
478       /* (strict lower triangular part of A)*x  */
479       cval       = ib[j]*6;
480       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
481       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
482       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
483       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
484       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
485       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
486       /* (strict upper triangular part of A)*x  */
487       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
488       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
489       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
490       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
491       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
492       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
493       v  += 36;
494     }
495     xb +=6; ai++;
496   }
497 
498   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
499   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
500   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
501   PetscFunctionReturn(0);
502 }
503 #undef __FUNC__
504 #define __FUNC__ "MatMult_SeqSBAIJ_7"
505 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
506 {
507   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
508   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
509   MatScalar       *v;
510   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
511 
512   PetscFunctionBegin;
513   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
516 
517   v     = a->a;
518   xb = x;
519 
520   for (i=0; i<mbs; i++) {
521     n  = ai[1] - ai[0]; /* length of i_th block row of A */
522     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
523     ib = aj + *ai;
524     /* (diag of A)*x */
525     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
526     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
527     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
528     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
529     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
530     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
531     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
532     v += 49;
533     for (j=1; j<n; j++) {
534       /* (strict lower triangular part of A)*x  */
535       cval       = ib[j]*7;
536       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
537       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
538       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
539       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
540       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
541       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
542       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
543       /* (strict upper triangular part of A)*x  */
544       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
545       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
546       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
547       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
548       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
549       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
550       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
551       v  += 49;
552     }
553     xb +=7; ai++;
554   }
555   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
556   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
557   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
558   PetscFunctionReturn(0);
559 }
560 
561 /*
562     This will not work with MatScalar == float because it calls the BLAS
563 */
564 #undef __FUNC__
565 #define __FUNC__ "MatMult_SeqSBAIJ_N"
566 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
567 {
568   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
569   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
570   MatScalar       *v;
571   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
572   int             ncols,k;
573 
574   PetscFunctionBegin;
575   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
576   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
577   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
578 
579   aj   = a->j;
580   v    = a->a;
581   ii   = a->i;
582 
583   if (!a->mult_work) {
584     k = a->m;
585     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
586   }
587   work = a->mult_work;
588 
589   for (i=0; i<mbs; i++) {
590     n     = ii[1] - ii[0]; ncols = n*bs;
591     workt = work; idx=aj+ii[0];
592 
593     /* upper triangular part */
594     for (j=0; j<n; j++) {
595       /* col = bs*(*idx); */
596       xb = x_ptr + bs*(*idx++);
597       for (k=0; k<bs; k++) workt[k] = xb[k];
598       workt += bs;
599     }
600     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
601     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
602 
603     /* strict lower triangular part */
604     ncols -= bs;
605     if (ncols > 0){
606       workt = work;
607       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
608       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
609 
610       idx=aj+ii[0]+1;
611       for (j=1; j<n; j++) {
612         zb = z_ptr + bs*(*idx);
613         idx++;
614         for (k=0; k<bs; k++) zb[k] += workt[k] ;
615         workt += bs;
616       }
617     }
618 
619     x += bs; v += n*bs2; z += bs; ii++;
620   }
621 
622   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
623   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
624   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
625   PetscFunctionReturn(0);
626 }
627 
628 #undef __FUNC__
629 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
630 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
631 {
632   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
633   Scalar          *x,*y,*z,*xb,x1;
634   MatScalar       *v;
635   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
636 
637   PetscFunctionBegin;
638   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
639   if (yy != xx) {
640     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
641   } else {
642     y = x;
643   }
644   if (zz != yy) {
645     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
646     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
647   } else {
648     z = y;
649   }
650 
651   v  = a->a;
652   xb = x;
653 
654   for (i=0; i<mbs; i++) {
655     n  = ai[1] - ai[0];  /* length of i_th row of A */
656     x1 = xb[0];
657     ib = aj + *ai;
658     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
659     for (j=1; j<n; j++) {
660       cval    = *ib;
661       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
662       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
663     }
664     xb++; ai++;
665   }
666 
667   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
668   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
669   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
670 
671   PLogFlops(2*(a->s_nz*2 - a->m));
672   PetscFunctionReturn(0);
673 }
674 
675 #undef __FUNC__
676 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
677 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
678 {
679   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
680   Scalar          *x,*y,*z,*xb,x1,x2;
681   MatScalar       *v;
682   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
683 
684   PetscFunctionBegin;
685   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
686   if (yy != xx) {
687     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
688   } else {
689     y = x;
690   }
691   if (zz != yy) {
692     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
693     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
694   } else {
695     z = y;
696   }
697 
698   v     = a->a;
699   xb = x;
700 
701   for (i=0; i<mbs; i++) {
702     n  = ai[1] - ai[0]; /* length of i_th block row of A */
703     x1 = xb[0]; x2 = xb[1];
704     ib = aj + *ai;
705     /* (diag of A)*x */
706     z[2*i]   += v[0]*x1 + v[2]*x2;
707     z[2*i+1] += v[2]*x1 + v[3]*x2;
708     v += 4;
709     for (j=1; j<n; j++) {
710       /* (strict lower triangular part of A)*x  */
711       cval       = ib[j]*2;
712       z[cval]     += v[0]*x1 + v[1]*x2;
713       z[cval+1]   += v[2]*x1 + v[3]*x2;
714       /* (strict upper triangular part of A)*x  */
715       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
716       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
717       v  += 4;
718     }
719     xb +=2; ai++;
720   }
721 
722   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
723   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
724   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
725 
726   PLogFlops(4*(a->s_nz*2 - a->m));
727   PetscFunctionReturn(0);
728 }
729 
730 #undef __FUNC__
731 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
732 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
733 {
734   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
735   Scalar          *x,*y,*z,*xb,x1,x2,x3;
736   MatScalar       *v;
737   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
738 
739   PetscFunctionBegin;
740   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
741   if (yy != xx) {
742     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
743   } else {
744     y = x;
745   }
746   if (zz != yy) {
747     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
748     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
749   } else {
750     z = y;
751   }
752 
753   v     = a->a;
754   xb = x;
755 
756   for (i=0; i<mbs; i++) {
757     n  = ai[1] - ai[0]; /* length of i_th block row of A */
758     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
759     ib = aj + *ai;
760     /* (diag of A)*x */
761     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
762     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
763     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
764     v += 9;
765     for (j=1; j<n; j++) {
766       /* (strict lower triangular part of A)*x  */
767       cval       = ib[j]*3;
768       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
769       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
770       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
771       /* (strict upper triangular part of A)*x  */
772       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
773       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
774       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
775       v  += 9;
776     }
777     xb +=3; ai++;
778   }
779 
780   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
781   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
782   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
783 
784   PLogFlops(18*(a->s_nz*2 - a->m));
785   PetscFunctionReturn(0);
786 }
787 
788 #undef __FUNC__
789 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
790 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
791 {
792   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
793   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
794   MatScalar       *v;
795   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
796 
797   PetscFunctionBegin;
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   if (yy != xx) {
800     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
801   } else {
802     y = x;
803   }
804   if (zz != yy) {
805     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
806     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
807   } else {
808     z = y;
809   }
810 
811   v     = a->a;
812   xb = x;
813 
814   for (i=0; i<mbs; i++) {
815     n  = ai[1] - ai[0]; /* length of i_th block row of A */
816     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
817     ib = aj + *ai;
818     /* (diag of A)*x */
819     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
820     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
821     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
822     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
823     v += 16;
824     for (j=1; j<n; j++) {
825       /* (strict lower triangular part of A)*x  */
826       cval       = ib[j]*4;
827       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
828       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
829       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
830       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
831       /* (strict upper triangular part of A)*x  */
832       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
833       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
834       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
835       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
836       v  += 16;
837     }
838     xb +=4; ai++;
839   }
840 
841   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
842   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
843   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
844 
845   PLogFlops(32*(a->s_nz*2 - a->m));
846   PetscFunctionReturn(0);
847 }
848 
849 #undef __FUNC__
850 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
851 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
852 {
853   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
854   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
855   MatScalar       *v;
856   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
857 
858   PetscFunctionBegin;
859   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
860   if (yy != xx) {
861     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
862   } else {
863     y = x;
864   }
865   if (zz != yy) {
866     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
867     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
868   } else {
869     z = y;
870   }
871 
872   v     = a->a;
873   xb = x;
874 
875   for (i=0; i<mbs; i++) {
876     n  = ai[1] - ai[0]; /* length of i_th block row of A */
877     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
878     ib = aj + *ai;
879     /* (diag of A)*x */
880     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
881     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
882     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
883     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
884     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
885     v += 25;
886     for (j=1; j<n; j++) {
887       /* (strict lower triangular part of A)*x  */
888       cval       = ib[j]*5;
889       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
890       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
891       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
892       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
893       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
894       /* (strict upper triangular part of A)*x  */
895       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
896       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
897       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
898       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
899       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
900       v  += 25;
901     }
902     xb +=5; ai++;
903   }
904 
905   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
906   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
907   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
908 
909   PLogFlops(50*(a->s_nz*2 - a->m));
910   PetscFunctionReturn(0);
911 }
912 #undef __FUNC__
913 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
914 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
915 {
916   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
917   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
918   MatScalar       *v;
919   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
920 
921   PetscFunctionBegin;
922   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
923   if (yy != xx) {
924     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
925   } else {
926     y = x;
927   }
928   if (zz != yy) {
929     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
930     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
931   } else {
932     z = y;
933   }
934 
935   v     = a->a;
936   xb = x;
937 
938   for (i=0; i<mbs; i++) {
939     n  = ai[1] - ai[0]; /* length of i_th block row of A */
940     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
941     ib = aj + *ai;
942     /* (diag of A)*x */
943     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
944     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
945     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
946     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
947     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
948     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
949     v += 36;
950     for (j=1; j<n; j++) {
951       /* (strict lower triangular part of A)*x  */
952       cval       = ib[j]*6;
953       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
954       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
955       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
956       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
957       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
958       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
959       /* (strict upper triangular part of A)*x  */
960       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
961       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
962       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
963       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
964       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
965       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
966       v  += 36;
967     }
968     xb +=6; ai++;
969   }
970 
971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
972   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
973   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
974 
975   PLogFlops(72*(a->s_nz*2 - a->m));
976   PetscFunctionReturn(0);
977 }
978 
979 #undef __FUNC__
980 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
981 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
982 {
983   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
984   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
985   MatScalar       *v;
986   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
987 
988   PetscFunctionBegin;
989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
990   if (yy != xx) {
991     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
992   } else {
993     y = x;
994   }
995   if (zz != yy) {
996     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
997     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
998   } else {
999     z = y;
1000   }
1001 
1002   v     = a->a;
1003   xb = x;
1004 
1005   for (i=0; i<mbs; i++) {
1006     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1007     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1008     ib = aj + *ai;
1009     /* (diag of A)*x */
1010     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1011     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1012     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1013     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1014     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1015     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1016     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1017     v += 49;
1018     for (j=1; j<n; j++) {
1019       /* (strict lower triangular part of A)*x  */
1020       cval       = ib[j]*7;
1021       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1022       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1023       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1024       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1025       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1026       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1027       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1028       /* (strict upper triangular part of A)*x  */
1029       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1030       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1031       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1032       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1033       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1034       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1035       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1036       v  += 49;
1037     }
1038     xb +=7; ai++;
1039   }
1040 
1041   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1042   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1043   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1044 
1045   PLogFlops(98*(a->s_nz*2 - a->m));
1046   PetscFunctionReturn(0);
1047 }
1048 
1049 #undef __FUNC__
1050 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1051 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1052 {
1053   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1054   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1055   MatScalar       *v;
1056   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1057   int             ncols,k;
1058 
1059   PetscFunctionBegin;
1060   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1061   if (yy != xx) {
1062     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1063   } else {
1064     y = x;
1065   }
1066   if (zz != yy) {
1067     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
1068     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1069   } else {
1070     z = y;
1071   }
1072 
1073   aj   = a->j;
1074   v    = a->a;
1075   ii   = a->i;
1076 
1077   if (!a->mult_work) {
1078     k = a->m;
1079     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1080   }
1081   work = a->mult_work;
1082 
1083 
1084   for (i=0; i<mbs; i++) {
1085     n     = ii[1] - ii[0]; ncols = n*bs;
1086     workt = work; idx=aj+ii[0];
1087 
1088     /* upper triangular part */
1089     for (j=0; j<n; j++) {
1090       /* col = bs*(*idx); */
1091       xb = x_ptr + bs*(*idx++);
1092       for (k=0; k<bs; k++) workt[k] = xb[k];
1093       workt += bs;
1094     }
1095     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1096     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1097 
1098     /* strict lower triangular part */
1099     ncols -= bs;
1100     if (ncols > 0){
1101       workt = work;
1102       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1103       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
1104 
1105       idx=aj+ii[0]+1;
1106       for (j=1; j<n; j++) {
1107         zb = z_ptr + bs*(*idx);
1108         idx++;
1109         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1110         workt += bs;
1111       }
1112     }
1113 
1114     x += bs; v += n*bs2; z += bs; ii++;
1115   }
1116 
1117   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1118   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1119   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1120 
1121   PLogFlops(2*(a->s_nz*2 - a->m));
1122   PetscFunctionReturn(0);
1123 }
1124 
1125 #undef __FUNC__
1126 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1127 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1128 {
1129   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1130   Scalar          *xg,*zg,*zb,zero = 0.0;
1131   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1132   MatScalar       *v;
1133   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
1134   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1135 
1136   PetscFunctionBegin;
1137   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
1138   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1139   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1140 
1141   idx   = a->j;
1142   v     = a->a;
1143   ii    = a->i;
1144   xb    = x;
1145   switch (bs) {
1146   case 1:
1147     for (i=0; i<mbs; i++) {
1148       n  = ii[1] - ii[0]; ii++;
1149       x1 = xb[0];
1150       ib = idx + ai[i];
1151       for (j=0; j<n; j++) {
1152         rval    = ib[j];
1153         z[rval] += *v * x1;
1154         v++;
1155       }
1156       xb++;
1157     }
1158     break;
1159   case 2:
1160     for (i=0; i<mbs; i++) {
1161       n  = ii[1] - ii[0]; ii++;
1162       x1 = xb[0]; x2 = xb[1];
1163       ib = idx + ai[i];
1164       for (j=0; j<n; j++) {
1165         rval      = ib[j]*2;
1166         z[rval++] += v[0]*x1 + v[1]*x2;
1167         z[rval]   += v[2]*x1 + v[3]*x2;
1168         v  += 4;
1169       }
1170       xb += 2;
1171     }
1172     break;
1173   case 3:
1174     for (i=0; i<mbs; i++) {
1175       n  = ii[1] - ii[0]; ii++;
1176       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1177       ib = idx + ai[i];
1178       for (j=0; j<n; j++) {
1179         rval      = ib[j]*3;
1180         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1181         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1182         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
1183         v  += 9;
1184       }
1185       xb += 3;
1186     }
1187     break;
1188   case 4:
1189     for (i=0; i<mbs; i++) {
1190       n  = ii[1] - ii[0]; ii++;
1191       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1192       ib = idx + ai[i];
1193       for (j=0; j<n; j++) {
1194         rval      = ib[j]*4;
1195         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1196         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1197         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1198         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1199         v  += 16;
1200       }
1201       xb += 4;
1202     }
1203     break;
1204   case 5:
1205     for (i=0; i<mbs; i++) {
1206       n  = ii[1] - ii[0]; ii++;
1207       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1208       x4 = xb[3]; x5 = xb[4];
1209       ib = idx + ai[i];
1210       for (j=0; j<n; j++) {
1211         rval      = ib[j]*5;
1212         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1213         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1214         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1215         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1216         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1217         v  += 25;
1218       }
1219       xb += 5;
1220     }
1221     break;
1222   case 6:
1223     for (i=0; i<mbs; i++) {
1224       n  = ii[1] - ii[0]; ii++;
1225       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1226       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1227       ib = idx + ai[i];
1228       for (j=0; j<n; j++) {
1229         rval      = ib[j]*6;
1230         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
1231         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1232         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1233         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1234         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1235         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1236         v  += 36;
1237       }
1238       xb += 6;
1239     }
1240     break;
1241   case 7:
1242     for (i=0; i<mbs; i++) {
1243       n  = ii[1] - ii[0]; ii++;
1244       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1245       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1246       ib = idx + ai[i];
1247       for (j=0; j<n; j++) {
1248         rval      = ib[j]*7;
1249         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1250         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1251         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1252         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1253         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1254         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1255         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1256         v  += 49;
1257       }
1258       xb += 7;
1259     }
1260     break;
1261   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1262       int       ncols,k;
1263       Scalar    *work,*workt;
1264 
1265       if (!a->mult_work) {
1266         k = PetscMax(a->m,a->n);
1267         a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1268       }
1269       work = a->mult_work;
1270       for (i=0; i<mbs; i++) {
1271         n     = ii[1] - ii[0]; ii++;
1272         ncols = n*bs;
1273         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1274         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1275         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1276         v += n*bs2;
1277         x += bs;
1278         workt = work;
1279         for (j=0; j<n; j++) {
1280           zb = z + bs*(*idx++);
1281           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1282           workt += bs;
1283         }
1284       }
1285     }
1286   }
1287   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1288   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1289   PLogFlops(2*a->nz*a->bs2 - a->n);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 #undef __FUNC__
1294 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1295 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1296 
1297 {
1298   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1299   Scalar          *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1300   MatScalar       *v;
1301   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1302 
1303   PetscFunctionBegin;
1304   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1305   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1306 
1307   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1308 
1309   idx   = a->j;
1310   v     = a->a;
1311   ii    = a->i;
1312   xb    = x;
1313 
1314   switch (bs) {
1315   case 1:
1316     for (i=0; i<mbs; i++) {
1317       n  = ii[1] - ii[0]; ii++;
1318       x1 = xb[0];
1319       ib = idx + ai[i];
1320       for (j=0; j<n; j++) {
1321         rval    = ib[j];
1322         z[rval] += *v * x1;
1323         v++;
1324       }
1325       xb++;
1326     }
1327     break;
1328   case 2:
1329     for (i=0; i<mbs; i++) {
1330       n  = ii[1] - ii[0]; ii++;
1331       x1 = xb[0]; x2 = xb[1];
1332       ib = idx + ai[i];
1333       for (j=0; j<n; j++) {
1334         rval      = ib[j]*2;
1335         z[rval++] += v[0]*x1 + v[1]*x2;
1336         z[rval++] += v[2]*x1 + v[3]*x2;
1337         v  += 4;
1338       }
1339       xb += 2;
1340     }
1341     break;
1342   case 3:
1343     for (i=0; i<mbs; i++) {
1344       n  = ii[1] - ii[0]; ii++;
1345       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1346       ib = idx + ai[i];
1347       for (j=0; j<n; j++) {
1348         rval      = ib[j]*3;
1349         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1350         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1351         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1352         v  += 9;
1353       }
1354       xb += 3;
1355     }
1356     break;
1357   case 4:
1358     for (i=0; i<mbs; i++) {
1359       n  = ii[1] - ii[0]; ii++;
1360       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1361       ib = idx + ai[i];
1362       for (j=0; j<n; j++) {
1363         rval      = ib[j]*4;
1364         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1365         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1366         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1367         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1368         v  += 16;
1369       }
1370       xb += 4;
1371     }
1372     break;
1373   case 5:
1374     for (i=0; i<mbs; i++) {
1375       n  = ii[1] - ii[0]; ii++;
1376       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1377       x4 = xb[3]; x5 = xb[4];
1378       ib = idx + ai[i];
1379       for (j=0; j<n; j++) {
1380         rval      = ib[j]*5;
1381         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1382         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1383         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1384         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1385         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1386         v  += 25;
1387       }
1388       xb += 5;
1389     }
1390     break;
1391   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1392       int       ncols,k;
1393       Scalar    *work,*workt;
1394 
1395       if (!a->mult_work) {
1396         k = PetscMax(a->m,a->n);
1397         a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1398       }
1399       work = a->mult_work;
1400       for (i=0; i<mbs; i++) {
1401         n     = ii[1] - ii[0]; ii++;
1402         ncols = n*bs;
1403         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1404         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1405         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1406         v += n*bs2;
1407         x += bs;
1408         workt = work;
1409         for (j=0; j<n; j++) {
1410           zb = z + bs*(*idx++);
1411           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1412           workt += bs;
1413         }
1414       }
1415     }
1416   }
1417   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1418   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1419   PLogFlops(2*a->nz*a->bs2);
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 #undef __FUNC__
1424 #define __FUNC__ "MatScale_SeqSBAIJ"
1425 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1426 {
1427   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1428   int         one = 1,totalnz = a->bs2*a->s_nz;
1429 
1430   PetscFunctionBegin;
1431   BLscal_(&totalnz,alpha,a->a,&one);
1432   PLogFlops(totalnz);
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 #undef __FUNC__
1437 #define __FUNC__ "MatNorm_SeqSBAIJ"
1438 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1439 {
1440   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1441   MatScalar   *v = a->a;
1442   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1443   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs;
1444   int         *jl,*il,jmin,jmax,ierr,nexti,ik;
1445 
1446   PetscFunctionBegin;
1447   if (type == NORM_FROBENIUS) {
1448     for (k=0; k<mbs; k++){
1449       jmin = a->i[k]; jmax = a->i[k+1];
1450       /* diagonal block */
1451       for (i=0; i<bs2; i++){
1452 #if defined(PETSC_USE_COMPLEX)
1453         sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1454 #else
1455         sum_diag += (*v)*(*v); v++;
1456 #endif
1457       }
1458       for (j=jmin+1; j<jmax; j++){  /* off-diagonal blocks */
1459         for (i=0; i<bs2; i++){
1460 #if defined(PETSC_USE_COMPLEX)
1461           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1462 #else
1463           sum_off += (*v)*(*v); v++;
1464 #endif
1465         }
1466       }
1467     }
1468     *norm = sqrt(sum_diag + 2*sum_off);
1469 
1470   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1471     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1472     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1473     sum = (PetscReal*)PetscMalloc(bs*sizeof(PetscReal));CHKPTRQ(sum);
1474     for (i=0; i<mbs; i++) {
1475       jl[i] = mbs; il[0] = 0;
1476     }
1477 
1478     *norm = 0.0;
1479     for (k=0; k<mbs; k++) { /* k_th block row */
1480       for (j=0; j<bs; j++) sum[j]=0.0;
1481 
1482       /*-- col sum --*/
1483       i = jl[k]; /* first |A(i,k)| to be added */
1484       while (i<mbs){
1485         nexti = jl[i];  /* next block row to be added */
1486         ik    = il[i];  /* block index of A(i,k) in the array a */
1487         for (j=0; j<bs; j++){
1488           v = a->a + ik*bs2 + j*bs;
1489           for (k1=0; k1<bs; k1++) {
1490             sum[j] += PetscAbsScalar(*v); v++;
1491           }
1492         }
1493         /* update il, jl */
1494         jmin = ik + 1; jmax = a->i[i+1];
1495         if (jmin < jmax){
1496           il[i] = jmin;
1497           j   = a->j[jmin];
1498           jl[i] = jl[j]; jl[j]=i;
1499         }
1500         i = nexti;
1501       }
1502 
1503       /*-- row sum --*/
1504       jmin = a->i[k]; jmax = a->i[k+1];
1505       for (i=jmin; i<jmax; i++) {
1506         for (j=0; j<bs; j++){
1507           v = a->a + i*bs2 + j;
1508           for (k1=0; k1<bs; k1++){
1509             sum[j] += PetscAbsScalar(*v);
1510             v   += bs;
1511           }
1512         }
1513       }
1514       /* add k_th block row to il, jl */
1515       jmin++;
1516       if (jmin < jmax){
1517         il[k] = jmin;
1518         j   = a->j[jmin];
1519         jl[k] = jl[j]; jl[j] = k;
1520       }
1521       for (j=0; j<bs; j++){
1522         if (sum[j] > *norm) *norm = sum[j];
1523       }
1524     }
1525     ierr = PetscFree(il);CHKERRQ(ierr);
1526     ierr = PetscFree(jl);CHKERRQ(ierr);
1527     ierr = PetscFree(sum);CHKERRQ(ierr);
1528   } else {
1529     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1530   }
1531   PetscFunctionReturn(0);
1532 }
1533 
1534 #ifdef MatNorm_SeqBAIJ
1535 /* This is modified MatNorm_SeqBAIJ.
1536    MatNorm_SeqBAIJ in baij/seq/baij2.c is not correct for bs>1 */
1537 
1538 #undef __FUNC__
1539 #define __FUNC__ "MatNorm_SeqBAIJ"
1540 int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1541 {
1542   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1543   MatScalar   *v = a->a;
1544   PetscReal   sum = 0.0;
1545   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1546 
1547   PetscFunctionBegin;
1548   if (type == NORM_FROBENIUS) {
1549     for (i=0; i< bs2*nz; i++) {
1550 #if defined(PETSC_USE_COMPLEX)
1551       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1552 #else
1553       sum += (*v)*(*v); v++;
1554 #endif
1555     }
1556     *norm = sqrt(sum);
1557   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1558     *norm = 0.0;
1559     for (j=0; j<a->mbs; j++) {
1560       for (k=0; k<bs; k++) {
1561         v = a->a + bs2*a->i[j] + k;
1562         sum = 0.0;
1563         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1564           for (k1=0; k1<bs; k1++){   /* this loop was missing in the original code*/
1565             sum += PetscAbsScalar(*v);
1566             v   += bs;
1567           }
1568         }
1569         if (sum > *norm) *norm = sum;
1570       }
1571     }
1572   } else {
1573     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1574   }
1575   PetscFunctionReturn(0);
1576 }
1577 #endif
1578 
1579 #undef __FUNC__
1580 #define __FUNC__ "MatEqual_SeqSBAIJ"
1581 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1582 {
1583   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1584   int         ierr;
1585 
1586   PetscFunctionBegin;
1587   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1588 
1589   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1590   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1591     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1592   }
1593 
1594   /* if the a->i are the same */
1595   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1596   if (*flg == PETSC_FALSE) {
1597     PetscFunctionReturn(0);
1598   }
1599 
1600   /* if a->j are the same */
1601   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1602   if (*flg == PETSC_FALSE) {
1603     PetscFunctionReturn(0);
1604   }
1605   /* if a->a are the same */
1606   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1607   PetscFunctionReturn(0);
1608 
1609 }
1610 
1611 #undef __FUNC__
1612 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1613 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1614 {
1615   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1616   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1617   Scalar      *x,zero = 0.0;
1618   MatScalar   *aa,*aa_j;
1619 
1620   PetscFunctionBegin;
1621   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1622   bs   = a->bs;
1623   aa   = a->a;
1624   ai   = a->i;
1625   aj   = a->j;
1626   ambs = a->mbs;
1627   bs2  = a->bs2;
1628 
1629   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1630   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1631   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1632   /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");*/
1633   for (i=0; i<ambs; i++) {
1634     j=ai[i];
1635     if (aj[j] == i) {             /* if this is a diagonal element */
1636       row  = i*bs;
1637       aa_j = aa + j*bs2;
1638       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1639     }
1640   }
1641   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1642   PetscFunctionReturn(0);
1643 }
1644 
1645 #undef __FUNC__
1646 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1647 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1648 {
1649   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1650   Scalar      *l,*r,x,*li,*ri;
1651   MatScalar   *aa,*v;
1652   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1653 
1654   PetscFunctionBegin;
1655   ai  = a->i;
1656   aj  = a->j;
1657   aa  = a->a;
1658   m   = a->m;
1659   bs  = a->bs;
1660   mbs = a->mbs;
1661   bs2 = a->bs2;
1662 
1663   if (ll != rr) {
1664     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"For symmetric format, left and right scaling vectors must be same\n");
1665   }
1666   if (ll) {
1667     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1668     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1669     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1670     for (i=0; i<mbs; i++) { /* for each block row */
1671       M  = ai[i+1] - ai[i];
1672       li = l + i*bs;
1673       v  = aa + bs2*ai[i];
1674       for (j=0; j<M; j++) { /* for each block */
1675         for (k=0; k<bs2; k++) {
1676           (*v++) *= li[k%bs];
1677         }
1678 #ifdef CONT
1679         /* will be used to replace the above loop */
1680         ri = l + bs*aj[ai[i]+j];
1681         for (k=0; k<bs; k++) { /* column value */
1682           x = ri[k];
1683           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1684         }
1685 #endif
1686 
1687       }
1688     }
1689     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1690     PLogFlops(2*a->s_nz);
1691   }
1692   /* will be deleted */
1693   if (rr) {
1694     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1695     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1696     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1697     for (i=0; i<mbs; i++) { /* for each block row */
1698       M  = ai[i+1] - ai[i];
1699       v  = aa + bs2*ai[i];
1700       for (j=0; j<M; j++) { /* for each block */
1701         ri = r + bs*aj[ai[i]+j];
1702         for (k=0; k<bs; k++) {
1703           x = ri[k];
1704           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1705         }
1706       }
1707     }
1708     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1709     PLogFlops(a->s_nz);
1710   }
1711   PetscFunctionReturn(0);
1712 }
1713 
1714 #undef __FUNC__
1715 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1716 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1717 {
1718   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1719 
1720   PetscFunctionBegin;
1721   info->rows_global    = (double)a->m;
1722   info->columns_global = (double)a->m;
1723   info->rows_local     = (double)a->m;
1724   info->columns_local  = (double)a->m;
1725   info->block_size     = a->bs2;
1726   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1727   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1728   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1729   info->assemblies   = A->num_ass;
1730   info->mallocs      = a->reallocs;
1731   info->memory       = A->mem;
1732   if (A->factor) {
1733     info->fill_ratio_given  = A->info.fill_ratio_given;
1734     info->fill_ratio_needed = A->info.fill_ratio_needed;
1735     info->factor_mallocs    = A->info.factor_mallocs;
1736   } else {
1737     info->fill_ratio_given  = 0;
1738     info->fill_ratio_needed = 0;
1739     info->factor_mallocs    = 0;
1740   }
1741   PetscFunctionReturn(0);
1742 }
1743 
1744 
1745 #undef __FUNC__
1746 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1747 int MatZeroEntries_SeqSBAIJ(Mat A)
1748 {
1749   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1750   int         ierr;
1751 
1752   PetscFunctionBegin;
1753   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1754   PetscFunctionReturn(0);
1755 }
1756