xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 7fbae186bacbe2f615b0bc86cd5234b9970bdcb7)
1 /*$Id: sbaij2.c,v 1.13 2000/09/28 20:51:54 bsmith Exp hzhang $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   PetscFunctionBegin;
16   SETERRQ(1,"Function not yet written for SBAIJ format");
17   /* PetscFunctionReturn(0); */
18 }
19 
20 #undef __FUNC__
21 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
22 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
23 {
24   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
25   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
26   int          row,mat_i,*mat_j,tcol,*mat_ilen;
27   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
28   int          *aj = a->j,*ai = a->i;
29   MatScalar    *mat_a;
30   Mat          C;
31   PetscTruth   flag;
32 
33   PetscFunctionBegin;
34 
35   if (isrow != iscol) SETERRA(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
36   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
37   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
38 
39   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
40   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
41 
42   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
43   ssmap = smap;
44   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
45   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
46   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
47   /* determine lens of each row */
48   for (i=0; i<nrows; i++) {
49     kstart  = ai[irow[i]];
50     kend    = kstart + a->ilen[irow[i]];
51     lens[i] = 0;
52       for (k=kstart; k<kend; k++) {
53         if (ssmap[aj[k]]) {
54           lens[i]++;
55         }
56       }
57     }
58   /* Create and fill new matrix */
59   if (scall == MAT_REUSE_MATRIX) {
60     c = (Mat_SeqSBAIJ *)((*B)->data);
61 
62     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
63     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
64     if (flag == PETSC_FALSE) {
65       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
66     }
67     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
68     C = *B;
69   } else {
70     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
71   }
72   c = (Mat_SeqSBAIJ *)(C->data);
73   for (i=0; i<nrows; i++) {
74     row    = irow[i];
75     kstart = ai[row];
76     kend   = kstart + a->ilen[row];
77     mat_i  = c->i[i];
78     mat_j  = c->j + mat_i;
79     mat_a  = c->a + mat_i*bs2;
80     mat_ilen = c->ilen + i;
81     for (k=kstart; k<kend; k++) {
82       if ((tcol=ssmap[a->j[k]])) {
83         *mat_j++ = tcol - 1;
84         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
85         mat_a   += bs2;
86         (*mat_ilen)++;
87       }
88     }
89   }
90 
91   /* Free work space */
92   ierr = PetscFree(smap);CHKERRQ(ierr);
93   ierr = PetscFree(lens);CHKERRQ(ierr);
94   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
96 
97   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
98   *B = C;
99   PetscFunctionReturn(0);
100 }
101 
102 #undef __FUNC__
103 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
105 {
106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
107   IS          is1;
108   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRA(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   /* Verify if the indices corespond to each element in a block
117    and form the IS with compressed IS */
118   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
119   iary = vary + a->mbs;
120   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
122 
123   count = 0;
124   for (i=0; i<a->mbs; i++) {
125     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,"Index set does not match blocks");
126     if (vary[i]==bs) iary[count++] = i;
127   }
128   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
129 
130   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = PetscFree(vary);CHKERRQ(ierr);
132 
133   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
134   ISDestroy(is1);
135   PetscFunctionReturn(0);
136 }
137 
138 #undef __FUNC__
139 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
141 {
142   int ierr,i;
143 
144   PetscFunctionBegin;
145   if (scall == MAT_INITIAL_MATRIX) {
146     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
147   }
148 
149   for (i=0; i<n; i++) {
150     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 #include "petscblaslapack.h"
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatMult_SeqSBAIJ_1"
162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
163 {
164   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
165   Scalar          *x,*z,*xb,x1,zero=0.0;
166   MatScalar       *v;
167   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
168 
169   PetscFunctionBegin;
170   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
173 
174   v  = a->a;
175   xb = x;
176 
177   for (i=0; i<mbs; i++) {
178     n  = ai[1] - ai[0];  /* length of i_th row of A */
179     x1 = xb[0];
180     ib = aj + *ai;
181     if (*ib == i) z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
182     for (j=1; j<n; j++) {
183       cval    = *ib;
184       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
185       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
186     }
187     xb++; ai++;
188   }
189 
190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
191   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
192   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
193   PetscFunctionReturn(0);
194 }
195 
196 #undef __FUNC__
197 #define __FUNC__ "MatMult_SeqSBAIJ_2"
198 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
199 {
200   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
201   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
202   MatScalar       *v;
203   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
204 
205 
206   PetscFunctionBegin;
207   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
208   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
209   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
210 
211   v     = a->a;
212   xb = x;
213 
214   for (i=0; i<mbs; i++) {
215     n  = ai[1] - ai[0]; /* length of i_th block row of A */
216     x1 = xb[0]; x2 = xb[1];
217     ib = aj + *ai;
218     if (*ib == i){     /* (diag of A)*x */
219       z[2*i]   += v[0]*x1 + v[2]*x2;
220       z[2*i+1] += v[2]*x1 + v[3]*x2;
221       v += 4;
222     }
223     for (j=1; j<n; j++) {
224       /* (strict lower triangular part of A)*x  */
225       cval       = ib[j]*2;
226       z[cval]     += v[0]*x1 + v[1]*x2;
227       z[cval+1]   += v[2]*x1 + v[3]*x2;
228       /* (strict upper triangular part of A)*x  */
229       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
230       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
231       v  += 4;
232     }
233     xb +=2; ai++;
234   }
235 
236   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
237   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
238   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNC__
243 #define __FUNC__ "MatMult_SeqSBAIJ_3"
244 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
245 {
246   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
247   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
248   MatScalar       *v;
249   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
250 
251 
252   PetscFunctionBegin;
253   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
256 
257   v     = a->a;
258   xb = x;
259 
260   for (i=0; i<mbs; i++) {
261     n  = ai[1] - ai[0]; /* length of i_th block row of A */
262     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
263     ib = aj + *ai;
264     if (*ib == i){     /* (diag of A)*x */
265       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
266       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
267       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
268       v += 9;
269     }
270     for (j=1; j<n; j++) {
271       /* (strict lower triangular part of A)*x  */
272       cval       = ib[j]*3;
273       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
274       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
275       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
276       /* (strict upper triangular part of A)*x  */
277       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
278       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
279       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
280       v  += 9;
281     }
282     xb +=3; ai++;
283   }
284 
285   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
286   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
287   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNC__
292 #define __FUNC__ "MatMult_SeqSBAIJ_4"
293 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
296   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
297   MatScalar       *v;
298   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
299 
300   PetscFunctionBegin;
301   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
302   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v     = a->a;
306   xb = x;
307 
308   for (i=0; i<mbs; i++) {
309     n  = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
311     ib = aj + *ai;
312     if (*ib == i){     /* (diag of A)*x */
313       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
314       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
315       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
316       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
317       v += 16;
318     }
319     for (j=1; j<n; j++) {
320       /* (strict lower triangular part of A)*x  */
321       cval       = ib[j]*4;
322       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
323       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
324       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
325       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
326       /* (strict upper triangular part of A)*x  */
327       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
328       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
329       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
330       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
331       v  += 16;
332     }
333     xb +=4; ai++;
334   }
335 
336   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
338   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
339   PetscFunctionReturn(0);
340 }
341 
342 #undef __FUNC__
343 #define __FUNC__ "MatMult_SeqSBAIJ_5"
344 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
345 {
346   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
347   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
348   MatScalar       *v;
349   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
350 
351   PetscFunctionBegin;
352   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
354   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
355 
356   v     = a->a;
357   xb = x;
358 
359   for (i=0; i<mbs; i++) {
360     n  = ai[1] - ai[0]; /* length of i_th block row of A */
361     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
362     ib = aj + *ai;
363     if (*ib == i){      /* (diag of A)*x */
364       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
365       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
366       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
367       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
368       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
369       v += 25;
370     }
371     for (j=1; j<n; j++) {
372       /* (strict lower triangular part of A)*x  */
373       cval       = ib[j]*5;
374       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
375       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
376       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
377       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
378       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
379       /* (strict upper triangular part of A)*x  */
380       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
381       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
382       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
383       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
384       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
385       v  += 25;
386     }
387     xb +=5; ai++;
388   }
389 
390   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
391   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
392   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
393   PetscFunctionReturn(0);
394 }
395 
396 
397 #undef __FUNC__
398 #define __FUNC__ "MatMult_SeqSBAIJ_6"
399 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
400 {
401   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
402   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
403   MatScalar       *v;
404   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
405 
406   PetscFunctionBegin;
407   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
408   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
409   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
410 
411   v     = a->a;
412   xb = x;
413 
414   for (i=0; i<mbs; i++) {
415     n  = ai[1] - ai[0]; /* length of i_th block row of A */
416     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
417     ib = aj + *ai;
418     if (*ib == i){      /* (diag of A)*x */
419       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
420       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
421       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
422       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
423       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
424       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
425       v += 36;
426     }
427     for (j=1; j<n; j++) {
428       /* (strict lower triangular part of A)*x  */
429       cval       = ib[j]*6;
430       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
431       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
432       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
433       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
434       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
435       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
436       /* (strict upper triangular part of A)*x  */
437       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
438       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
439       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
440       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
441       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
442       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
443       v  += 36;
444     }
445     xb +=6; ai++;
446   }
447 
448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
449   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
450   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
451   PetscFunctionReturn(0);
452 }
453 #undef __FUNC__
454 #define __FUNC__ "MatMult_SeqSBAIJ_7"
455 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
456 {
457   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
458   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
459   MatScalar       *v;
460   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
461 
462   PetscFunctionBegin;
463   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
465   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
466 
467   v     = a->a;
468   xb = x;
469 
470   for (i=0; i<mbs; i++) {
471     n  = ai[1] - ai[0]; /* length of i_th block row of A */
472     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
473     ib = aj + *ai;
474     if (*ib == i){      /* (diag of A)*x */
475       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
476       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
477       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
478       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
479       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
480       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
481       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
482       v += 49;
483     }
484     for (j=1; j<n; j++) {
485       /* (strict lower triangular part of A)*x  */
486       cval       = ib[j]*7;
487       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
488       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
489       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
490       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
491       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
492       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
493       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
494       /* (strict upper triangular part of A)*x  */
495       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
496       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
497       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
498       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
499       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
500       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
501       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
502       v  += 49;
503     }
504     xb +=7; ai++;
505   }
506   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
508   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
509   PetscFunctionReturn(0);
510 }
511 
512 /*
513     This will not work with MatScalar == float because it calls the BLAS
514 */
515 #undef __FUNC__
516 #define __FUNC__ "MatMult_SeqSBAIJ_N"
517 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
518 {
519   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
520   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
521   MatScalar       *v;
522   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
523   int             ncols,k;
524 
525   PetscFunctionBegin;
526   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
527   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
528   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
529 
530   aj   = a->j;
531   v    = a->a;
532   ii   = a->i;
533 
534   if (!a->mult_work) {
535     k = a->m;
536     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
537   }
538   work = a->mult_work;
539 
540   for (i=0; i<mbs; i++) {
541     n     = ii[1] - ii[0]; ncols = n*bs;
542     workt = work; idx=aj+ii[0];
543 
544     /* upper triangular part */
545     for (j=0; j<n; j++) {
546       /* col = bs*(*idx); */
547       xb = x_ptr + bs*(*idx++);
548       for (k=0; k<bs; k++) workt[k] = xb[k];
549       workt += bs;
550     }
551     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
552     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
553 
554     /* strict lower triangular part */
555     ncols -= bs;
556     if (ncols > 0){
557       workt = work;
558       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
559       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
560 
561       idx=aj+ii[0]+1;
562       for (j=1; j<n; j++) {
563         zb = z_ptr + bs*(*idx);
564         idx++;
565         for (k=0; k<bs; k++) zb[k] += workt[k] ;
566         workt += bs;
567       }
568     }
569 
570     x += bs; v += n*bs2; z += bs; ii++;
571   }
572 
573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
575   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
576   PetscFunctionReturn(0);
577 }
578 
579 #undef __FUNC__
580 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
581 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
582 {
583   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
584   Scalar          *x,*y,*z,*xb,x1;
585   MatScalar       *v;
586   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
587 
588   PetscFunctionBegin;
589   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
590   if (yy != xx) {
591     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
592   } else {
593     y = x;
594   }
595   if (zz != yy) {
596     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
597     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
598   } else {
599     z = y;
600   }
601 
602   v  = a->a;
603   xb = x;
604 
605   for (i=0; i<mbs; i++) {
606     n  = ai[1] - ai[0];  /* length of i_th row of A */
607     x1 = xb[0];
608     ib = aj + *ai;
609     if (*ib == i) z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
610     for (j=1; j<n; j++) {
611       cval    = *ib;
612       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
613       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
614     }
615     xb++; ai++;
616   }
617 
618   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
619   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
620   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
621 
622   PLogFlops(2*(a->s_nz*2 - a->m));
623   PetscFunctionReturn(0);
624 }
625 
626 #undef __FUNC__
627 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
628 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
629 {
630   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
631   Scalar          *x,*y,*z,*xb,x1,x2;
632   MatScalar       *v;
633   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
634 
635   PetscFunctionBegin;
636   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
637   if (yy != xx) {
638     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
639   } else {
640     y = x;
641   }
642   if (zz != yy) {
643     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
644     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
645   } else {
646     z = y;
647   }
648 
649   v     = a->a;
650   xb = x;
651 
652   for (i=0; i<mbs; i++) {
653     n  = ai[1] - ai[0]; /* length of i_th block row of A */
654     x1 = xb[0]; x2 = xb[1];
655     ib = aj + *ai;
656     if (*ib == i){      /* (diag of A)*x */
657       z[2*i]   += v[0]*x1 + v[2]*x2;
658       z[2*i+1] += v[2]*x1 + v[3]*x2;
659       v += 4;
660     }
661     for (j=1; j<n; j++) {
662       /* (strict lower triangular part of A)*x  */
663       cval       = ib[j]*2;
664       z[cval]     += v[0]*x1 + v[1]*x2;
665       z[cval+1]   += v[2]*x1 + v[3]*x2;
666       /* (strict upper triangular part of A)*x  */
667       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
668       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
669       v  += 4;
670     }
671     xb +=2; ai++;
672   }
673 
674   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
675   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
676   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
677 
678   PLogFlops(4*(a->s_nz*2 - a->m));
679   PetscFunctionReturn(0);
680 }
681 
682 #undef __FUNC__
683 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
684 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
685 {
686   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
687   Scalar          *x,*y,*z,*xb,x1,x2,x3;
688   MatScalar       *v;
689   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
690 
691   PetscFunctionBegin;
692   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
693   if (yy != xx) {
694     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
695   } else {
696     y = x;
697   }
698   if (zz != yy) {
699     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
700     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
701   } else {
702     z = y;
703   }
704 
705   v     = a->a;
706   xb = x;
707 
708   for (i=0; i<mbs; i++) {
709     n  = ai[1] - ai[0]; /* length of i_th block row of A */
710     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
711     ib = aj + *ai;
712     if (*ib == i){     /* (diag of A)*x */
713      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
714      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
715      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
716      v += 9;
717     }
718     for (j=1; j<n; j++) {
719       /* (strict lower triangular part of A)*x  */
720       cval       = ib[j]*3;
721       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
722       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
723       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
724       /* (strict upper triangular part of A)*x  */
725       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
726       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
727       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
728       v  += 9;
729     }
730     xb +=3; ai++;
731   }
732 
733   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
734   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
735   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
736 
737   PLogFlops(18*(a->s_nz*2 - a->m));
738   PetscFunctionReturn(0);
739 }
740 
741 #undef __FUNC__
742 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
743 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
744 {
745   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
746   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
747   MatScalar       *v;
748   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
749 
750   PetscFunctionBegin;
751   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
752   if (yy != xx) {
753     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
754   } else {
755     y = x;
756   }
757   if (zz != yy) {
758     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
759     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
760   } else {
761     z = y;
762   }
763 
764   v     = a->a;
765   xb = x;
766 
767   for (i=0; i<mbs; i++) {
768     n  = ai[1] - ai[0]; /* length of i_th block row of A */
769     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
770     ib = aj + *ai;
771     if (*ib == i){      /* (diag of A)*x */
772       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
773       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
774       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
775       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
776       v += 16;
777     }
778     for (j=1; j<n; j++) {
779       /* (strict lower triangular part of A)*x  */
780       cval       = ib[j]*4;
781       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
782       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
783       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
784       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
785       /* (strict upper triangular part of A)*x  */
786       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
787       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
788       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
789       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
790       v  += 16;
791     }
792     xb +=4; ai++;
793   }
794 
795   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
796   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
797   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
798 
799   PLogFlops(32*(a->s_nz*2 - a->m));
800   PetscFunctionReturn(0);
801 }
802 
803 #undef __FUNC__
804 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
805 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
806 {
807   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
808   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
809   MatScalar       *v;
810   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
811 
812   PetscFunctionBegin;
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   if (yy != xx) {
815     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
816   } else {
817     y = x;
818   }
819   if (zz != yy) {
820     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
821     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
822   } else {
823     z = y;
824   }
825 
826   v     = a->a;
827   xb = x;
828 
829   for (i=0; i<mbs; i++) {
830     n  = ai[1] - ai[0]; /* length of i_th block row of A */
831     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
832     ib = aj + *ai;
833     if (*ib == i){      /* (diag of A)*x */
834       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
835       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
836       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
837       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
838       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
839       v += 25;
840     }
841     for (j=1; j<n; j++) {
842       /* (strict lower triangular part of A)*x  */
843       cval       = ib[j]*5;
844       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
845       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
846       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
847       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
848       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
849       /* (strict upper triangular part of A)*x  */
850       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
851       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
852       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
853       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
854       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
855       v  += 25;
856     }
857     xb +=5; ai++;
858   }
859 
860   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
861   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
862   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
863 
864   PLogFlops(50*(a->s_nz*2 - a->m));
865   PetscFunctionReturn(0);
866 }
867 #undef __FUNC__
868 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
869 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
870 {
871   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
872   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
873   MatScalar       *v;
874   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
875 
876   PetscFunctionBegin;
877   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
878   if (yy != xx) {
879     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
880   } else {
881     y = x;
882   }
883   if (zz != yy) {
884     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
885     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
886   } else {
887     z = y;
888   }
889 
890   v     = a->a;
891   xb = x;
892 
893   for (i=0; i<mbs; i++) {
894     n  = ai[1] - ai[0]; /* length of i_th block row of A */
895     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
896     ib = aj + *ai;
897     if (*ib == i){     /* (diag of A)*x */
898       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
899       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
900       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
901       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
902       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
903       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
904       v += 36;
905     }
906     for (j=1; j<n; j++) {
907       /* (strict lower triangular part of A)*x  */
908       cval       = ib[j]*6;
909       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
910       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
911       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
912       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
913       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
914       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
915       /* (strict upper triangular part of A)*x  */
916       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
917       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
918       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
919       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
920       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
921       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
922       v  += 36;
923     }
924     xb +=6; ai++;
925   }
926 
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
929   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
930 
931   PLogFlops(72*(a->s_nz*2 - a->m));
932   PetscFunctionReturn(0);
933 }
934 
935 #undef __FUNC__
936 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
937 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
938 {
939   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
940   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
941   MatScalar       *v;
942   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
943 
944   PetscFunctionBegin;
945   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
946   if (yy != xx) {
947     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
948   } else {
949     y = x;
950   }
951   if (zz != yy) {
952     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
953     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
954   } else {
955     z = y;
956   }
957 
958   v     = a->a;
959   xb = x;
960 
961   for (i=0; i<mbs; i++) {
962     n  = ai[1] - ai[0]; /* length of i_th block row of A */
963     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
964     ib = aj + *ai;
965     if (*ib == i){     /* (diag of A)*x */
966       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
967       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
968       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
969       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
970       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
971       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
972       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
973       v += 49;
974     }
975     for (j=1; j<n; j++) {
976       /* (strict lower triangular part of A)*x  */
977       cval       = ib[j]*7;
978       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
979       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
980       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
981       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
982       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
983       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
984       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
985       /* (strict upper triangular part of A)*x  */
986       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
987       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
988       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
989       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
990       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
991       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
992       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
993       v  += 49;
994     }
995     xb +=7; ai++;
996   }
997 
998   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1000   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1001 
1002   PLogFlops(98*(a->s_nz*2 - a->m));
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 #undef __FUNC__
1007 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1008 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1009 {
1010   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1011   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1012   MatScalar       *v;
1013   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1014   int             ncols,k;
1015 
1016   PetscFunctionBegin;
1017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1018   if (yy != xx) {
1019     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1020   } else {
1021     y = x;
1022   }
1023   if (zz != yy) {
1024     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1025     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1026   } else {
1027     z = y;
1028   }
1029 
1030   aj   = a->j;
1031   v    = a->a;
1032   ii   = a->i;
1033 
1034   if (!a->mult_work) {
1035     k = a->m;
1036     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1037   }
1038   work = a->mult_work;
1039 
1040 
1041   for (i=0; i<mbs; i++) {
1042     n     = ii[1] - ii[0]; ncols = n*bs;
1043     workt = work; idx=aj+ii[0];
1044 
1045     /* upper triangular part */
1046     for (j=0; j<n; j++) {
1047       /* col = bs*(*idx); */
1048       xb = x_ptr + bs*(*idx++);
1049       for (k=0; k<bs; k++) workt[k] = xb[k];
1050       workt += bs;
1051     }
1052     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1053     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1054 
1055     /* strict lower triangular part */
1056     ncols -= bs;
1057     if (ncols > 0){
1058       workt = work;
1059       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1060       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
1061 
1062       idx=aj+ii[0]+1;
1063       for (j=1; j<n; j++) {
1064         zb = z_ptr + bs*(*idx);
1065         idx++;
1066         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1067         workt += bs;
1068       }
1069     }
1070 
1071     x += bs; v += n*bs2; z += bs; ii++;
1072   }
1073 
1074   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1075   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1076   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1077 
1078   PLogFlops(2*(a->s_nz*2 - a->m));
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 #undef __FUNC__
1083 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1084 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1085 {
1086   PetscFunctionBegin;
1087   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1088   /* PetscFunctionReturn(0); */
1089 }
1090 
1091 #undef __FUNC__
1092 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1093 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1094 
1095 {
1096   PetscFunctionBegin;
1097   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1098   /* PetscFunctionReturn(0); */
1099 }
1100 
1101 #undef __FUNC__
1102 #define __FUNC__ "MatScale_SeqSBAIJ"
1103 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1104 {
1105   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1106   int         one = 1,totalnz = a->bs2*a->s_nz;
1107 
1108   PetscFunctionBegin;
1109   BLscal_(&totalnz,alpha,a->a,&one);
1110   PLogFlops(totalnz);
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 #undef __FUNC__
1115 #define __FUNC__ "MatNorm_SeqSBAIJ"
1116 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1117 {
1118   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1119   MatScalar   *v = a->a;
1120   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1121   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs;
1122   int         *jl,*il,jmin,jmax,ierr,nexti,ik;
1123 
1124   PetscFunctionBegin;
1125   if (type == NORM_FROBENIUS) {
1126     for (k=0; k<mbs; k++){
1127       jmin = a->i[k]; jmax = a->i[k+1];
1128       /* diagonal block */
1129       for (i=0; i<bs2; i++){
1130 #if defined(PETSC_USE_COMPLEX)
1131         sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1132 #else
1133         sum_diag += (*v)*(*v); v++;
1134 #endif
1135       }
1136       for (j=jmin+1; j<jmax; j++){  /* off-diagonal blocks */
1137         for (i=0; i<bs2; i++){
1138 #if defined(PETSC_USE_COMPLEX)
1139           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1140 #else
1141           sum_off += (*v)*(*v); v++;
1142 #endif
1143         }
1144       }
1145     }
1146     *norm = sqrt(sum_diag + 2*sum_off);
1147 
1148   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1149     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1150     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1151     sum = (PetscReal*)PetscMalloc(bs*sizeof(PetscReal));CHKPTRQ(sum);
1152     for (i=0; i<mbs; i++) {
1153       jl[i] = mbs; il[0] = 0;
1154     }
1155 
1156     *norm = 0.0;
1157     for (k=0; k<mbs; k++) { /* k_th block row */
1158       for (j=0; j<bs; j++) sum[j]=0.0;
1159 
1160       /*-- col sum --*/
1161       i = jl[k]; /* first |A(i,k)| to be added */
1162       while (i<mbs){
1163         nexti = jl[i];  /* next block row to be added */
1164         ik    = il[i];  /* block index of A(i,k) in the array a */
1165         for (j=0; j<bs; j++){
1166           v = a->a + ik*bs2 + j*bs;
1167           for (k1=0; k1<bs; k1++) {
1168             sum[j] += PetscAbsScalar(*v); v++;
1169           }
1170         }
1171         /* update il, jl */
1172         jmin = ik + 1; jmax = a->i[i+1];
1173         if (jmin < jmax){
1174           il[i] = jmin;
1175           j   = a->j[jmin];
1176           jl[i] = jl[j]; jl[j]=i;
1177         }
1178         i = nexti;
1179       }
1180 
1181       /*-- row sum --*/
1182       jmin = a->i[k]; jmax = a->i[k+1];
1183       for (i=jmin; i<jmax; i++) {
1184         for (j=0; j<bs; j++){
1185           v = a->a + i*bs2 + j;
1186           for (k1=0; k1<bs; k1++){
1187             sum[j] += PetscAbsScalar(*v);
1188             v   += bs;
1189           }
1190         }
1191       }
1192       /* add k_th block row to il, jl */
1193       jmin++;
1194       if (jmin < jmax){
1195         il[k] = jmin;
1196         j   = a->j[jmin];
1197         jl[k] = jl[j]; jl[j] = k;
1198       }
1199       for (j=0; j<bs; j++){
1200         if (sum[j] > *norm) *norm = sum[j];
1201       }
1202     }
1203     ierr = PetscFree(il);CHKERRQ(ierr);
1204     ierr = PetscFree(jl);CHKERRQ(ierr);
1205     ierr = PetscFree(sum);CHKERRQ(ierr);
1206   } else {
1207     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1208   }
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 #undef __FUNC__
1213 #define __FUNC__ "MatEqual_SeqSBAIJ"
1214 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1215 {
1216   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1217   int         ierr;
1218 
1219   PetscFunctionBegin;
1220   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1221 
1222   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1223   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1224     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1225   }
1226 
1227   /* if the a->i are the same */
1228   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1229   if (*flg == PETSC_FALSE) {
1230     PetscFunctionReturn(0);
1231   }
1232 
1233   /* if a->j are the same */
1234   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1235   if (*flg == PETSC_FALSE) {
1236     PetscFunctionReturn(0);
1237   }
1238   /* if a->a are the same */
1239   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1240   PetscFunctionReturn(0);
1241 
1242 }
1243 
1244 #undef __FUNC__
1245 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1246 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1247 {
1248   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1249   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1250   Scalar      *x,zero = 0.0;
1251   MatScalar   *aa,*aa_j;
1252 
1253   PetscFunctionBegin;
1254   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1255   bs   = a->bs;
1256   aa   = a->a;
1257   ai   = a->i;
1258   aj   = a->j;
1259   ambs = a->mbs;
1260   bs2  = a->bs2;
1261 
1262   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1263   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1264   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1265   /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");*/
1266   for (i=0; i<ambs; i++) {
1267     j=ai[i];
1268     if (aj[j] == i) {             /* if this is a diagonal element */
1269       row  = i*bs;
1270       aa_j = aa + j*bs2;
1271       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1272     }
1273   }
1274   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNC__
1279 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1280 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1281 {
1282   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1283   Scalar      *l,*r,x,*li,*ri;
1284   MatScalar   *aa,*v;
1285   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1286 
1287   PetscFunctionBegin;
1288   ai  = a->i;
1289   aj  = a->j;
1290   aa  = a->a;
1291   m   = a->m;
1292   bs  = a->bs;
1293   mbs = a->mbs;
1294   bs2 = a->bs2;
1295 
1296   if (ll != rr) {
1297     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1298   }
1299   if (ll) {
1300     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1301     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1302     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1303     for (i=0; i<mbs; i++) { /* for each block row */
1304       M  = ai[i+1] - ai[i];
1305       li = l + i*bs;
1306       v  = aa + bs2*ai[i];
1307       for (j=0; j<M; j++) { /* for each block */
1308         for (k=0; k<bs2; k++) {
1309           (*v++) *= li[k%bs];
1310         }
1311 #ifdef CONT
1312         /* will be used to replace the above loop */
1313         ri = l + bs*aj[ai[i]+j];
1314         for (k=0; k<bs; k++) { /* column value */
1315           x = ri[k];
1316           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1317         }
1318 #endif
1319 
1320       }
1321     }
1322     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1323     PLogFlops(2*a->s_nz);
1324   }
1325   /* will be deleted */
1326   if (rr) {
1327     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1328     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1329     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1330     for (i=0; i<mbs; i++) { /* for each block row */
1331       M  = ai[i+1] - ai[i];
1332       v  = aa + bs2*ai[i];
1333       for (j=0; j<M; j++) { /* for each block */
1334         ri = r + bs*aj[ai[i]+j];
1335         for (k=0; k<bs; k++) {
1336           x = ri[k];
1337           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1338         }
1339       }
1340     }
1341     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1342     PLogFlops(a->s_nz);
1343   }
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNC__
1348 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1349 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1350 {
1351   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1352 
1353   PetscFunctionBegin;
1354   info->rows_global    = (double)a->m;
1355   info->columns_global = (double)a->m;
1356   info->rows_local     = (double)a->m;
1357   info->columns_local  = (double)a->m;
1358   info->block_size     = a->bs2;
1359   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1360   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1361   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1362   info->assemblies   = A->num_ass;
1363   info->mallocs      = a->reallocs;
1364   info->memory       = A->mem;
1365   if (A->factor) {
1366     info->fill_ratio_given  = A->info.fill_ratio_given;
1367     info->fill_ratio_needed = A->info.fill_ratio_needed;
1368     info->factor_mallocs    = A->info.factor_mallocs;
1369   } else {
1370     info->fill_ratio_given  = 0;
1371     info->fill_ratio_needed = 0;
1372     info->factor_mallocs    = 0;
1373   }
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 
1378 #undef __FUNC__
1379 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1380 int MatZeroEntries_SeqSBAIJ(Mat A)
1381 {
1382   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1383   int         ierr;
1384 
1385   PetscFunctionBegin;
1386   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1387   PetscFunctionReturn(0);
1388 }
1389