xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d1bcd226b9f92c795e207870456c25e5b5fd5971)
1 /*$Id: sbaij2.c,v 1.8 2000/09/07 14:06:03 hzhang Exp hzhang $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   PetscFunctionBegin;
16   SETERRQ(1,1,"Function not yet written for SBAIJ format");
17   PetscFunctionReturn(0);
18 }
19 
20 #undef __FUNC__
21 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
22 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
23 {
24   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
25   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
26   int          row,mat_i,*mat_j,tcol,*mat_ilen;
27   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
28   int          *aj = a->j,*ai = a->i;
29   MatScalar    *mat_a;
30   Mat          C;
31   PetscTruth   flag;
32 
33   PetscFunctionBegin;
34 
35   if (isrow != iscol) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
36   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
37   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
38 
39   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
40   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
41 
42   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
43   ssmap = smap;
44   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
45   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
46   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
47   /* determine lens of each row */
48   for (i=0; i<nrows; i++) {
49     kstart  = ai[irow[i]];
50     kend    = kstart + a->ilen[irow[i]];
51     lens[i] = 0;
52       for (k=kstart; k<kend; k++) {
53         if (ssmap[aj[k]]) {
54           lens[i]++;
55         }
56       }
57     }
58   /* Create and fill new matrix */
59   if (scall == MAT_REUSE_MATRIX) {
60     c = (Mat_SeqSBAIJ *)((*B)->data);
61 
62     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
63     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
64     if (flag == PETSC_FALSE) {
65       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
66     }
67     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
68     C = *B;
69   } else {
70     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
71   }
72   c = (Mat_SeqSBAIJ *)(C->data);
73   for (i=0; i<nrows; i++) {
74     row    = irow[i];
75     kstart = ai[row];
76     kend   = kstart + a->ilen[row];
77     mat_i  = c->i[i];
78     mat_j  = c->j + mat_i;
79     mat_a  = c->a + mat_i*bs2;
80     mat_ilen = c->ilen + i;
81     for (k=kstart; k<kend; k++) {
82       if ((tcol=ssmap[a->j[k]])) {
83         *mat_j++ = tcol - 1;
84         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
85         mat_a   += bs2;
86         (*mat_ilen)++;
87       }
88     }
89   }
90 
91   /* Free work space */
92   ierr = PetscFree(smap);CHKERRQ(ierr);
93   ierr = PetscFree(lens);CHKERRQ(ierr);
94   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
96 
97   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
98   *B = C;
99   PetscFunctionReturn(0);
100 }
101 
102 #undef __FUNC__
103 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
105 {
106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
107   IS          is1;
108   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   /* Verify if the indices corespond to each element in a block
117    and form the IS with compressed IS */
118   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
119   iary = vary + a->mbs;
120   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
122 
123   count = 0;
124   for (i=0; i<a->mbs; i++) {
125     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
126     if (vary[i]==bs) iary[count++] = i;
127   }
128   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
129 
130   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = PetscFree(vary);CHKERRQ(ierr);
132 
133   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
134   ISDestroy(is1);
135   PetscFunctionReturn(0);
136 }
137 
138 #undef __FUNC__
139 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
141 {
142   int ierr,i;
143 
144   PetscFunctionBegin;
145   if (scall == MAT_INITIAL_MATRIX) {
146     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
147   }
148 
149   for (i=0; i<n; i++) {
150     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 #include "petscblaslapack.h"
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatMult_SeqSBAIJ_1"
162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
163 {
164   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
165   Scalar          *x,*z,*xb,x1,zero=0.0;
166   MatScalar       *v;
167   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
168 
169   PetscFunctionBegin;
170   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
173 
174   v  = a->a;
175   xb = x;
176 
177   for (i=0; i<mbs; i++) {
178     n  = ai[1] - ai[0];  /* length of i_th row of A */
179     x1 = xb[0];
180     ib = aj + *ai;
181     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
182     for (j=1; j<n; j++) {
183       cval    = *ib;
184       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
185       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
186     }
187     xb++; ai++;
188   }
189 
190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
191   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
192   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
193   PetscFunctionReturn(0);
194 }
195 
196 #undef __FUNC__
197 #define __FUNC__ "MatMult_SeqSBAIJ_2"
198 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
199 {
200   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
201   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
202   MatScalar       *v;
203   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
204 
205 
206   PetscFunctionBegin;
207   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
208   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
209   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
210 
211   v     = a->a;
212   xb = x;
213 
214   for (i=0; i<mbs; i++) {
215     n  = ai[1] - ai[0]; /* length of i_th block row of A */
216     x1 = xb[0]; x2 = xb[1];
217     ib = aj + *ai;
218     /* (diag of A)*x */
219     z[2*i]   += v[0]*x1 + v[2]*x2;
220     z[2*i+1] += v[2]*x1 + v[3]*x2;
221     v += 4;
222     for (j=1; j<n; j++) {
223       /* (strict lower triangular part of A)*x  */
224       cval       = ib[j]*2;
225       z[cval]     += v[0]*x1 + v[1]*x2;
226       z[cval+1]   += v[2]*x1 + v[3]*x2;
227       /* (strict upper triangular part of A)*x  */
228       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
229       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
230       v  += 4;
231     }
232     xb +=2; ai++;
233   }
234 
235   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
237   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
238   PetscFunctionReturn(0);
239 }
240 
241 #undef __FUNC__
242 #define __FUNC__ "MatMult_SeqSBAIJ_3"
243 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
244 {
245   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
246   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
247   MatScalar       *v;
248   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
249 
250 
251   PetscFunctionBegin;
252   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v     = a->a;
257   xb = x;
258 
259   for (i=0; i<mbs; i++) {
260     n  = ai[1] - ai[0]; /* length of i_th block row of A */
261     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
262     ib = aj + *ai;
263     /* (diag of A)*x */
264     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
265     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
266     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
267     v += 9;
268     for (j=1; j<n; j++) {
269       /* (strict lower triangular part of A)*x  */
270       cval       = ib[j]*3;
271       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
272       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
273       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
274       /* (strict upper triangular part of A)*x  */
275       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
276       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
277       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
278       v  += 9;
279     }
280     xb +=3; ai++;
281   }
282 
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNC__
290 #define __FUNC__ "MatMult_SeqSBAIJ_4"
291 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
294   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
295   MatScalar       *v;
296   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
297 
298   PetscFunctionBegin;
299   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
300   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
301   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
302 
303   v     = a->a;
304   xb = x;
305 
306   for (i=0; i<mbs; i++) {
307     n  = ai[1] - ai[0]; /* length of i_th block row of A */
308     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
309     ib = aj + *ai;
310     /* (diag of A)*x */
311     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
312     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
313     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
314     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
315     v += 16;
316     for (j=1; j<n; j++) {
317       /* (strict lower triangular part of A)*x  */
318       cval       = ib[j]*4;
319       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
320       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
321       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
322       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
323       /* (strict upper triangular part of A)*x  */
324       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
325       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
326       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
327       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
328       v  += 16;
329     }
330     xb +=4; ai++;
331   }
332 
333   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
334   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
335   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
336   PetscFunctionReturn(0);
337 }
338 
339 #undef __FUNC__
340 #define __FUNC__ "MatMult_SeqSBAIJ_5"
341 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
342 {
343   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
344   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
345   MatScalar       *v;
346   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
347 
348   PetscFunctionBegin;
349   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
351   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
352 
353   v     = a->a;
354   xb = x;
355 
356   for (i=0; i<mbs; i++) {
357     n  = ai[1] - ai[0]; /* length of i_th block row of A */
358     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
359     ib = aj + *ai;
360     /* (diag of A)*x */
361     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
362     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
363     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
364     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
365     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
366     v += 25;
367     for (j=1; j<n; j++) {
368       /* (strict lower triangular part of A)*x  */
369       cval       = ib[j]*5;
370       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
371       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
372       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
373       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
374       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
375       /* (strict upper triangular part of A)*x  */
376       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
377       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
378       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
379       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
380       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
381       v  += 25;
382     }
383     xb +=5; ai++;
384   }
385 
386   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
387   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
388   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
389   PetscFunctionReturn(0);
390 }
391 
392 
393 #undef __FUNC__
394 #define __FUNC__ "MatMult_SeqSBAIJ_6"
395 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
396 {
397   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
398   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
399   MatScalar       *v;
400   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
401 
402   PetscFunctionBegin;
403   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
404   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
405   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
406 
407   v     = a->a;
408   xb = x;
409 
410   for (i=0; i<mbs; i++) {
411     n  = ai[1] - ai[0]; /* length of i_th block row of A */
412     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
413     ib = aj + *ai;
414     /* (diag of A)*x */
415     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
416     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
417     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
418     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
419     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
420     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
421     v += 36;
422     for (j=1; j<n; j++) {
423       /* (strict lower triangular part of A)*x  */
424       cval       = ib[j]*6;
425       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
426       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
427       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
428       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
429       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
430       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
431       /* (strict upper triangular part of A)*x  */
432       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
433       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
434       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
435       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
436       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
437       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
438       v  += 36;
439     }
440     xb +=6; ai++;
441   }
442 
443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
444   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
445   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
446   PetscFunctionReturn(0);
447 }
448 #undef __FUNC__
449 #define __FUNC__ "MatMult_SeqSBAIJ_7"
450 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
451 {
452   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
453   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
454   MatScalar       *v;
455   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
456 
457   PetscFunctionBegin;
458   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
459   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
460   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
461 
462   v     = a->a;
463   xb = x;
464 
465   for (i=0; i<mbs; i++) {
466     n  = ai[1] - ai[0]; /* length of i_th block row of A */
467     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
468     ib = aj + *ai;
469     /* (diag of A)*x */
470     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
471     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
472     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
473     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
474     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
475     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
476     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
477     v += 49;
478     for (j=1; j<n; j++) {
479       /* (strict lower triangular part of A)*x  */
480       cval       = ib[j]*7;
481       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
482       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
483       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
484       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
485       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
486       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
487       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
488       /* (strict upper triangular part of A)*x  */
489       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
490       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
491       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
492       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
493       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
494       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
495       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
496       v  += 49;
497     }
498     xb +=7; ai++;
499   }
500   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
501   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
502   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
503   PetscFunctionReturn(0);
504 }
505 
506 /*
507     This will not work with MatScalar == float because it calls the BLAS
508 */
509 #undef __FUNC__
510 #define __FUNC__ "MatMult_SeqSBAIJ_N"
511 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
512 {
513   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
514   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
515   MatScalar       *v;
516   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
517   int             ncols,k;
518 
519   PetscFunctionBegin;
520   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
521   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
522   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
523 
524   aj   = a->j;
525   v    = a->a;
526   ii   = a->i;
527 
528   if (!a->mult_work) {
529     k = a->m;
530     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
531   }
532   work = a->mult_work;
533 
534   for (i=0; i<mbs; i++) {
535     n     = ii[1] - ii[0]; ncols = n*bs;
536     workt = work; idx=aj+ii[0];
537 
538     /* upper triangular part */
539     for (j=0; j<n; j++) {
540       /* col = bs*(*idx); */
541       xb = x_ptr + bs*(*idx++);
542       for (k=0; k<bs; k++) workt[k] = xb[k];
543       workt += bs;
544     }
545     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
546     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
547 
548     /* strict lower triangular part */
549     ncols -= bs;
550     if (ncols > 0){
551       workt = work;
552       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
553       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
554 
555       idx=aj+ii[0]+1;
556       for (j=1; j<n; j++) {
557         zb = z_ptr + bs*(*idx);
558         idx++;
559         for (k=0; k<bs; k++) zb[k] += workt[k] ;
560         workt += bs;
561       }
562     }
563 
564     x += bs; v += n*bs2; z += bs; ii++;
565   }
566 
567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
568   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
570   PetscFunctionReturn(0);
571 }
572 
573 #undef __FUNC__
574 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
575 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
576 {
577   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
578   Scalar          *x,*y,*z,*xb,x1;
579   MatScalar       *v;
580   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
581 
582   PetscFunctionBegin;
583   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
584   if (yy != xx) {
585     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
586   } else {
587     y = x;
588   }
589   if (zz != yy) {
590     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
591     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
592   } else {
593     z = y;
594   }
595 
596   v  = a->a;
597   xb = x;
598 
599   for (i=0; i<mbs; i++) {
600     n  = ai[1] - ai[0];  /* length of i_th row of A */
601     x1 = xb[0];
602     ib = aj + *ai;
603     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
604     for (j=1; j<n; j++) {
605       cval    = *ib;
606       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
607       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
608     }
609     xb++; ai++;
610   }
611 
612   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
613   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
614   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
615 
616   PLogFlops(2*(a->s_nz*2 - a->m));
617   PetscFunctionReturn(0);
618 }
619 
620 #undef __FUNC__
621 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
622 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
623 {
624   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
625   Scalar          *x,*y,*z,*xb,x1,x2;
626   MatScalar       *v;
627   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
628 
629   PetscFunctionBegin;
630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
631   if (yy != xx) {
632     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
633   } else {
634     y = x;
635   }
636   if (zz != yy) {
637     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
638     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
639   } else {
640     z = y;
641   }
642 
643   v     = a->a;
644   xb = x;
645 
646   for (i=0; i<mbs; i++) {
647     n  = ai[1] - ai[0]; /* length of i_th block row of A */
648     x1 = xb[0]; x2 = xb[1];
649     ib = aj + *ai;
650     /* (diag of A)*x */
651     z[2*i]   += v[0]*x1 + v[2]*x2;
652     z[2*i+1] += v[2]*x1 + v[3]*x2;
653     v += 4;
654     for (j=1; j<n; j++) {
655       /* (strict lower triangular part of A)*x  */
656       cval       = ib[j]*2;
657       z[cval]     += v[0]*x1 + v[1]*x2;
658       z[cval+1]   += v[2]*x1 + v[3]*x2;
659       /* (strict upper triangular part of A)*x  */
660       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
661       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
662       v  += 4;
663     }
664     xb +=2; ai++;
665   }
666 
667   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
668   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
669   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
670 
671   PLogFlops(4*(a->s_nz*2 - a->m));
672   PetscFunctionReturn(0);
673 }
674 
675 #undef __FUNC__
676 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
677 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
678 {
679   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
680   Scalar          *x,*y,*z,*xb,x1,x2,x3;
681   MatScalar       *v;
682   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
683 
684   PetscFunctionBegin;
685   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
686   if (yy != xx) {
687     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
688   } else {
689     y = x;
690   }
691   if (zz != yy) {
692     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
693     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
694   } else {
695     z = y;
696   }
697 
698   v     = a->a;
699   xb = x;
700 
701   for (i=0; i<mbs; i++) {
702     n  = ai[1] - ai[0]; /* length of i_th block row of A */
703     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
704     ib = aj + *ai;
705     /* (diag of A)*x */
706     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
707     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
708     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
709     v += 9;
710     for (j=1; j<n; j++) {
711       /* (strict lower triangular part of A)*x  */
712       cval       = ib[j]*3;
713       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
714       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
715       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
716       /* (strict upper triangular part of A)*x  */
717       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
718       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
719       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
720       v  += 9;
721     }
722     xb +=3; ai++;
723   }
724 
725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
726   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
727   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
728 
729   PLogFlops(18*(a->s_nz*2 - a->m));
730   PetscFunctionReturn(0);
731 }
732 
733 #undef __FUNC__
734 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
735 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
736 {
737   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
738   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
739   MatScalar       *v;
740   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
741 
742   PetscFunctionBegin;
743   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
744   if (yy != xx) {
745     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
746   } else {
747     y = x;
748   }
749   if (zz != yy) {
750     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
751     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
752   } else {
753     z = y;
754   }
755 
756   v     = a->a;
757   xb = x;
758 
759   for (i=0; i<mbs; i++) {
760     n  = ai[1] - ai[0]; /* length of i_th block row of A */
761     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
762     ib = aj + *ai;
763     /* (diag of A)*x */
764     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
765     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
766     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
767     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
768     v += 16;
769     for (j=1; j<n; j++) {
770       /* (strict lower triangular part of A)*x  */
771       cval       = ib[j]*4;
772       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
773       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
774       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
775       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
776       /* (strict upper triangular part of A)*x  */
777       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
778       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
779       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
780       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
781       v  += 16;
782     }
783     xb +=4; ai++;
784   }
785 
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
788   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
789 
790   PLogFlops(32*(a->s_nz*2 - a->m));
791   PetscFunctionReturn(0);
792 }
793 
794 #undef __FUNC__
795 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
796 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
797 {
798   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
799   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
800   MatScalar       *v;
801   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
802 
803   PetscFunctionBegin;
804   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
805   if (yy != xx) {
806     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
807   } else {
808     y = x;
809   }
810   if (zz != yy) {
811     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
812     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
813   } else {
814     z = y;
815   }
816 
817   v     = a->a;
818   xb = x;
819 
820   for (i=0; i<mbs; i++) {
821     n  = ai[1] - ai[0]; /* length of i_th block row of A */
822     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
823     ib = aj + *ai;
824     /* (diag of A)*x */
825     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
826     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
827     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
828     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
829     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
830     v += 25;
831     for (j=1; j<n; j++) {
832       /* (strict lower triangular part of A)*x  */
833       cval       = ib[j]*5;
834       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
835       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
836       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
837       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
838       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
839       /* (strict upper triangular part of A)*x  */
840       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
841       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
842       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
843       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
844       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
845       v  += 25;
846     }
847     xb +=5; ai++;
848   }
849 
850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
851   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
852   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
853 
854   PLogFlops(50*(a->s_nz*2 - a->m));
855   PetscFunctionReturn(0);
856 }
857 #undef __FUNC__
858 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
859 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
860 {
861   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
862   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
863   MatScalar       *v;
864   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
865 
866   PetscFunctionBegin;
867   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
868   if (yy != xx) {
869     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
870   } else {
871     y = x;
872   }
873   if (zz != yy) {
874     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
875     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
876   } else {
877     z = y;
878   }
879 
880   v     = a->a;
881   xb = x;
882 
883   for (i=0; i<mbs; i++) {
884     n  = ai[1] - ai[0]; /* length of i_th block row of A */
885     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
886     ib = aj + *ai;
887     /* (diag of A)*x */
888     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
889     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
890     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
891     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
892     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
893     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
894     v += 36;
895     for (j=1; j<n; j++) {
896       /* (strict lower triangular part of A)*x  */
897       cval       = ib[j]*6;
898       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
899       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
900       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
901       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
902       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
903       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
904       /* (strict upper triangular part of A)*x  */
905       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
906       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
907       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
908       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
909       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
910       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
911       v  += 36;
912     }
913     xb +=6; ai++;
914   }
915 
916   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
917   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
918   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
919 
920   PLogFlops(72*(a->s_nz*2 - a->m));
921   PetscFunctionReturn(0);
922 }
923 
924 #undef __FUNC__
925 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
926 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
927 {
928   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
929   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
930   MatScalar       *v;
931   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
932 
933   PetscFunctionBegin;
934   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
935   if (yy != xx) {
936     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
937   } else {
938     y = x;
939   }
940   if (zz != yy) {
941     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
942     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
943   } else {
944     z = y;
945   }
946 
947   v     = a->a;
948   xb = x;
949 
950   for (i=0; i<mbs; i++) {
951     n  = ai[1] - ai[0]; /* length of i_th block row of A */
952     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
953     ib = aj + *ai;
954     /* (diag of A)*x */
955     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
956     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
957     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
958     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
959     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
960     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
961     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
962     v += 49;
963     for (j=1; j<n; j++) {
964       /* (strict lower triangular part of A)*x  */
965       cval       = ib[j]*7;
966       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
967       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
968       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
969       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
970       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
971       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
972       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
973       /* (strict upper triangular part of A)*x  */
974       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
975       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
976       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
977       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
978       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
979       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
980       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
981       v  += 49;
982     }
983     xb +=7; ai++;
984   }
985 
986   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
987   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
988   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
989 
990   PLogFlops(98*(a->s_nz*2 - a->m));
991   PetscFunctionReturn(0);
992 }
993 
994 #undef __FUNC__
995 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
996 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
997 {
998   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
999   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1000   MatScalar       *v;
1001   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1002   int             ncols,k;
1003 
1004   PetscFunctionBegin;
1005   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1006   if (yy != xx) {
1007     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1008   } else {
1009     y = x;
1010   }
1011   if (zz != yy) {
1012     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1013     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1014   } else {
1015     z = y;
1016   }
1017 
1018   aj   = a->j;
1019   v    = a->a;
1020   ii   = a->i;
1021 
1022   if (!a->mult_work) {
1023     k = a->m;
1024     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1025   }
1026   work = a->mult_work;
1027 
1028 
1029   for (i=0; i<mbs; i++) {
1030     n     = ii[1] - ii[0]; ncols = n*bs;
1031     workt = work; idx=aj+ii[0];
1032 
1033     /* upper triangular part */
1034     for (j=0; j<n; j++) {
1035       /* col = bs*(*idx); */
1036       xb = x_ptr + bs*(*idx++);
1037       for (k=0; k<bs; k++) workt[k] = xb[k];
1038       workt += bs;
1039     }
1040     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1041     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1042 
1043     /* strict lower triangular part */
1044     ncols -= bs;
1045     if (ncols > 0){
1046       workt = work;
1047       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1048       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
1049 
1050       idx=aj+ii[0]+1;
1051       for (j=1; j<n; j++) {
1052         zb = z_ptr + bs*(*idx);
1053         idx++;
1054         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1055         workt += bs;
1056       }
1057     }
1058 
1059     x += bs; v += n*bs2; z += bs; ii++;
1060   }
1061 
1062   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1063   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1064   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1065 
1066   PLogFlops(2*(a->s_nz*2 - a->m));
1067   PetscFunctionReturn(0);
1068 }
1069 
1070 #undef __FUNC__
1071 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1072 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1073 {
1074   PetscFunctionBegin;
1075   SETERRQ(1,1,"Matrix is symmetric. Call MatMult().");
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 #undef __FUNC__
1080 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1081 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1082 
1083 {
1084   PetscFunctionBegin;
1085   SETERRQ(1,1,"Matrix is symmetric. Call MatMultAdd().");
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 #undef __FUNC__
1090 #define __FUNC__ "MatScale_SeqSBAIJ"
1091 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1092 {
1093   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1094   int         one = 1,totalnz = a->bs2*a->s_nz;
1095 
1096   PetscFunctionBegin;
1097   BLscal_(&totalnz,alpha,a->a,&one);
1098   PLogFlops(totalnz);
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 #undef __FUNC__
1103 #define __FUNC__ "MatNorm_SeqSBAIJ"
1104 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1105 {
1106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1107   MatScalar   *v = a->a;
1108   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1109   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs;
1110   int         *jl,*il,jmin,jmax,ierr,nexti,ik;
1111 
1112   PetscFunctionBegin;
1113   printf("call MatNorm_SeqSBAIJ\n");
1114   if (type == NORM_FROBENIUS) {
1115     for (k=0; k<mbs; k++){
1116       jmin = a->i[k]; jmax = a->i[k+1];
1117       /* diagonal block */
1118       for (i=0; i<bs2; i++){
1119 #if defined(PETSC_USE_COMPLEX)
1120         sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1121 #else
1122         sum_diag += (*v)*(*v); v++;
1123 #endif
1124       }
1125       for (j=jmin+1; j<jmax; j++){  /* off-diagonal blocks */
1126         for (i=0; i<bs2; i++){
1127 #if defined(PETSC_USE_COMPLEX)
1128           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1129 #else
1130           sum_off += (*v)*(*v); v++;
1131 #endif
1132         }
1133       }
1134     }
1135     *norm = sqrt(sum_diag + 2*sum_off);
1136 
1137   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1138     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1139     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1140     sum = (PetscReal*)PetscMalloc(bs*sizeof(PetscReal));CHKPTRQ(sum);
1141     for (i=0; i<mbs; i++) {
1142       jl[i] = mbs; il[0] = 0;
1143     }
1144 
1145     *norm = 0.0;
1146     for (k=0; k<mbs; k++) { /* k_th block row */
1147       for (j=0; j<bs; j++) sum[j]=0.0;
1148 
1149       /*-- col sum --*/
1150       i = jl[k]; /* first |A(i,k)| to be added */
1151       while (i<mbs){
1152         nexti = jl[i];  /* next block row to be added */
1153         ik    = il[i];  /* block index of A(i,k) in the array a */
1154         for (j=0; j<bs; j++){
1155           v = a->a + ik*bs2 + j*bs;
1156           for (k1=0; k1<bs; k1++) {
1157             sum[j] += PetscAbsScalar(*v); v++;
1158           }
1159         }
1160         /* update il, jl */
1161         jmin = ik + 1; jmax = a->i[i+1];
1162         if (jmin < jmax){
1163           il[i] = jmin;
1164           j   = a->j[jmin];
1165           jl[i] = jl[j]; jl[j]=i;
1166         }
1167         i = nexti;
1168       }
1169 
1170       /*-- row sum --*/
1171       jmin = a->i[k]; jmax = a->i[k+1];
1172       for (i=jmin; i<jmax; i++) {
1173         for (j=0; j<bs; j++){
1174           v = a->a + i*bs2 + j;
1175           for (k1=0; k1<bs; k1++){
1176             sum[j] += PetscAbsScalar(*v);
1177             v   += bs;
1178           }
1179         }
1180       }
1181       /* add k_th block row to il, jl */
1182       jmin++;
1183       if (jmin < jmax){
1184         il[k] = jmin;
1185         j   = a->j[jmin];
1186         jl[k] = jl[j]; jl[j] = k;
1187       }
1188       for (j=0; j<bs; j++){
1189         if (sum[j] > *norm) *norm = sum[j];
1190       }
1191     }
1192     ierr = PetscFree(il);CHKERRQ(ierr);
1193     ierr = PetscFree(jl);CHKERRQ(ierr);
1194     ierr = PetscFree(sum);CHKERRQ(ierr);
1195   } else {
1196     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1197   }
1198   PetscFunctionReturn(0);
1199 }
1200 
1201 #undef __FUNC__
1202 #define __FUNC__ "MatEqual_SeqSBAIJ"
1203 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1204 {
1205   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1206   int         ierr;
1207 
1208   PetscFunctionBegin;
1209   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1210 
1211   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1212   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1213     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1214   }
1215 
1216   /* if the a->i are the same */
1217   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1218   if (*flg == PETSC_FALSE) {
1219     PetscFunctionReturn(0);
1220   }
1221 
1222   /* if a->j are the same */
1223   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1224   if (*flg == PETSC_FALSE) {
1225     PetscFunctionReturn(0);
1226   }
1227   /* if a->a are the same */
1228   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1229   PetscFunctionReturn(0);
1230 
1231 }
1232 
1233 #undef __FUNC__
1234 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1235 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1236 {
1237   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1238   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1239   Scalar      *x,zero = 0.0;
1240   MatScalar   *aa,*aa_j;
1241 
1242   PetscFunctionBegin;
1243   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1244   bs   = a->bs;
1245   aa   = a->a;
1246   ai   = a->i;
1247   aj   = a->j;
1248   ambs = a->mbs;
1249   bs2  = a->bs2;
1250 
1251   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1252   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1253   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1254   /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");*/
1255   for (i=0; i<ambs; i++) {
1256     j=ai[i];
1257     if (aj[j] == i) {             /* if this is a diagonal element */
1258       row  = i*bs;
1259       aa_j = aa + j*bs2;
1260       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1261     }
1262   }
1263   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #undef __FUNC__
1268 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1269 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1270 {
1271   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1272   Scalar      *l,*r,x,*li,*ri;
1273   MatScalar   *aa,*v;
1274   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1275 
1276   PetscFunctionBegin;
1277   ai  = a->i;
1278   aj  = a->j;
1279   aa  = a->a;
1280   m   = a->m;
1281   bs  = a->bs;
1282   mbs = a->mbs;
1283   bs2 = a->bs2;
1284 
1285   if (ll != rr) {
1286     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"For symmetric format, left and right scaling vectors must be same\n");
1287   }
1288   if (ll) {
1289     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1290     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1291     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1292     for (i=0; i<mbs; i++) { /* for each block row */
1293       M  = ai[i+1] - ai[i];
1294       li = l + i*bs;
1295       v  = aa + bs2*ai[i];
1296       for (j=0; j<M; j++) { /* for each block */
1297         for (k=0; k<bs2; k++) {
1298           (*v++) *= li[k%bs];
1299         }
1300 #ifdef CONT
1301         /* will be used to replace the above loop */
1302         ri = l + bs*aj[ai[i]+j];
1303         for (k=0; k<bs; k++) { /* column value */
1304           x = ri[k];
1305           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1306         }
1307 #endif
1308 
1309       }
1310     }
1311     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1312     PLogFlops(2*a->s_nz);
1313   }
1314   /* will be deleted */
1315   if (rr) {
1316     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1317     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1318     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1319     for (i=0; i<mbs; i++) { /* for each block row */
1320       M  = ai[i+1] - ai[i];
1321       v  = aa + bs2*ai[i];
1322       for (j=0; j<M; j++) { /* for each block */
1323         ri = r + bs*aj[ai[i]+j];
1324         for (k=0; k<bs; k++) {
1325           x = ri[k];
1326           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1327         }
1328       }
1329     }
1330     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1331     PLogFlops(a->s_nz);
1332   }
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 #undef __FUNC__
1337 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1338 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1339 {
1340   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1341 
1342   PetscFunctionBegin;
1343   info->rows_global    = (double)a->m;
1344   info->columns_global = (double)a->m;
1345   info->rows_local     = (double)a->m;
1346   info->columns_local  = (double)a->m;
1347   info->block_size     = a->bs2;
1348   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1349   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1350   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1351   info->assemblies   = A->num_ass;
1352   info->mallocs      = a->reallocs;
1353   info->memory       = A->mem;
1354   if (A->factor) {
1355     info->fill_ratio_given  = A->info.fill_ratio_given;
1356     info->fill_ratio_needed = A->info.fill_ratio_needed;
1357     info->factor_mallocs    = A->info.factor_mallocs;
1358   } else {
1359     info->fill_ratio_given  = 0;
1360     info->fill_ratio_needed = 0;
1361     info->factor_mallocs    = 0;
1362   }
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 
1367 #undef __FUNC__
1368 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1369 int MatZeroEntries_SeqSBAIJ(Mat A)
1370 {
1371   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1372   int         ierr;
1373 
1374   PetscFunctionBegin;
1375   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1376   PetscFunctionReturn(0);
1377 }
1378