xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision b5758dff40dc4232f7fa9713b5481f7e0e0d5d24)
1 /*$Id: sbaij2.c,v 1.27 2001/01/16 18:18:03 balay Exp bsmith $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   PetscFunctionBegin;
16   SETERRQ(1,"Function not yet written for SBAIJ format");
17   /* PetscFunctionReturn(0); */
18 }
19 
20 #undef __FUNC__
21 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
22 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
23 {
24   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
25   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
26   int          row,mat_i,*mat_j,tcol,*mat_ilen;
27   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
28   int          *aj = a->j,*ai = a->i;
29   MatScalar    *mat_a;
30   Mat          C;
31   PetscTruth   flag;
32 
33   PetscFunctionBegin;
34 
35   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
36   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
37   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
38 
39   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
40   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
41 
42   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
43   ssmap = smap;
44   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
45   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
46   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
47   /* determine lens of each row */
48   for (i=0; i<nrows; i++) {
49     kstart  = ai[irow[i]];
50     kend    = kstart + a->ilen[irow[i]];
51     lens[i] = 0;
52       for (k=kstart; k<kend; k++) {
53         if (ssmap[aj[k]]) {
54           lens[i]++;
55         }
56       }
57     }
58   /* Create and fill new matrix */
59   if (scall == MAT_REUSE_MATRIX) {
60     c = (Mat_SeqSBAIJ *)((*B)->data);
61 
62     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
63     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
64     if (flag == PETSC_FALSE) {
65       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
66     }
67     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
68     C = *B;
69   } else {
70     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
71   }
72   c = (Mat_SeqSBAIJ *)(C->data);
73   for (i=0; i<nrows; i++) {
74     row    = irow[i];
75     kstart = ai[row];
76     kend   = kstart + a->ilen[row];
77     mat_i  = c->i[i];
78     mat_j  = c->j + mat_i;
79     mat_a  = c->a + mat_i*bs2;
80     mat_ilen = c->ilen + i;
81     for (k=kstart; k<kend; k++) {
82       if ((tcol=ssmap[a->j[k]])) {
83         *mat_j++ = tcol - 1;
84         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
85         mat_a   += bs2;
86         (*mat_ilen)++;
87       }
88     }
89   }
90 
91   /* Free work space */
92   ierr = PetscFree(smap);CHKERRQ(ierr);
93   ierr = PetscFree(lens);CHKERRQ(ierr);
94   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
96 
97   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
98   *B = C;
99   PetscFunctionReturn(0);
100 }
101 
102 #undef __FUNC__
103 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
105 {
106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
107   IS          is1;
108   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   /* Verify if the indices corespond to each element in a block
117    and form the IS with compressed IS */
118   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
119   iary = vary + a->mbs;
120   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
122 
123   count = 0;
124   for (i=0; i<a->mbs; i++) {
125     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
126     if (vary[i]==bs) iary[count++] = i;
127   }
128   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
129 
130   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = PetscFree(vary);CHKERRQ(ierr);
132 
133   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
134   ISDestroy(is1);
135   PetscFunctionReturn(0);
136 }
137 
138 #undef __FUNC__
139 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
141 {
142   int ierr,i;
143 
144   PetscFunctionBegin;
145   if (scall == MAT_INITIAL_MATRIX) {
146     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
147   }
148 
149   for (i=0; i<n; i++) {
150     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 #include "petscblaslapack.h"
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatMult_SeqSBAIJ_1"
162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
163 {
164   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
165   Scalar          *x,*z,*xb,x1,zero=0.0;
166   MatScalar       *v;
167   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
168 
169   PetscFunctionBegin;
170   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
173 
174   v  = a->a;
175   xb = x;
176 
177   for (i=0; i<mbs; i++) {
178     n  = ai[1] - ai[0];  /* length of i_th row of A */
179     x1 = xb[0];
180     ib = aj + *ai;
181     jmin = 0;
182     if (*ib == i) {      /* (diag of A)*x */
183       z[i] += *v++ * x[*ib++];
184       jmin++;
185     }
186     for (j=jmin; j<n; j++) {
187       cval    = *ib;
188       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
189       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
190     }
191     xb++; ai++;
192   }
193 
194   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
195   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
196   PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m);  /* s_nz = (nz+m)/2 */
197   PetscFunctionReturn(0);
198 }
199 
200 #undef __FUNC__
201 #define __FUNC__ "MatMult_SeqSBAIJ_2"
202 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
203 {
204   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
205   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
206   MatScalar       *v;
207   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
208 
209 
210   PetscFunctionBegin;
211   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
212   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
213   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
214 
215   v     = a->a;
216   xb = x;
217 
218   for (i=0; i<mbs; i++) {
219     n  = ai[1] - ai[0]; /* length of i_th block row of A */
220     x1 = xb[0]; x2 = xb[1];
221     ib = aj + *ai;
222     jmin = 0;
223     if (*ib == i){     /* (diag of A)*x */
224       z[2*i]   += v[0]*x1 + v[2]*x2;
225       z[2*i+1] += v[2]*x1 + v[3]*x2;
226       v += 4; jmin++;
227     }
228     for (j=jmin; j<n; j++) {
229       /* (strict lower triangular part of A)*x  */
230       cval       = ib[j]*2;
231       z[cval]     += v[0]*x1 + v[1]*x2;
232       z[cval+1]   += v[2]*x1 + v[3]*x2;
233       /* (strict upper triangular part of A)*x  */
234       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
235       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
236       v  += 4;
237     }
238     xb +=2; ai++;
239   }
240 
241   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
242   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
243   PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
244   PetscFunctionReturn(0);
245 }
246 
247 #undef __FUNC__
248 #define __FUNC__ "MatMult_SeqSBAIJ_3"
249 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
250 {
251   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
252   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
253   MatScalar       *v;
254   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
255 
256 
257   PetscFunctionBegin;
258   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
261 
262   v     = a->a;
263   xb = x;
264 
265   for (i=0; i<mbs; i++) {
266     n  = ai[1] - ai[0]; /* length of i_th block row of A */
267     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
268     ib = aj + *ai;
269     jmin = 0;
270     if (*ib == i){     /* (diag of A)*x */
271       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
272       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
273       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
274       v += 9; jmin++;
275     }
276     for (j=jmin; j<n; j++) {
277       /* (strict lower triangular part of A)*x  */
278       cval       = ib[j]*3;
279       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
280       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
281       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
282       /* (strict upper triangular part of A)*x  */
283       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
284       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
285       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
286       v  += 9;
287     }
288     xb +=3; ai++;
289   }
290 
291   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
292   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
293   PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
294   PetscFunctionReturn(0);
295 }
296 
297 #undef __FUNC__
298 #define __FUNC__ "MatMult_SeqSBAIJ_4"
299 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
300 {
301   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
302   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
303   MatScalar       *v;
304   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
305 
306   PetscFunctionBegin;
307   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
308   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
309   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
310 
311   v     = a->a;
312   xb = x;
313 
314   for (i=0; i<mbs; i++) {
315     n  = ai[1] - ai[0]; /* length of i_th block row of A */
316     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
317     ib = aj + *ai;
318     jmin = 0;
319     if (*ib == i){     /* (diag of A)*x */
320       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
321       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
322       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
323       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
324       v += 16; jmin++;
325     }
326     for (j=jmin; j<n; j++) {
327       /* (strict lower triangular part of A)*x  */
328       cval       = ib[j]*4;
329       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
330       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
331       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
332       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
333       /* (strict upper triangular part of A)*x  */
334       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
335       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
336       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
337       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
338       v  += 16;
339     }
340     xb +=4; ai++;
341   }
342 
343   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
345   PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
346   PetscFunctionReturn(0);
347 }
348 
349 #undef __FUNC__
350 #define __FUNC__ "MatMult_SeqSBAIJ_5"
351 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
352 {
353   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
354   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
355   MatScalar       *v;
356   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
357 
358   PetscFunctionBegin;
359   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
361   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
362 
363   v     = a->a;
364   xb = x;
365 
366   for (i=0; i<mbs; i++) {
367     n  = ai[1] - ai[0]; /* length of i_th block row of A */
368     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
369     ib = aj + *ai;
370     jmin = 0;
371     if (*ib == i){      /* (diag of A)*x */
372       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
373       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
374       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
375       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
376       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
377       v += 25; jmin++;
378     }
379     for (j=jmin; j<n; j++) {
380       /* (strict lower triangular part of A)*x  */
381       cval       = ib[j]*5;
382       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
383       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
384       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
385       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
386       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
387       /* (strict upper triangular part of A)*x  */
388       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
389       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
390       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
391       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
392       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
393       v  += 25;
394     }
395     xb +=5; ai++;
396   }
397 
398   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
399   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
400   PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
401   PetscFunctionReturn(0);
402 }
403 
404 
405 #undef __FUNC__
406 #define __FUNC__ "MatMult_SeqSBAIJ_6"
407 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
408 {
409   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
410   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
411   MatScalar       *v;
412   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
413 
414   PetscFunctionBegin;
415   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
416   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
417   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
418 
419   v     = a->a;
420   xb = x;
421 
422   for (i=0; i<mbs; i++) {
423     n  = ai[1] - ai[0]; /* length of i_th block row of A */
424     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
425     ib = aj + *ai;
426     jmin = 0;
427     if (*ib == i){      /* (diag of A)*x */
428       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
429       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
430       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
431       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
432       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
433       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
434       v += 36; jmin++;
435     }
436     for (j=jmin; j<n; j++) {
437       /* (strict lower triangular part of A)*x  */
438       cval       = ib[j]*6;
439       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
440       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
441       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
442       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
443       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
444       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
445       /* (strict upper triangular part of A)*x  */
446       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
447       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
448       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
449       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
450       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
451       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
452       v  += 36;
453     }
454     xb +=6; ai++;
455   }
456 
457   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
458   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
459   PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
460   PetscFunctionReturn(0);
461 }
462 #undef __FUNC__
463 #define __FUNC__ "MatMult_SeqSBAIJ_7"
464 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
465 {
466   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
467   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
468   MatScalar       *v;
469   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
470 
471   PetscFunctionBegin;
472   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
475 
476   v     = a->a;
477   xb = x;
478 
479   for (i=0; i<mbs; i++) {
480     n  = ai[1] - ai[0]; /* length of i_th block row of A */
481     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
482     ib = aj + *ai;
483     jmin = 0;
484     if (*ib == i){      /* (diag of A)*x */
485       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
486       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
487       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
488       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
489       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
490       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
491       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
492       v += 49; jmin++;
493     }
494     for (j=jmin; j<n; j++) {
495       /* (strict lower triangular part of A)*x  */
496       cval       = ib[j]*7;
497       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
498       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
499       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
500       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
501       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
502       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
503       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
504       /* (strict upper triangular part of A)*x  */
505       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
506       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
507       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
508       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
509       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
510       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
511       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
512       v  += 49;
513     }
514     xb +=7; ai++;
515   }
516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
517   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
518   PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
519   PetscFunctionReturn(0);
520 }
521 
522 /*
523     This will not work with MatScalar == float because it calls the BLAS
524 */
525 #undef __FUNC__
526 #define __FUNC__ "MatMult_SeqSBAIJ_N"
527 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
528 {
529   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
530   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
531   MatScalar       *v;
532   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
533   int             ncols,k;
534 
535   PetscFunctionBegin;
536   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
538   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
539 
540   aj   = a->j;
541   v    = a->a;
542   ii   = a->i;
543 
544   if (!a->mult_work) {
545     ierr = PetscMalloc((A->m+1)*sizeof(Scalar),&a->mult_work);CHKERRQ(ierr);
546   }
547   work = a->mult_work;
548 
549   for (i=0; i<mbs; i++) {
550     n     = ii[1] - ii[0]; ncols = n*bs;
551     workt = work; idx=aj+ii[0];
552 
553     /* upper triangular part */
554     for (j=0; j<n; j++) {
555       xb = x_ptr + bs*(*idx++);
556       for (k=0; k<bs; k++) workt[k] = xb[k];
557       workt += bs;
558     }
559     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
560     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
561 
562     /* strict lower triangular part */
563     idx = aj+ii[0];
564     if (*idx == i){
565       ncols -= bs; v += bs2; idx++; n--;
566     }
567 
568     if (ncols > 0){
569       workt = work;
570       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
571       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
572       for (j=0; j<n; j++) {
573         zb = z_ptr + bs*(*idx++);
574         for (k=0; k<bs; k++) zb[k] += workt[k] ;
575         workt += bs;
576       }
577     }
578     x += bs; v += n*bs2; z += bs; ii++;
579   }
580 
581   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
582   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
583   PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
584   PetscFunctionReturn(0);
585 }
586 
587 #undef __FUNC__
588 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
589 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
590 {
591   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
592   Scalar          *x,*y,*z,*xb,x1;
593   MatScalar       *v;
594   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
595 
596   PetscFunctionBegin;
597   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
598   if (yy != xx) {
599     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   } else {
601     y = x;
602   }
603   if (zz != yy) {
604     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
605     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
606   } else {
607     z = y;
608   }
609 
610   v  = a->a;
611   xb = x;
612 
613   for (i=0; i<mbs; i++) {
614     n  = ai[1] - ai[0];  /* length of i_th row of A */
615     x1 = xb[0];
616     ib = aj + *ai;
617     jmin = 0;
618     if (*ib == i) {            /* (diag of A)*x */
619       z[i] += *v++ * x[*ib++]; jmin++;
620     }
621     for (j=jmin; j<n; j++) {
622       cval    = *ib;
623       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
624       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
625     }
626     xb++; ai++;
627   }
628 
629   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
630   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
631   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
632 
633   PetscLogFlops(2*(a->s_nz*2 - A->m));
634   PetscFunctionReturn(0);
635 }
636 
637 #undef __FUNC__
638 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
639 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
640 {
641   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
642   Scalar          *x,*y,*z,*xb,x1,x2;
643   MatScalar       *v;
644   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
645 
646   PetscFunctionBegin;
647   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
648   if (yy != xx) {
649     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
650   } else {
651     y = x;
652   }
653   if (zz != yy) {
654     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
655     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
656   } else {
657     z = y;
658   }
659 
660   v     = a->a;
661   xb = x;
662 
663   for (i=0; i<mbs; i++) {
664     n  = ai[1] - ai[0]; /* length of i_th block row of A */
665     x1 = xb[0]; x2 = xb[1];
666     ib = aj + *ai;
667     jmin = 0;
668     if (*ib == i){      /* (diag of A)*x */
669       z[2*i]   += v[0]*x1 + v[2]*x2;
670       z[2*i+1] += v[2]*x1 + v[3]*x2;
671       v += 4; jmin++;
672     }
673     for (j=jmin; j<n; j++) {
674       /* (strict lower triangular part of A)*x  */
675       cval       = ib[j]*2;
676       z[cval]     += v[0]*x1 + v[1]*x2;
677       z[cval+1]   += v[2]*x1 + v[3]*x2;
678       /* (strict upper triangular part of A)*x  */
679       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
680       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
681       v  += 4;
682     }
683     xb +=2; ai++;
684   }
685 
686   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
687   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
688   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
689 
690   PetscLogFlops(4*(a->s_nz*2 - A->m));
691   PetscFunctionReturn(0);
692 }
693 
694 #undef __FUNC__
695 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
696 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
697 {
698   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
699   Scalar          *x,*y,*z,*xb,x1,x2,x3;
700   MatScalar       *v;
701   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
702 
703   PetscFunctionBegin;
704   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
705   if (yy != xx) {
706     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
707   } else {
708     y = x;
709   }
710   if (zz != yy) {
711     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
712     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
713   } else {
714     z = y;
715   }
716 
717   v     = a->a;
718   xb = x;
719 
720   for (i=0; i<mbs; i++) {
721     n  = ai[1] - ai[0]; /* length of i_th block row of A */
722     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
723     ib = aj + *ai;
724     jmin = 0;
725     if (*ib == i){     /* (diag of A)*x */
726      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
727      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
728      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
729      v += 9; jmin++;
730     }
731     for (j=jmin; j<n; j++) {
732       /* (strict lower triangular part of A)*x  */
733       cval       = ib[j]*3;
734       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
735       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
736       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
737       /* (strict upper triangular part of A)*x  */
738       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
739       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
740       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
741       v  += 9;
742     }
743     xb +=3; ai++;
744   }
745 
746   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
747   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
748   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
749 
750   PetscLogFlops(18*(a->s_nz*2 - A->m));
751   PetscFunctionReturn(0);
752 }
753 
754 #undef __FUNC__
755 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
756 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
757 {
758   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
759   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
760   MatScalar       *v;
761   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
762 
763   PetscFunctionBegin;
764   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
765   if (yy != xx) {
766     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
767   } else {
768     y = x;
769   }
770   if (zz != yy) {
771     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
772     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
773   } else {
774     z = y;
775   }
776 
777   v     = a->a;
778   xb = x;
779 
780   for (i=0; i<mbs; i++) {
781     n  = ai[1] - ai[0]; /* length of i_th block row of A */
782     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
783     ib = aj + *ai;
784     jmin = 0;
785     if (*ib == i){      /* (diag of A)*x */
786       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
787       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
788       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
789       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
790       v += 16; jmin++;
791     }
792     for (j=jmin; j<n; j++) {
793       /* (strict lower triangular part of A)*x  */
794       cval       = ib[j]*4;
795       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
796       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
797       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
798       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
799       /* (strict upper triangular part of A)*x  */
800       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
801       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
802       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
803       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
804       v  += 16;
805     }
806     xb +=4; ai++;
807   }
808 
809   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
810   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
811   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
812 
813   PetscLogFlops(32*(a->s_nz*2 - A->m));
814   PetscFunctionReturn(0);
815 }
816 
817 #undef __FUNC__
818 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
819 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
820 {
821   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
822   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
823   MatScalar       *v;
824   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
825 
826   PetscFunctionBegin;
827   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
828   if (yy != xx) {
829     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
830   } else {
831     y = x;
832   }
833   if (zz != yy) {
834     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
835     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
836   } else {
837     z = y;
838   }
839 
840   v     = a->a;
841   xb = x;
842 
843   for (i=0; i<mbs; i++) {
844     n  = ai[1] - ai[0]; /* length of i_th block row of A */
845     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
846     ib = aj + *ai;
847     jmin = 0;
848     if (*ib == i){      /* (diag of A)*x */
849       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
850       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
851       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
852       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
853       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
854       v += 25; jmin++;
855     }
856     for (j=jmin; j<n; j++) {
857       /* (strict lower triangular part of A)*x  */
858       cval       = ib[j]*5;
859       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
860       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
861       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
862       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
863       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
864       /* (strict upper triangular part of A)*x  */
865       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
866       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
867       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
868       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
869       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
870       v  += 25;
871     }
872     xb +=5; ai++;
873   }
874 
875   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
876   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
877   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
878 
879   PetscLogFlops(50*(a->s_nz*2 - A->m));
880   PetscFunctionReturn(0);
881 }
882 #undef __FUNC__
883 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
884 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
885 {
886   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
887   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
888   MatScalar       *v;
889   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
890 
891   PetscFunctionBegin;
892   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893   if (yy != xx) {
894     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
895   } else {
896     y = x;
897   }
898   if (zz != yy) {
899     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
900     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
901   } else {
902     z = y;
903   }
904 
905   v     = a->a;
906   xb = x;
907 
908   for (i=0; i<mbs; i++) {
909     n  = ai[1] - ai[0]; /* length of i_th block row of A */
910     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
911     ib = aj + *ai;
912     jmin = 0;
913     if (*ib == i){     /* (diag of A)*x */
914       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
915       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
916       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
917       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
918       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
919       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
920       v += 36; jmin++;
921     }
922     for (j=jmin; j<n; j++) {
923       /* (strict lower triangular part of A)*x  */
924       cval       = ib[j]*6;
925       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
926       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
927       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
928       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
929       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
930       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
931       /* (strict upper triangular part of A)*x  */
932       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
933       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
934       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
935       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
936       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
937       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
938       v  += 36;
939     }
940     xb +=6; ai++;
941   }
942 
943   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
944   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
945   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
946 
947   PetscLogFlops(72*(a->s_nz*2 - A->m));
948   PetscFunctionReturn(0);
949 }
950 
951 #undef __FUNC__
952 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
953 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
954 {
955   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
956   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
957   MatScalar       *v;
958   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
959 
960   PetscFunctionBegin;
961   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
962   if (yy != xx) {
963     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
964   } else {
965     y = x;
966   }
967   if (zz != yy) {
968     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
969     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
970   } else {
971     z = y;
972   }
973 
974   v     = a->a;
975   xb = x;
976 
977   for (i=0; i<mbs; i++) {
978     n  = ai[1] - ai[0]; /* length of i_th block row of A */
979     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
980     ib = aj + *ai;
981     jmin = 0;
982     if (*ib == i){     /* (diag of A)*x */
983       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
984       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
985       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
986       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
987       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
988       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
989       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
990       v += 49; jmin++;
991     }
992     for (j=jmin; j<n; j++) {
993       /* (strict lower triangular part of A)*x  */
994       cval       = ib[j]*7;
995       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
996       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
997       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
998       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
999       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1000       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1001       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1002       /* (strict upper triangular part of A)*x  */
1003       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1004       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1005       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1006       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1007       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1008       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1009       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1010       v  += 49;
1011     }
1012     xb +=7; ai++;
1013   }
1014 
1015   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1016   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1017   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1018 
1019   PetscLogFlops(98*(a->s_nz*2 - A->m));
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 #undef __FUNC__
1024 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1025 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1026 {
1027   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1028   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1029   MatScalar       *v;
1030   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1031   int             ncols,k;
1032 
1033   PetscFunctionBegin;
1034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1035   if (yy != xx) {
1036     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1037   } else {
1038     y = x;
1039   }
1040   if (zz != yy) {
1041     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1042     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1043   } else {
1044     z = y;
1045   }
1046 
1047   aj   = a->j;
1048   v    = a->a;
1049   ii   = a->i;
1050 
1051   if (!a->mult_work) {
1052     ierr = PetscMalloc((A->m+1)*sizeof(Scalar),&a->mult_work);CHKERRQ(ierr);
1053   }
1054   work = a->mult_work;
1055 
1056 
1057   for (i=0; i<mbs; i++) {
1058     n     = ii[1] - ii[0]; ncols = n*bs;
1059     workt = work; idx=aj+ii[0];
1060 
1061     /* upper triangular part */
1062     for (j=0; j<n; j++) {
1063       xb = x_ptr + bs*(*idx++);
1064       for (k=0; k<bs; k++) workt[k] = xb[k];
1065       workt += bs;
1066     }
1067     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1068     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1069 
1070     /* strict lower triangular part */
1071     idx = aj+ii[0];
1072     if (*idx == i){
1073       ncols -= bs; v += bs2; idx++; n--;
1074     }
1075     if (ncols > 0){
1076       workt = work;
1077       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1078       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1079       for (j=0; j<n; j++) {
1080         zb = z_ptr + bs*(*idx++);
1081         /* idx++; */
1082         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1083         workt += bs;
1084       }
1085     }
1086 
1087     x += bs; v += n*bs2; z += bs; ii++;
1088   }
1089 
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1092   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1093 
1094   PetscLogFlops(2*(a->s_nz*2 - A->m));
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 #undef __FUNC__
1099 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1100 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1101 {
1102   PetscFunctionBegin;
1103   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1104   /* PetscFunctionReturn(0); */
1105 }
1106 
1107 #undef __FUNC__
1108 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1109 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1110 
1111 {
1112   PetscFunctionBegin;
1113   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1114   /* PetscFunctionReturn(0); */
1115 }
1116 
1117 #undef __FUNC__
1118 #define __FUNC__ "MatScale_SeqSBAIJ"
1119 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1120 {
1121   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1122   int         one = 1,totalnz = a->bs2*a->s_nz;
1123 
1124   PetscFunctionBegin;
1125   BLscal_(&totalnz,alpha,a->a,&one);
1126   PetscLogFlops(totalnz);
1127   PetscFunctionReturn(0);
1128 }
1129 
1130 #undef __FUNC__
1131 #define __FUNC__ "MatNorm_SeqSBAIJ"
1132 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1133 {
1134   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1135   MatScalar   *v = a->a;
1136   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1137   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1138   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1139 
1140   PetscFunctionBegin;
1141   if (type == NORM_FROBENIUS) {
1142     for (k=0; k<mbs; k++){
1143       jmin = a->i[k]; jmax = a->i[k+1];
1144       col  = aj + jmin;
1145       if (*col == k){         /* diagonal block */
1146         for (i=0; i<bs2; i++){
1147 #if defined(PETSC_USE_COMPLEX)
1148           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1149 #else
1150           sum_diag += (*v)*(*v); v++;
1151 #endif
1152         }
1153         jmin++;
1154       }
1155       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1156         for (i=0; i<bs2; i++){
1157 #if defined(PETSC_USE_COMPLEX)
1158           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1159 #else
1160           sum_off += (*v)*(*v); v++;
1161 #endif
1162         }
1163       }
1164     }
1165     *norm = sqrt(sum_diag + 2*sum_off);
1166 
1167   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1168     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1169     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1170     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1171     for (i=0; i<mbs; i++) {
1172       jl[i] = mbs; il[0] = 0;
1173     }
1174 
1175     *norm = 0.0;
1176     for (k=0; k<mbs; k++) { /* k_th block row */
1177       for (j=0; j<bs; j++) sum[j]=0.0;
1178 
1179       /*-- col sum --*/
1180       i = jl[k]; /* first |A(i,k)| to be added */
1181       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1182                   at step k */
1183       while (i<mbs){
1184         nexti = jl[i];  /* next block row to be added */
1185         ik    = il[i];  /* block index of A(i,k) in the array a */
1186         for (j=0; j<bs; j++){
1187           v = a->a + ik*bs2 + j*bs;
1188           for (k1=0; k1<bs; k1++) {
1189             sum[j] += PetscAbsScalar(*v); v++;
1190           }
1191         }
1192         /* update il, jl */
1193         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1194         jmax = a->i[i+1];
1195         if (jmin < jmax){
1196           il[i] = jmin;
1197           j   = a->j[jmin];
1198           jl[i] = jl[j]; jl[j]=i;
1199         }
1200         i = nexti;
1201       }
1202 
1203       /*-- row sum --*/
1204       jmin = a->i[k]; jmax = a->i[k+1];
1205       for (i=jmin; i<jmax; i++) {
1206         for (j=0; j<bs; j++){
1207           v = a->a + i*bs2 + j;
1208           for (k1=0; k1<bs; k1++){
1209             sum[j] += PetscAbsScalar(*v);
1210             v   += bs;
1211           }
1212         }
1213       }
1214       /* add k_th block row to il, jl */
1215       col = aj+jmin;
1216       if (*col == k) jmin++;
1217       if (jmin < jmax){
1218         il[k] = jmin;
1219         j   = a->j[jmin];
1220         jl[k] = jl[j]; jl[j] = k;
1221       }
1222       for (j=0; j<bs; j++){
1223         if (sum[j] > *norm) *norm = sum[j];
1224       }
1225     }
1226     ierr = PetscFree(il);CHKERRQ(ierr);
1227     ierr = PetscFree(jl);CHKERRQ(ierr);
1228     ierr = PetscFree(sum);CHKERRQ(ierr);
1229   } else {
1230     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1231   }
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 #undef __FUNC__
1236 #define __FUNC__ "MatEqual_SeqSBAIJ"
1237 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1238 {
1239   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1240   int          ierr;
1241   PetscTruth   flag;
1242 
1243   PetscFunctionBegin;
1244   ierr = PetscTypeCompare((PetscObject)A,MATSEQBAIJ,&flag);CHKERRQ(ierr);
1245   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1246 
1247   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1248   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1249     *flg = PETSC_FALSE;
1250     PetscFunctionReturn(0);
1251   }
1252 
1253   /* if the a->i are the same */
1254   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1255   if (*flg == PETSC_FALSE) {
1256     PetscFunctionReturn(0);
1257   }
1258 
1259   /* if a->j are the same */
1260   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1261   if (*flg == PETSC_FALSE) {
1262     PetscFunctionReturn(0);
1263   }
1264   /* if a->a are the same */
1265   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1266   PetscFunctionReturn(0);
1267 
1268 }
1269 
1270 #undef __FUNC__
1271 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1272 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1273 {
1274   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1275   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1276   Scalar      *x,zero = 0.0;
1277   MatScalar   *aa,*aa_j;
1278 
1279   PetscFunctionBegin;
1280   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1281   bs   = a->bs;
1282   aa   = a->a;
1283   ai   = a->i;
1284   aj   = a->j;
1285   ambs = a->mbs;
1286   bs2  = a->bs2;
1287 
1288   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1289   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1290   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1291   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1292   for (i=0; i<ambs; i++) {
1293     j=ai[i];
1294     if (aj[j] == i) {             /* if this is a diagonal element */
1295       row  = i*bs;
1296       aa_j = aa + j*bs2;
1297       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1298     }
1299   }
1300   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNC__
1305 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1306 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1307 {
1308   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1309   Scalar      *l,*r,x,*li,*ri;
1310   MatScalar   *aa,*v;
1311   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1312 
1313   PetscFunctionBegin;
1314   ai  = a->i;
1315   aj  = a->j;
1316   aa  = a->a;
1317   m   = A->m;
1318   bs  = a->bs;
1319   mbs = a->mbs;
1320   bs2 = a->bs2;
1321 
1322   if (ll != rr) {
1323     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1324   }
1325   if (ll) {
1326     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1327     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1328     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1329     for (i=0; i<mbs; i++) { /* for each block row */
1330       M  = ai[i+1] - ai[i];
1331       li = l + i*bs;
1332       v  = aa + bs2*ai[i];
1333       for (j=0; j<M; j++) { /* for each block */
1334         for (k=0; k<bs2; k++) {
1335           (*v++) *= li[k%bs];
1336         }
1337 #ifdef CONT
1338         /* will be used to replace the above loop */
1339         ri = l + bs*aj[ai[i]+j];
1340         for (k=0; k<bs; k++) { /* column value */
1341           x = ri[k];
1342           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1343         }
1344 #endif
1345 
1346       }
1347     }
1348     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1349     PetscLogFlops(2*a->s_nz);
1350   }
1351   /* will be deleted */
1352   if (rr) {
1353     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1354     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1355     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1356     for (i=0; i<mbs; i++) { /* for each block row */
1357       M  = ai[i+1] - ai[i];
1358       v  = aa + bs2*ai[i];
1359       for (j=0; j<M; j++) { /* for each block */
1360         ri = r + bs*aj[ai[i]+j];
1361         for (k=0; k<bs; k++) {
1362           x = ri[k];
1363           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1364         }
1365       }
1366     }
1367     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1368     PetscLogFlops(a->s_nz);
1369   }
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 #undef __FUNC__
1374 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1375 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1376 {
1377   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1378 
1379   PetscFunctionBegin;
1380   info->rows_global    = (double)A->m;
1381   info->columns_global = (double)A->m;
1382   info->rows_local     = (double)A->m;
1383   info->columns_local  = (double)A->m;
1384   info->block_size     = a->bs2;
1385   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1386   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1387   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1388   info->assemblies   = A->num_ass;
1389   info->mallocs      = a->reallocs;
1390   info->memory       = A->mem;
1391   if (A->factor) {
1392     info->fill_ratio_given  = A->info.fill_ratio_given;
1393     info->fill_ratio_needed = A->info.fill_ratio_needed;
1394     info->factor_mallocs    = A->info.factor_mallocs;
1395   } else {
1396     info->fill_ratio_given  = 0;
1397     info->fill_ratio_needed = 0;
1398     info->factor_mallocs    = 0;
1399   }
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 
1404 #undef __FUNC__
1405 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1406 int MatZeroEntries_SeqSBAIJ(Mat A)
1407 {
1408   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1409   int         ierr;
1410 
1411   PetscFunctionBegin;
1412   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1413   PetscFunctionReturn(0);
1414 }
1415 
1416 #undef __FUNC__
1417 #define __FUNC__ "MatGetRowMax_SeqSBAIJ"
1418 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1419 {
1420   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1421   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1422   PetscReal    atmp;
1423   MatScalar    *aa;
1424   Scalar       zero = 0.0,*x;
1425   int          ncols,brow,bcol,krow,kcol;
1426 
1427   PetscFunctionBegin;
1428   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1429   bs   = a->bs;
1430   aa   = a->a;
1431   ai   = a->i;
1432   aj   = a->j;
1433   mbs = a->mbs;
1434 
1435   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1436   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1437   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1438   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1439   for (i=0; i<mbs; i++) {
1440     ncols = ai[1] - ai[0]; ai++;
1441     brow  = bs*i;
1442     for (j=0; j<ncols; j++){
1443       bcol = bs*(*aj);
1444       for (kcol=0; kcol<bs; kcol++){
1445         col = bcol + kcol;      /* col index */
1446         for (krow=0; krow<bs; krow++){
1447           atmp = PetscAbsScalar(*aa); aa++;
1448           row = brow + krow;    /* row index */
1449           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1450           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1451           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1452         }
1453       }
1454       aj++;
1455     }
1456   }
1457   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1458   PetscFunctionReturn(0);
1459 }
1460