xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision f102d2d3ba4052999c8b59e4b1ab65578f0fd3c9)
1 /*$Id: sbaij2.c,v 1.6 2000/08/16 15:15:02 balay Exp hzhang $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
16   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
17   int         start,end,*ai,*aj,bs,*nidx2;
18   PetscBT     table;
19 
20   PetscFunctionBegin;
21   SETERRQ(1,1,"Function not yet written for SBAIJ format");
22   PetscFunctionReturn(0);
23 }
24 
25 #undef __FUNC__
26 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
27 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
28 {
29   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
30   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
31   int          row,mat_i,*mat_j,tcol,*mat_ilen;
32   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
33   int          *aj = a->j,*ai = a->i;
34   MatScalar    *mat_a;
35   Mat          C;
36   PetscTruth   flag;
37 
38   PetscFunctionBegin;
39 
40   if (isrow != iscol) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
41   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
42   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
43 
44   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
45   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
46 
47   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
48   ssmap = smap;
49   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
50   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
51   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
52   /* determine lens of each row */
53   for (i=0; i<nrows; i++) {
54     kstart  = ai[irow[i]];
55     kend    = kstart + a->ilen[irow[i]];
56     lens[i] = 0;
57       for (k=kstart; k<kend; k++) {
58         if (ssmap[aj[k]]) {
59           lens[i]++;
60         }
61       }
62     }
63   /* Create and fill new matrix */
64   if (scall == MAT_REUSE_MATRIX) {
65     c = (Mat_SeqSBAIJ *)((*B)->data);
66 
67     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
68     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
69     if (flag == PETSC_FALSE) {
70       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
71     }
72     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
73     C = *B;
74   } else {
75     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
76   }
77   c = (Mat_SeqSBAIJ *)(C->data);
78   for (i=0; i<nrows; i++) {
79     row    = irow[i];
80     kstart = ai[row];
81     kend   = kstart + a->ilen[row];
82     mat_i  = c->i[i];
83     mat_j  = c->j + mat_i;
84     mat_a  = c->a + mat_i*bs2;
85     mat_ilen = c->ilen + i;
86     for (k=kstart; k<kend; k++) {
87       if ((tcol=ssmap[a->j[k]])) {
88         *mat_j++ = tcol - 1;
89         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
90         mat_a   += bs2;
91         (*mat_ilen)++;
92       }
93     }
94   }
95 
96   /* Free work space */
97   ierr = PetscFree(smap);CHKERRQ(ierr);
98   ierr = PetscFree(lens);CHKERRQ(ierr);
99   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
100   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
101 
102   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
103   *B = C;
104   PetscFunctionReturn(0);
105 }
106 
107 #undef __FUNC__
108 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
109 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
110 {
111   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
112   IS          is1;
113   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
114 
115   PetscFunctionBegin;
116   if (isrow != iscol) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
117 
118   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
119   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
120 
121   /* Verify if the indices corespond to each element in a block
122    and form the IS with compressed IS */
123   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
124   iary = vary + a->mbs;
125   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
126   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
127 
128   count = 0;
129   for (i=0; i<a->mbs; i++) {
130     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
131     if (vary[i]==bs) iary[count++] = i;
132   }
133   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
134 
135   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
136   ierr = PetscFree(vary);CHKERRQ(ierr);
137 
138   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
139   ISDestroy(is1);
140   PetscFunctionReturn(0);
141 }
142 
143 #undef __FUNC__
144 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
145 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
146 {
147   int ierr,i;
148 
149   PetscFunctionBegin;
150   if (scall == MAT_INITIAL_MATRIX) {
151     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
152   }
153 
154   for (i=0; i<n; i++) {
155     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
156   }
157   PetscFunctionReturn(0);
158 }
159 
160 /* -------------------------------------------------------*/
161 /* Should check that shapes of vectors and matrices match */
162 /* -------------------------------------------------------*/
163 #include "petscblaslapack.h"
164 
165 #undef __FUNC__
166 #define __FUNC__ "MatMult_SeqSBAIJ_1"
167 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
168 {
169   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
170   Scalar          *x,*z,*xb,x1,zero=0.0;
171   MatScalar       *v;
172   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
173 
174   PetscFunctionBegin;
175   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
178 
179   v  = a->a;
180   xb = x;
181 
182   for (i=0; i<mbs; i++) {
183     n  = ai[1] - ai[0];  /* length of i_th row of A */
184     x1 = xb[0];
185     ib = aj + *ai;
186     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
187     for (j=1; j<n; j++) {
188       cval    = *ib;
189       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
190       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
191     }
192     xb++; ai++;
193   }
194 
195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
196   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
197   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
198   PetscFunctionReturn(0);
199 }
200 
201 #undef __FUNC__
202 #define __FUNC__ "MatMult_SeqSBAIJ_2"
203 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
204 {
205   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
206   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
207   MatScalar       *v;
208   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
209 
210 
211   PetscFunctionBegin;
212   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
213   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
214   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
215 
216   v     = a->a;
217   xb = x;
218 
219   for (i=0; i<mbs; i++) {
220     n  = ai[1] - ai[0]; /* length of i_th block row of A */
221     x1 = xb[0]; x2 = xb[1];
222     ib = aj + *ai;
223     /* (diag of A)*x */
224     z[2*i]   += v[0]*x1 + v[2]*x2;
225     z[2*i+1] += v[2]*x1 + v[3]*x2;
226     v += 4;
227     for (j=1; j<n; j++) {
228       /* (strict lower triangular part of A)*x  */
229       cval       = ib[j]*2;
230       z[cval]     += v[0]*x1 + v[1]*x2;
231       z[cval+1]   += v[2]*x1 + v[3]*x2;
232       /* (strict upper triangular part of A)*x  */
233       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
234       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
235       v  += 4;
236     }
237     xb +=2; ai++;
238   }
239 
240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
241   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
242   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNC__
247 #define __FUNC__ "MatMult_SeqSBAIJ_3"
248 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
249 {
250   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
251   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
252   MatScalar       *v;
253   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
254 
255 
256   PetscFunctionBegin;
257   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
258   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
259   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
260 
261   v     = a->a;
262   xb = x;
263 
264   for (i=0; i<mbs; i++) {
265     n  = ai[1] - ai[0]; /* length of i_th block row of A */
266     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
267     ib = aj + *ai;
268     /* (diag of A)*x */
269     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
270     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
271     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
272     v += 9;
273     for (j=1; j<n; j++) {
274       /* (strict lower triangular part of A)*x  */
275       cval       = ib[j]*3;
276       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
277       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
278       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
279       /* (strict upper triangular part of A)*x  */
280       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
281       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
282       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
283       v  += 9;
284     }
285     xb +=3; ai++;
286   }
287 
288   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
289   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
290   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
291   PetscFunctionReturn(0);
292 }
293 
294 #undef __FUNC__
295 #define __FUNC__ "MatMult_SeqSBAIJ_4"
296 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
297 {
298   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
299   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
300   MatScalar       *v;
301   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
302 
303   PetscFunctionBegin;
304   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
305   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
307 
308   v     = a->a;
309   xb = x;
310 
311   for (i=0; i<mbs; i++) {
312     n  = ai[1] - ai[0]; /* length of i_th block row of A */
313     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
314     ib = aj + *ai;
315     /* (diag of A)*x */
316     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
317     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
318     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
319     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
320     v += 16;
321     for (j=1; j<n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval       = ib[j]*4;
324       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
325       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
326       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
327       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328       /* (strict upper triangular part of A)*x  */
329       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
330       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
331       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
332       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
333       v  += 16;
334     }
335     xb +=4; ai++;
336   }
337 
338   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
339   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
340   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
341   PetscFunctionReturn(0);
342 }
343 
344 #undef __FUNC__
345 #define __FUNC__ "MatMult_SeqSBAIJ_5"
346 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
347 {
348   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
349   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
350   MatScalar       *v;
351   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
352 
353   PetscFunctionBegin;
354   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
355   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
357 
358   v     = a->a;
359   xb = x;
360 
361   for (i=0; i<mbs; i++) {
362     n  = ai[1] - ai[0]; /* length of i_th block row of A */
363     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
364     ib = aj + *ai;
365     /* (diag of A)*x */
366     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
367     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
368     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
369     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
370     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
371     v += 25;
372     for (j=1; j<n; j++) {
373       /* (strict lower triangular part of A)*x  */
374       cval       = ib[j]*5;
375       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
376       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
377       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
378       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
379       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
380       /* (strict upper triangular part of A)*x  */
381       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
382       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
383       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
384       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
385       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
386       v  += 25;
387     }
388     xb +=5; ai++;
389   }
390 
391   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
392   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
393   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
394   PetscFunctionReturn(0);
395 }
396 
397 
398 #undef __FUNC__
399 #define __FUNC__ "MatMult_SeqSBAIJ_6"
400 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
401 {
402   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
403   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
404   MatScalar       *v;
405   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
406 
407   PetscFunctionBegin;
408   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
409   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v     = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n  = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
418     ib = aj + *ai;
419     /* (diag of A)*x */
420     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
421     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
422     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
423     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
424     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
425     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
426     v += 36;
427     for (j=1; j<n; j++) {
428       /* (strict lower triangular part of A)*x  */
429       cval       = ib[j]*6;
430       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
431       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
432       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
433       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
434       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
435       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
436       /* (strict upper triangular part of A)*x  */
437       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
438       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
439       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
440       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
441       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
442       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
443       v  += 36;
444     }
445     xb +=6; ai++;
446   }
447 
448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
449   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
450   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
451   PetscFunctionReturn(0);
452 }
453 #undef __FUNC__
454 #define __FUNC__ "MatMult_SeqSBAIJ_7"
455 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
456 {
457   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
458   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
459   MatScalar       *v;
460   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
461 
462   PetscFunctionBegin;
463   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
465   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
466 
467   v     = a->a;
468   xb = x;
469 
470   for (i=0; i<mbs; i++) {
471     n  = ai[1] - ai[0]; /* length of i_th block row of A */
472     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
473     ib = aj + *ai;
474     /* (diag of A)*x */
475     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
476     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
477     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
478     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
479     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
480     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
481     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
482     v += 49;
483     for (j=1; j<n; j++) {
484       /* (strict lower triangular part of A)*x  */
485       cval       = ib[j]*7;
486       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
487       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
488       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
489       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
490       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
491       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
492       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
493       /* (strict upper triangular part of A)*x  */
494       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
495       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
496       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
497       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
498       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
499       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
500       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
501       v  += 49;
502     }
503     xb +=7; ai++;
504   }
505   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
506   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
507   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
508   PetscFunctionReturn(0);
509 }
510 
511 /*
512     This will not work with MatScalar == float because it calls the BLAS
513 */
514 #undef __FUNC__
515 #define __FUNC__ "MatMult_SeqSBAIJ_N"
516 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
517 {
518   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
519   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
520   MatScalar       *v;
521   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
522   int             ncols,k;
523 
524   PetscFunctionBegin;
525   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
527   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
528 
529   aj   = a->j;
530   v    = a->a;
531   ii   = a->i;
532 
533   if (!a->mult_work) {
534     k = a->m;
535     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
536   }
537   work = a->mult_work;
538 
539   for (i=0; i<mbs; i++) {
540     n     = ii[1] - ii[0]; ncols = n*bs;
541     workt = work; idx=aj+ii[0];
542 
543     /* upper triangular part */
544     for (j=0; j<n; j++) {
545       /* col = bs*(*idx); */
546       xb = x_ptr + bs*(*idx++);
547       for (k=0; k<bs; k++) workt[k] = xb[k];
548       workt += bs;
549     }
550     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
551     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
552 
553     /* strict lower triangular part */
554     ncols -= bs;
555     if (ncols > 0){
556       workt = work;
557       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
558       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
559 
560       idx=aj+ii[0]+1;
561       for (j=1; j<n; j++) {
562         zb = z_ptr + bs*(*idx);
563         idx++;
564         for (k=0; k<bs; k++) zb[k] += workt[k] ;
565         workt += bs;
566       }
567     }
568 
569     x += bs; v += n*bs2; z += bs; ii++;
570   }
571 
572   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
575   PetscFunctionReturn(0);
576 }
577 
578 #undef __FUNC__
579 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
580 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
581 {
582   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
583   Scalar          *x,*y,*z,*xb,x1;
584   MatScalar       *v;
585   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
586 
587   PetscFunctionBegin;
588   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
589   if (yy != xx) {
590     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
591   } else {
592     y = x;
593   }
594   if (zz != yy) {
595     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
596     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
597   } else {
598     z = y;
599   }
600 
601   v  = a->a;
602   xb = x;
603 
604   for (i=0; i<mbs; i++) {
605     n  = ai[1] - ai[0];  /* length of i_th row of A */
606     x1 = xb[0];
607     ib = aj + *ai;
608     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
609     for (j=1; j<n; j++) {
610       cval    = *ib;
611       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
612       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
613     }
614     xb++; ai++;
615   }
616 
617   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
618   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
619   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
620 
621   PLogFlops(2*(a->s_nz*2 - a->m));
622   PetscFunctionReturn(0);
623 }
624 
625 #undef __FUNC__
626 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
627 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
628 {
629   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
630   Scalar          *x,*y,*z,*xb,x1,x2;
631   MatScalar       *v;
632   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
633 
634   PetscFunctionBegin;
635   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
636   if (yy != xx) {
637     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
638   } else {
639     y = x;
640   }
641   if (zz != yy) {
642     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
643     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
644   } else {
645     z = y;
646   }
647 
648   v     = a->a;
649   xb = x;
650 
651   for (i=0; i<mbs; i++) {
652     n  = ai[1] - ai[0]; /* length of i_th block row of A */
653     x1 = xb[0]; x2 = xb[1];
654     ib = aj + *ai;
655     /* (diag of A)*x */
656     z[2*i]   += v[0]*x1 + v[2]*x2;
657     z[2*i+1] += v[2]*x1 + v[3]*x2;
658     v += 4;
659     for (j=1; j<n; j++) {
660       /* (strict lower triangular part of A)*x  */
661       cval       = ib[j]*2;
662       z[cval]     += v[0]*x1 + v[1]*x2;
663       z[cval+1]   += v[2]*x1 + v[3]*x2;
664       /* (strict upper triangular part of A)*x  */
665       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
666       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
667       v  += 4;
668     }
669     xb +=2; ai++;
670   }
671 
672   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
673   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
674   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
675 
676   PLogFlops(4*(a->s_nz*2 - a->m));
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNC__
681 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
682 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
683 {
684   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
685   Scalar          *x,*y,*z,*xb,x1,x2,x3;
686   MatScalar       *v;
687   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
688 
689   PetscFunctionBegin;
690   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
691   if (yy != xx) {
692     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
693   } else {
694     y = x;
695   }
696   if (zz != yy) {
697     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
698     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
699   } else {
700     z = y;
701   }
702 
703   v     = a->a;
704   xb = x;
705 
706   for (i=0; i<mbs; i++) {
707     n  = ai[1] - ai[0]; /* length of i_th block row of A */
708     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
709     ib = aj + *ai;
710     /* (diag of A)*x */
711     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
712     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
713     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
714     v += 9;
715     for (j=1; j<n; j++) {
716       /* (strict lower triangular part of A)*x  */
717       cval       = ib[j]*3;
718       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
719       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
720       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
721       /* (strict upper triangular part of A)*x  */
722       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
723       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
724       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
725       v  += 9;
726     }
727     xb +=3; ai++;
728   }
729 
730   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
731   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
732   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
733 
734   PLogFlops(18*(a->s_nz*2 - a->m));
735   PetscFunctionReturn(0);
736 }
737 
738 #undef __FUNC__
739 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
740 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
741 {
742   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
743   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
744   MatScalar       *v;
745   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
746 
747   PetscFunctionBegin;
748   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
749   if (yy != xx) {
750     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
751   } else {
752     y = x;
753   }
754   if (zz != yy) {
755     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
756     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
757   } else {
758     z = y;
759   }
760 
761   v     = a->a;
762   xb = x;
763 
764   for (i=0; i<mbs; i++) {
765     n  = ai[1] - ai[0]; /* length of i_th block row of A */
766     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
767     ib = aj + *ai;
768     /* (diag of A)*x */
769     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
770     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
771     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
772     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
773     v += 16;
774     for (j=1; j<n; j++) {
775       /* (strict lower triangular part of A)*x  */
776       cval       = ib[j]*4;
777       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
778       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
779       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
780       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
781       /* (strict upper triangular part of A)*x  */
782       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
783       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
784       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
785       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
786       v  += 16;
787     }
788     xb +=4; ai++;
789   }
790 
791   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
792   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
793   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
794 
795   PLogFlops(32*(a->s_nz*2 - a->m));
796   PetscFunctionReturn(0);
797 }
798 
799 #undef __FUNC__
800 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
801 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
802 {
803   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
804   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
805   MatScalar       *v;
806   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
807 
808   PetscFunctionBegin;
809   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
810   if (yy != xx) {
811     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
812   } else {
813     y = x;
814   }
815   if (zz != yy) {
816     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
817     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
818   } else {
819     z = y;
820   }
821 
822   v     = a->a;
823   xb = x;
824 
825   for (i=0; i<mbs; i++) {
826     n  = ai[1] - ai[0]; /* length of i_th block row of A */
827     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
828     ib = aj + *ai;
829     /* (diag of A)*x */
830     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
831     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
832     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
833     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
834     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
835     v += 25;
836     for (j=1; j<n; j++) {
837       /* (strict lower triangular part of A)*x  */
838       cval       = ib[j]*5;
839       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
840       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
841       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
842       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
843       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
844       /* (strict upper triangular part of A)*x  */
845       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
846       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
847       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
848       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
849       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
850       v  += 25;
851     }
852     xb +=5; ai++;
853   }
854 
855   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
856   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
857   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
858 
859   PLogFlops(50*(a->s_nz*2 - a->m));
860   PetscFunctionReturn(0);
861 }
862 #undef __FUNC__
863 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
864 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
865 {
866   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
867   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
868   MatScalar       *v;
869   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
870 
871   PetscFunctionBegin;
872   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
873   if (yy != xx) {
874     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
875   } else {
876     y = x;
877   }
878   if (zz != yy) {
879     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
880     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
881   } else {
882     z = y;
883   }
884 
885   v     = a->a;
886   xb = x;
887 
888   for (i=0; i<mbs; i++) {
889     n  = ai[1] - ai[0]; /* length of i_th block row of A */
890     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
891     ib = aj + *ai;
892     /* (diag of A)*x */
893     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
894     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
895     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
896     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
897     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
898     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
899     v += 36;
900     for (j=1; j<n; j++) {
901       /* (strict lower triangular part of A)*x  */
902       cval       = ib[j]*6;
903       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
904       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
905       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
906       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
907       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
908       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
909       /* (strict upper triangular part of A)*x  */
910       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
911       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
912       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
913       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
914       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
915       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
916       v  += 36;
917     }
918     xb +=6; ai++;
919   }
920 
921   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
922   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
923   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
924 
925   PLogFlops(72*(a->s_nz*2 - a->m));
926   PetscFunctionReturn(0);
927 }
928 
929 #undef __FUNC__
930 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
931 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
932 {
933   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
934   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
935   MatScalar       *v;
936   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
937 
938   PetscFunctionBegin;
939   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
940   if (yy != xx) {
941     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
942   } else {
943     y = x;
944   }
945   if (zz != yy) {
946     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
947     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
948   } else {
949     z = y;
950   }
951 
952   v     = a->a;
953   xb = x;
954 
955   for (i=0; i<mbs; i++) {
956     n  = ai[1] - ai[0]; /* length of i_th block row of A */
957     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
958     ib = aj + *ai;
959     /* (diag of A)*x */
960     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
961     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
962     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
963     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
964     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
965     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
966     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
967     v += 49;
968     for (j=1; j<n; j++) {
969       /* (strict lower triangular part of A)*x  */
970       cval       = ib[j]*7;
971       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
972       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
973       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
974       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
975       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
976       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
977       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
978       /* (strict upper triangular part of A)*x  */
979       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
980       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
981       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
982       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
983       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
984       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
985       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
986       v  += 49;
987     }
988     xb +=7; ai++;
989   }
990 
991   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
992   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
993   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
994 
995   PLogFlops(98*(a->s_nz*2 - a->m));
996   PetscFunctionReturn(0);
997 }
998 
999 #undef __FUNC__
1000 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1001 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1002 {
1003   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1004   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1005   MatScalar       *v;
1006   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1007   int             ncols,k;
1008 
1009   PetscFunctionBegin;
1010   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1011   if (yy != xx) {
1012     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1013   } else {
1014     y = x;
1015   }
1016   if (zz != yy) {
1017     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1018     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1019   } else {
1020     z = y;
1021   }
1022 
1023   aj   = a->j;
1024   v    = a->a;
1025   ii   = a->i;
1026 
1027   if (!a->mult_work) {
1028     k = a->m;
1029     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1030   }
1031   work = a->mult_work;
1032 
1033 
1034   for (i=0; i<mbs; i++) {
1035     n     = ii[1] - ii[0]; ncols = n*bs;
1036     workt = work; idx=aj+ii[0];
1037 
1038     /* upper triangular part */
1039     for (j=0; j<n; j++) {
1040       /* col = bs*(*idx); */
1041       xb = x_ptr + bs*(*idx++);
1042       for (k=0; k<bs; k++) workt[k] = xb[k];
1043       workt += bs;
1044     }
1045     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1046     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1047 
1048     /* strict lower triangular part */
1049     ncols -= bs;
1050     if (ncols > 0){
1051       workt = work;
1052       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1053       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
1054 
1055       idx=aj+ii[0]+1;
1056       for (j=1; j<n; j++) {
1057         zb = z_ptr + bs*(*idx);
1058         idx++;
1059         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1060         workt += bs;
1061       }
1062     }
1063 
1064     x += bs; v += n*bs2; z += bs; ii++;
1065   }
1066 
1067   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1068   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1069   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1070 
1071   PLogFlops(2*(a->s_nz*2 - a->m));
1072   PetscFunctionReturn(0);
1073 }
1074 
1075 #undef __FUNC__
1076 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1077 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1078 {
1079   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1080   Scalar          *xg,*zg,*zb,zero = 0.0;
1081   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1082   MatScalar       *v;
1083   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
1084   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1085 
1086   PetscFunctionBegin;
1087   SETERRQ(1,1,"Matrix is symmetric. Call MatMult().");
1088   PetscFunctionReturn(0);
1089 }
1090 
1091 #undef __FUNC__
1092 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1093 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1094 
1095 {
1096   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1097   Scalar          *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1098   MatScalar       *v;
1099   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1100 
1101   PetscFunctionBegin;
1102   SETERRQ(1,1,"Matrix is symmetric. Call MatMultAdd().");
1103   PetscFunctionReturn(0);
1104 }
1105 
1106 #undef __FUNC__
1107 #define __FUNC__ "MatScale_SeqSBAIJ"
1108 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1109 {
1110   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1111   int         one = 1,totalnz = a->bs2*a->s_nz;
1112 
1113   PetscFunctionBegin;
1114   BLscal_(&totalnz,alpha,a->a,&one);
1115   PLogFlops(totalnz);
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 #undef __FUNC__
1120 #define __FUNC__ "MatNorm_SeqSBAIJ"
1121 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1122 {
1123   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1124   MatScalar   *v = a->a;
1125   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1126   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs;
1127   int         *jl,*il,jmin,jmax,ierr,nexti,ik;
1128 
1129   PetscFunctionBegin;
1130   if (type == NORM_FROBENIUS) {
1131     for (k=0; k<mbs; k++){
1132       jmin = a->i[k]; jmax = a->i[k+1];
1133       /* diagonal block */
1134       for (i=0; i<bs2; i++){
1135 #if defined(PETSC_USE_COMPLEX)
1136         sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1137 #else
1138         sum_diag += (*v)*(*v); v++;
1139 #endif
1140       }
1141       for (j=jmin+1; j<jmax; j++){  /* off-diagonal blocks */
1142         for (i=0; i<bs2; i++){
1143 #if defined(PETSC_USE_COMPLEX)
1144           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1145 #else
1146           sum_off += (*v)*(*v); v++;
1147 #endif
1148         }
1149       }
1150     }
1151     *norm = sqrt(sum_diag + 2*sum_off);
1152 
1153   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1154     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1155     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1156     sum = (PetscReal*)PetscMalloc(bs*sizeof(PetscReal));CHKPTRQ(sum);
1157     for (i=0; i<mbs; i++) {
1158       jl[i] = mbs; il[0] = 0;
1159     }
1160 
1161     *norm = 0.0;
1162     for (k=0; k<mbs; k++) { /* k_th block row */
1163       for (j=0; j<bs; j++) sum[j]=0.0;
1164 
1165       /*-- col sum --*/
1166       i = jl[k]; /* first |A(i,k)| to be added */
1167       while (i<mbs){
1168         nexti = jl[i];  /* next block row to be added */
1169         ik    = il[i];  /* block index of A(i,k) in the array a */
1170         for (j=0; j<bs; j++){
1171           v = a->a + ik*bs2 + j*bs;
1172           for (k1=0; k1<bs; k1++) {
1173             sum[j] += PetscAbsScalar(*v); v++;
1174           }
1175         }
1176         /* update il, jl */
1177         jmin = ik + 1; jmax = a->i[i+1];
1178         if (jmin < jmax){
1179           il[i] = jmin;
1180           j   = a->j[jmin];
1181           jl[i] = jl[j]; jl[j]=i;
1182         }
1183         i = nexti;
1184       }
1185 
1186       /*-- row sum --*/
1187       jmin = a->i[k]; jmax = a->i[k+1];
1188       for (i=jmin; i<jmax; i++) {
1189         for (j=0; j<bs; j++){
1190           v = a->a + i*bs2 + j;
1191           for (k1=0; k1<bs; k1++){
1192             sum[j] += PetscAbsScalar(*v);
1193             v   += bs;
1194           }
1195         }
1196       }
1197       /* add k_th block row to il, jl */
1198       jmin++;
1199       if (jmin < jmax){
1200         il[k] = jmin;
1201         j   = a->j[jmin];
1202         jl[k] = jl[j]; jl[j] = k;
1203       }
1204       for (j=0; j<bs; j++){
1205         if (sum[j] > *norm) *norm = sum[j];
1206       }
1207     }
1208     ierr = PetscFree(il);CHKERRQ(ierr);
1209     ierr = PetscFree(jl);CHKERRQ(ierr);
1210     ierr = PetscFree(sum);CHKERRQ(ierr);
1211   } else {
1212     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1213   }
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 #undef __FUNC__
1218 #define __FUNC__ "MatEqual_SeqSBAIJ"
1219 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1220 {
1221   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1222   int         ierr;
1223 
1224   PetscFunctionBegin;
1225   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1226 
1227   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1228   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1229     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1230   }
1231 
1232   /* if the a->i are the same */
1233   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1234   if (*flg == PETSC_FALSE) {
1235     PetscFunctionReturn(0);
1236   }
1237 
1238   /* if a->j are the same */
1239   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1240   if (*flg == PETSC_FALSE) {
1241     PetscFunctionReturn(0);
1242   }
1243   /* if a->a are the same */
1244   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1245   PetscFunctionReturn(0);
1246 
1247 }
1248 
1249 #undef __FUNC__
1250 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1251 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1252 {
1253   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1254   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1255   Scalar      *x,zero = 0.0;
1256   MatScalar   *aa,*aa_j;
1257 
1258   PetscFunctionBegin;
1259   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1260   bs   = a->bs;
1261   aa   = a->a;
1262   ai   = a->i;
1263   aj   = a->j;
1264   ambs = a->mbs;
1265   bs2  = a->bs2;
1266 
1267   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1268   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1269   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1270   /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");*/
1271   for (i=0; i<ambs; i++) {
1272     j=ai[i];
1273     if (aj[j] == i) {             /* if this is a diagonal element */
1274       row  = i*bs;
1275       aa_j = aa + j*bs2;
1276       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1277     }
1278   }
1279   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 #undef __FUNC__
1284 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1285 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1286 {
1287   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1288   Scalar      *l,*r,x,*li,*ri;
1289   MatScalar   *aa,*v;
1290   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1291 
1292   PetscFunctionBegin;
1293   ai  = a->i;
1294   aj  = a->j;
1295   aa  = a->a;
1296   m   = a->m;
1297   bs  = a->bs;
1298   mbs = a->mbs;
1299   bs2 = a->bs2;
1300 
1301   if (ll != rr) {
1302     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"For symmetric format, left and right scaling vectors must be same\n");
1303   }
1304   if (ll) {
1305     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1306     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1307     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1308     for (i=0; i<mbs; i++) { /* for each block row */
1309       M  = ai[i+1] - ai[i];
1310       li = l + i*bs;
1311       v  = aa + bs2*ai[i];
1312       for (j=0; j<M; j++) { /* for each block */
1313         for (k=0; k<bs2; k++) {
1314           (*v++) *= li[k%bs];
1315         }
1316 #ifdef CONT
1317         /* will be used to replace the above loop */
1318         ri = l + bs*aj[ai[i]+j];
1319         for (k=0; k<bs; k++) { /* column value */
1320           x = ri[k];
1321           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1322         }
1323 #endif
1324 
1325       }
1326     }
1327     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1328     PLogFlops(2*a->s_nz);
1329   }
1330   /* will be deleted */
1331   if (rr) {
1332     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1333     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1334     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1335     for (i=0; i<mbs; i++) { /* for each block row */
1336       M  = ai[i+1] - ai[i];
1337       v  = aa + bs2*ai[i];
1338       for (j=0; j<M; j++) { /* for each block */
1339         ri = r + bs*aj[ai[i]+j];
1340         for (k=0; k<bs; k++) {
1341           x = ri[k];
1342           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1343         }
1344       }
1345     }
1346     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1347     PLogFlops(a->s_nz);
1348   }
1349   PetscFunctionReturn(0);
1350 }
1351 
1352 #undef __FUNC__
1353 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1354 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1355 {
1356   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1357 
1358   PetscFunctionBegin;
1359   info->rows_global    = (double)a->m;
1360   info->columns_global = (double)a->m;
1361   info->rows_local     = (double)a->m;
1362   info->columns_local  = (double)a->m;
1363   info->block_size     = a->bs2;
1364   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1365   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1366   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1367   info->assemblies   = A->num_ass;
1368   info->mallocs      = a->reallocs;
1369   info->memory       = A->mem;
1370   if (A->factor) {
1371     info->fill_ratio_given  = A->info.fill_ratio_given;
1372     info->fill_ratio_needed = A->info.fill_ratio_needed;
1373     info->factor_mallocs    = A->info.factor_mallocs;
1374   } else {
1375     info->fill_ratio_given  = 0;
1376     info->fill_ratio_needed = 0;
1377     info->factor_mallocs    = 0;
1378   }
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 
1383 #undef __FUNC__
1384 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1385 int MatZeroEntries_SeqSBAIJ(Mat A)
1386 {
1387   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1388   int         ierr;
1389 
1390   PetscFunctionBegin;
1391   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1392   PetscFunctionReturn(0);
1393 }
1394