xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 7843d17afa8c1ec0c9a03b3c2ea6d3e3db42e252)
1 /*$Id: sbaij2.c,v 1.21 2000/10/09 21:03:51 hzhang Exp hzhang $*/
2 
3 #include "petscsys.h"
4 #include "src/mat/impls/baij/seq/baij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/spops.h"
7 #include "src/inline/ilu.h"
8 #include "petscbt.h"
9 #include "sbaij.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   PetscFunctionBegin;
16   SETERRQ(1,"Function not yet written for SBAIJ format");
17   /* PetscFunctionReturn(0); */
18 }
19 
20 #undef __FUNC__
21 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
22 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
23 {
24   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
25   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
26   int          row,mat_i,*mat_j,tcol,*mat_ilen;
27   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
28   int          *aj = a->j,*ai = a->i;
29   MatScalar    *mat_a;
30   Mat          C;
31   PetscTruth   flag;
32 
33   PetscFunctionBegin;
34 
35   if (isrow != iscol) SETERRA(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
36   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
37   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
38 
39   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
40   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
41 
42   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
43   ssmap = smap;
44   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
45   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
46   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
47   /* determine lens of each row */
48   for (i=0; i<nrows; i++) {
49     kstart  = ai[irow[i]];
50     kend    = kstart + a->ilen[irow[i]];
51     lens[i] = 0;
52       for (k=kstart; k<kend; k++) {
53         if (ssmap[aj[k]]) {
54           lens[i]++;
55         }
56       }
57     }
58   /* Create and fill new matrix */
59   if (scall == MAT_REUSE_MATRIX) {
60     c = (Mat_SeqSBAIJ *)((*B)->data);
61 
62     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
63     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
64     if (flag == PETSC_FALSE) {
65       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
66     }
67     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
68     C = *B;
69   } else {
70     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
71   }
72   c = (Mat_SeqSBAIJ *)(C->data);
73   for (i=0; i<nrows; i++) {
74     row    = irow[i];
75     kstart = ai[row];
76     kend   = kstart + a->ilen[row];
77     mat_i  = c->i[i];
78     mat_j  = c->j + mat_i;
79     mat_a  = c->a + mat_i*bs2;
80     mat_ilen = c->ilen + i;
81     for (k=kstart; k<kend; k++) {
82       if ((tcol=ssmap[a->j[k]])) {
83         *mat_j++ = tcol - 1;
84         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
85         mat_a   += bs2;
86         (*mat_ilen)++;
87       }
88     }
89   }
90 
91   /* Free work space */
92   ierr = PetscFree(smap);CHKERRQ(ierr);
93   ierr = PetscFree(lens);CHKERRQ(ierr);
94   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
96 
97   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
98   *B = C;
99   PetscFunctionReturn(0);
100 }
101 
102 #undef __FUNC__
103 #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
105 {
106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
107   IS          is1;
108   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRA(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   /* Verify if the indices corespond to each element in a block
117    and form the IS with compressed IS */
118   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
119   iary = vary + a->mbs;
120   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
122 
123   count = 0;
124   for (i=0; i<a->mbs; i++) {
125     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,"Index set does not match blocks");
126     if (vary[i]==bs) iary[count++] = i;
127   }
128   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
129 
130   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = PetscFree(vary);CHKERRQ(ierr);
132 
133   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
134   ISDestroy(is1);
135   PetscFunctionReturn(0);
136 }
137 
138 #undef __FUNC__
139 #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
141 {
142   int ierr,i;
143 
144   PetscFunctionBegin;
145   if (scall == MAT_INITIAL_MATRIX) {
146     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
147   }
148 
149   for (i=0; i<n; i++) {
150     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 #include "petscblaslapack.h"
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatMult_SeqSBAIJ_1"
162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
163 {
164   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
165   Scalar          *x,*z,*xb,x1,zero=0.0;
166   MatScalar       *v;
167   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
168 
169   PetscFunctionBegin;
170   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
173 
174   v  = a->a;
175   xb = x;
176 
177   for (i=0; i<mbs; i++) {
178     n  = ai[1] - ai[0];  /* length of i_th row of A */
179     x1 = xb[0];
180     ib = aj + *ai;
181     jmin = 0;
182     if (*ib == i) {      /* (diag of A)*x */
183       z[i] += *v++ * x[*ib++];
184       jmin++;
185     }
186     for (j=jmin; j<n; j++) {
187       cval    = *ib;
188       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
189       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
190     }
191     xb++; ai++;
192   }
193 
194   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
195   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
196   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
197   PetscFunctionReturn(0);
198 }
199 
200 #undef __FUNC__
201 #define __FUNC__ "MatMult_SeqSBAIJ_2"
202 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
203 {
204   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
205   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
206   MatScalar       *v;
207   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
208 
209 
210   PetscFunctionBegin;
211   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
212   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
213   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
214 
215   v     = a->a;
216   xb = x;
217 
218   for (i=0; i<mbs; i++) {
219     n  = ai[1] - ai[0]; /* length of i_th block row of A */
220     x1 = xb[0]; x2 = xb[1];
221     ib = aj + *ai;
222     jmin = 0;
223     if (*ib == i){     /* (diag of A)*x */
224       z[2*i]   += v[0]*x1 + v[2]*x2;
225       z[2*i+1] += v[2]*x1 + v[3]*x2;
226       v += 4; jmin++;
227     }
228     for (j=jmin; j<n; j++) {
229       /* (strict lower triangular part of A)*x  */
230       cval       = ib[j]*2;
231       z[cval]     += v[0]*x1 + v[1]*x2;
232       z[cval+1]   += v[2]*x1 + v[3]*x2;
233       /* (strict upper triangular part of A)*x  */
234       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
235       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
236       v  += 4;
237     }
238     xb +=2; ai++;
239   }
240 
241   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
242   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
243   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
244   PetscFunctionReturn(0);
245 }
246 
247 #undef __FUNC__
248 #define __FUNC__ "MatMult_SeqSBAIJ_3"
249 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
250 {
251   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
252   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
253   MatScalar       *v;
254   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
255 
256 
257   PetscFunctionBegin;
258   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
261 
262   v     = a->a;
263   xb = x;
264 
265   for (i=0; i<mbs; i++) {
266     n  = ai[1] - ai[0]; /* length of i_th block row of A */
267     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
268     ib = aj + *ai;
269     jmin = 0;
270     if (*ib == i){     /* (diag of A)*x */
271       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
272       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
273       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
274       v += 9; jmin++;
275     }
276     for (j=jmin; j<n; j++) {
277       /* (strict lower triangular part of A)*x  */
278       cval       = ib[j]*3;
279       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
280       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
281       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
282       /* (strict upper triangular part of A)*x  */
283       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
284       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
285       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
286       v  += 9;
287     }
288     xb +=3; ai++;
289   }
290 
291   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
292   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
293   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
294   PetscFunctionReturn(0);
295 }
296 
297 #undef __FUNC__
298 #define __FUNC__ "MatMult_SeqSBAIJ_4"
299 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
300 {
301   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
302   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
303   MatScalar       *v;
304   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
305 
306   PetscFunctionBegin;
307   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
308   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
309   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
310 
311   v     = a->a;
312   xb = x;
313 
314   for (i=0; i<mbs; i++) {
315     n  = ai[1] - ai[0]; /* length of i_th block row of A */
316     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
317     ib = aj + *ai;
318     jmin = 0;
319     if (*ib == i){     /* (diag of A)*x */
320       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
321       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
322       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
323       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
324       v += 16; jmin++;
325     }
326     for (j=jmin; j<n; j++) {
327       /* (strict lower triangular part of A)*x  */
328       cval       = ib[j]*4;
329       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
330       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
331       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
332       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
333       /* (strict upper triangular part of A)*x  */
334       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
335       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
336       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
337       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
338       v  += 16;
339     }
340     xb +=4; ai++;
341   }
342 
343   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
345   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
346   PetscFunctionReturn(0);
347 }
348 
349 #undef __FUNC__
350 #define __FUNC__ "MatMult_SeqSBAIJ_5"
351 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
352 {
353   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
354   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
355   MatScalar       *v;
356   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
357 
358   PetscFunctionBegin;
359   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
361   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
362 
363   v     = a->a;
364   xb = x;
365 
366   for (i=0; i<mbs; i++) {
367     n  = ai[1] - ai[0]; /* length of i_th block row of A */
368     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
369     ib = aj + *ai;
370     jmin = 0;
371     if (*ib == i){      /* (diag of A)*x */
372       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
373       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
374       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
375       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
376       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
377       v += 25; jmin++;
378     }
379     for (j=jmin; j<n; j++) {
380       /* (strict lower triangular part of A)*x  */
381       cval       = ib[j]*5;
382       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
383       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
384       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
385       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
386       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
387       /* (strict upper triangular part of A)*x  */
388       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
389       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
390       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
391       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
392       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
393       v  += 25;
394     }
395     xb +=5; ai++;
396   }
397 
398   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
399   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
400   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
401   PetscFunctionReturn(0);
402 }
403 
404 
405 #undef __FUNC__
406 #define __FUNC__ "MatMult_SeqSBAIJ_6"
407 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
408 {
409   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
410   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
411   MatScalar       *v;
412   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
413 
414   PetscFunctionBegin;
415   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
416   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
417   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
418 
419   v     = a->a;
420   xb = x;
421 
422   for (i=0; i<mbs; i++) {
423     n  = ai[1] - ai[0]; /* length of i_th block row of A */
424     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
425     ib = aj + *ai;
426     jmin = 0;
427     if (*ib == i){      /* (diag of A)*x */
428       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
429       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
430       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
431       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
432       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
433       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
434       v += 36; jmin++;
435     }
436     for (j=jmin; j<n; j++) {
437       /* (strict lower triangular part of A)*x  */
438       cval       = ib[j]*6;
439       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
440       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
441       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
442       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
443       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
444       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
445       /* (strict upper triangular part of A)*x  */
446       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
447       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
448       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
449       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
450       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
451       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
452       v  += 36;
453     }
454     xb +=6; ai++;
455   }
456 
457   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
458   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
459   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
460   PetscFunctionReturn(0);
461 }
462 #undef __FUNC__
463 #define __FUNC__ "MatMult_SeqSBAIJ_7"
464 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
465 {
466   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
467   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
468   MatScalar       *v;
469   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
470 
471   PetscFunctionBegin;
472   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
475 
476   v     = a->a;
477   xb = x;
478 
479   for (i=0; i<mbs; i++) {
480     n  = ai[1] - ai[0]; /* length of i_th block row of A */
481     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
482     ib = aj + *ai;
483     jmin = 0;
484     if (*ib == i){      /* (diag of A)*x */
485       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
486       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
487       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
488       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
489       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
490       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
491       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
492       v += 49; jmin++;
493     }
494     for (j=jmin; j<n; j++) {
495       /* (strict lower triangular part of A)*x  */
496       cval       = ib[j]*7;
497       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
498       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
499       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
500       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
501       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
502       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
503       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
504       /* (strict upper triangular part of A)*x  */
505       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
506       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
507       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
508       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
509       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
510       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
511       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
512       v  += 49;
513     }
514     xb +=7; ai++;
515   }
516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
517   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
518   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
519   PetscFunctionReturn(0);
520 }
521 
522 /*
523     This will not work with MatScalar == float because it calls the BLAS
524 */
525 #undef __FUNC__
526 #define __FUNC__ "MatMult_SeqSBAIJ_N"
527 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
528 {
529   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
530   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
531   MatScalar       *v;
532   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
533   int             ncols,k;
534 
535   PetscFunctionBegin;
536   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
538   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
539 
540   aj   = a->j;
541   v    = a->a;
542   ii   = a->i;
543 
544   if (!a->mult_work) {
545     k = a->m;
546     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
547   }
548   work = a->mult_work;
549 
550   for (i=0; i<mbs; i++) {
551     n     = ii[1] - ii[0]; ncols = n*bs;
552     workt = work; idx=aj+ii[0];
553 
554     /* upper triangular part */
555     for (j=0; j<n; j++) {
556       xb = x_ptr + bs*(*idx++);
557       for (k=0; k<bs; k++) workt[k] = xb[k];
558       workt += bs;
559     }
560     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
561     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
562 
563     /* strict lower triangular part */
564     idx = aj+ii[0];
565     if (*idx == i){
566       ncols -= bs; v += bs2; idx++; n--;
567     }
568 
569     if (ncols > 0){
570       workt = work;
571       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
572       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
573       for (j=0; j<n; j++) {
574         zb = z_ptr + bs*(*idx++);
575         for (k=0; k<bs; k++) zb[k] += workt[k] ;
576         workt += bs;
577       }
578     }
579     x += bs; v += n*bs2; z += bs; ii++;
580   }
581 
582   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
583   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
584   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
585   PetscFunctionReturn(0);
586 }
587 
588 #undef __FUNC__
589 #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
590 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
591 {
592   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
593   Scalar          *x,*y,*z,*xb,x1;
594   MatScalar       *v;
595   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
596 
597   PetscFunctionBegin;
598   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
599   if (yy != xx) {
600     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   } else {
602     y = x;
603   }
604   if (zz != yy) {
605     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
606     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
607   } else {
608     z = y;
609   }
610 
611   v  = a->a;
612   xb = x;
613 
614   for (i=0; i<mbs; i++) {
615     n  = ai[1] - ai[0];  /* length of i_th row of A */
616     x1 = xb[0];
617     ib = aj + *ai;
618     jmin = 0;
619     if (*ib == i) {            /* (diag of A)*x */
620       z[i] += *v++ * x[*ib++]; jmin++;
621     }
622     for (j=jmin; j<n; j++) {
623       cval    = *ib;
624       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
625       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
626     }
627     xb++; ai++;
628   }
629 
630   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
631   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
632   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
633 
634   PLogFlops(2*(a->s_nz*2 - a->m));
635   PetscFunctionReturn(0);
636 }
637 
638 #undef __FUNC__
639 #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
640 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
641 {
642   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
643   Scalar          *x,*y,*z,*xb,x1,x2;
644   MatScalar       *v;
645   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
646 
647   PetscFunctionBegin;
648   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
649   if (yy != xx) {
650     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
651   } else {
652     y = x;
653   }
654   if (zz != yy) {
655     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
656     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
657   } else {
658     z = y;
659   }
660 
661   v     = a->a;
662   xb = x;
663 
664   for (i=0; i<mbs; i++) {
665     n  = ai[1] - ai[0]; /* length of i_th block row of A */
666     x1 = xb[0]; x2 = xb[1];
667     ib = aj + *ai;
668     jmin = 0;
669     if (*ib == i){      /* (diag of A)*x */
670       z[2*i]   += v[0]*x1 + v[2]*x2;
671       z[2*i+1] += v[2]*x1 + v[3]*x2;
672       v += 4; jmin++;
673     }
674     for (j=jmin; j<n; j++) {
675       /* (strict lower triangular part of A)*x  */
676       cval       = ib[j]*2;
677       z[cval]     += v[0]*x1 + v[1]*x2;
678       z[cval+1]   += v[2]*x1 + v[3]*x2;
679       /* (strict upper triangular part of A)*x  */
680       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
681       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
682       v  += 4;
683     }
684     xb +=2; ai++;
685   }
686 
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
689   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
690 
691   PLogFlops(4*(a->s_nz*2 - a->m));
692   PetscFunctionReturn(0);
693 }
694 
695 #undef __FUNC__
696 #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
697 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
698 {
699   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
700   Scalar          *x,*y,*z,*xb,x1,x2,x3;
701   MatScalar       *v;
702   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
703 
704   PetscFunctionBegin;
705   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
706   if (yy != xx) {
707     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
708   } else {
709     y = x;
710   }
711   if (zz != yy) {
712     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
713     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
714   } else {
715     z = y;
716   }
717 
718   v     = a->a;
719   xb = x;
720 
721   for (i=0; i<mbs; i++) {
722     n  = ai[1] - ai[0]; /* length of i_th block row of A */
723     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
724     ib = aj + *ai;
725     jmin = 0;
726     if (*ib == i){     /* (diag of A)*x */
727      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
728      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
729      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
730      v += 9; jmin++;
731     }
732     for (j=jmin; j<n; j++) {
733       /* (strict lower triangular part of A)*x  */
734       cval       = ib[j]*3;
735       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
736       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
737       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
738       /* (strict upper triangular part of A)*x  */
739       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
740       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
741       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
742       v  += 9;
743     }
744     xb +=3; ai++;
745   }
746 
747   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
748   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
749   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
750 
751   PLogFlops(18*(a->s_nz*2 - a->m));
752   PetscFunctionReturn(0);
753 }
754 
755 #undef __FUNC__
756 #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
757 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
758 {
759   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
760   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
761   MatScalar       *v;
762   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
763 
764   PetscFunctionBegin;
765   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
766   if (yy != xx) {
767     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
768   } else {
769     y = x;
770   }
771   if (zz != yy) {
772     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
773     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
774   } else {
775     z = y;
776   }
777 
778   v     = a->a;
779   xb = x;
780 
781   for (i=0; i<mbs; i++) {
782     n  = ai[1] - ai[0]; /* length of i_th block row of A */
783     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
784     ib = aj + *ai;
785     jmin = 0;
786     if (*ib == i){      /* (diag of A)*x */
787       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
788       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
789       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
790       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
791       v += 16; jmin++;
792     }
793     for (j=jmin; j<n; j++) {
794       /* (strict lower triangular part of A)*x  */
795       cval       = ib[j]*4;
796       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
797       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
798       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
799       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
800       /* (strict upper triangular part of A)*x  */
801       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
802       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
803       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
804       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
805       v  += 16;
806     }
807     xb +=4; ai++;
808   }
809 
810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
811   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
812   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
813 
814   PLogFlops(32*(a->s_nz*2 - a->m));
815   PetscFunctionReturn(0);
816 }
817 
818 #undef __FUNC__
819 #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
820 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
821 {
822   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
823   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
824   MatScalar       *v;
825   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
826 
827   PetscFunctionBegin;
828   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
829   if (yy != xx) {
830     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
831   } else {
832     y = x;
833   }
834   if (zz != yy) {
835     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
836     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
837   } else {
838     z = y;
839   }
840 
841   v     = a->a;
842   xb = x;
843 
844   for (i=0; i<mbs; i++) {
845     n  = ai[1] - ai[0]; /* length of i_th block row of A */
846     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
847     ib = aj + *ai;
848     jmin = 0;
849     if (*ib == i){      /* (diag of A)*x */
850       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
851       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
852       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
853       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
854       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
855       v += 25; jmin++;
856     }
857     for (j=jmin; j<n; j++) {
858       /* (strict lower triangular part of A)*x  */
859       cval       = ib[j]*5;
860       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
861       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
862       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
863       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
864       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
865       /* (strict upper triangular part of A)*x  */
866       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
867       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
868       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
869       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
870       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
871       v  += 25;
872     }
873     xb +=5; ai++;
874   }
875 
876   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
877   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
878   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
879 
880   PLogFlops(50*(a->s_nz*2 - a->m));
881   PetscFunctionReturn(0);
882 }
883 #undef __FUNC__
884 #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
885 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
886 {
887   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
888   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
889   MatScalar       *v;
890   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
891 
892   PetscFunctionBegin;
893   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
894   if (yy != xx) {
895     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
896   } else {
897     y = x;
898   }
899   if (zz != yy) {
900     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
901     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
902   } else {
903     z = y;
904   }
905 
906   v     = a->a;
907   xb = x;
908 
909   for (i=0; i<mbs; i++) {
910     n  = ai[1] - ai[0]; /* length of i_th block row of A */
911     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
912     ib = aj + *ai;
913     jmin = 0;
914     if (*ib == i){     /* (diag of A)*x */
915       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
916       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
917       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
918       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
919       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
920       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
921       v += 36; jmin++;
922     }
923     for (j=jmin; j<n; j++) {
924       /* (strict lower triangular part of A)*x  */
925       cval       = ib[j]*6;
926       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
927       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
928       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
929       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
930       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
931       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
932       /* (strict upper triangular part of A)*x  */
933       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
934       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
935       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
936       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
937       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
938       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
939       v  += 36;
940     }
941     xb +=6; ai++;
942   }
943 
944   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
945   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
946   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
947 
948   PLogFlops(72*(a->s_nz*2 - a->m));
949   PetscFunctionReturn(0);
950 }
951 
952 #undef __FUNC__
953 #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
954 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
955 {
956   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
957   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
958   MatScalar       *v;
959   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
960 
961   PetscFunctionBegin;
962   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
963   if (yy != xx) {
964     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
965   } else {
966     y = x;
967   }
968   if (zz != yy) {
969     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
970     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
971   } else {
972     z = y;
973   }
974 
975   v     = a->a;
976   xb = x;
977 
978   for (i=0; i<mbs; i++) {
979     n  = ai[1] - ai[0]; /* length of i_th block row of A */
980     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
981     ib = aj + *ai;
982     jmin = 0;
983     if (*ib == i){     /* (diag of A)*x */
984       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
985       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
986       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
987       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
988       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
989       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
990       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
991       v += 49; jmin++;
992     }
993     for (j=jmin; j<n; j++) {
994       /* (strict lower triangular part of A)*x  */
995       cval       = ib[j]*7;
996       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
997       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
998       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
999       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1000       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1001       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1002       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1003       /* (strict upper triangular part of A)*x  */
1004       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1005       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1006       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1007       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1008       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1009       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1010       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1011       v  += 49;
1012     }
1013     xb +=7; ai++;
1014   }
1015 
1016   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1017   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1018   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1019 
1020   PLogFlops(98*(a->s_nz*2 - a->m));
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 #undef __FUNC__
1025 #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1026 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1027 {
1028   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1029   Scalar          *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1030   MatScalar       *v;
1031   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1032   int             ncols,k;
1033 
1034   PetscFunctionBegin;
1035   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1036   if (yy != xx) {
1037     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1038   } else {
1039     y = x;
1040   }
1041   if (zz != yy) {
1042     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1043     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1044   } else {
1045     z = y;
1046   }
1047 
1048   aj   = a->j;
1049   v    = a->a;
1050   ii   = a->i;
1051 
1052   if (!a->mult_work) {
1053     k = a->m;
1054     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1055   }
1056   work = a->mult_work;
1057 
1058 
1059   for (i=0; i<mbs; i++) {
1060     n     = ii[1] - ii[0]; ncols = n*bs;
1061     workt = work; idx=aj+ii[0];
1062 
1063     /* upper triangular part */
1064     for (j=0; j<n; j++) {
1065       xb = x_ptr + bs*(*idx++);
1066       for (k=0; k<bs; k++) workt[k] = xb[k];
1067       workt += bs;
1068     }
1069     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1070     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1071 
1072     /* strict lower triangular part */
1073     idx = aj+ii[0];
1074     if (*idx == i){
1075       ncols -= bs; v += bs2; idx++; n--;
1076     }
1077     if (ncols > 0){
1078       workt = work;
1079       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1080       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1081       for (j=0; j<n; j++) {
1082         zb = z_ptr + bs*(*idx++);
1083         /* idx++; */
1084         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1085         workt += bs;
1086       }
1087     }
1088 
1089     x += bs; v += n*bs2; z += bs; ii++;
1090   }
1091 
1092   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1093   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1094   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1095 
1096   PLogFlops(2*(a->s_nz*2 - a->m));
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 #undef __FUNC__
1101 #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1102 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1103 {
1104   PetscFunctionBegin;
1105   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1106   /* PetscFunctionReturn(0); */
1107 }
1108 
1109 #undef __FUNC__
1110 #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1111 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1112 
1113 {
1114   PetscFunctionBegin;
1115   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1116   /* PetscFunctionReturn(0); */
1117 }
1118 
1119 #undef __FUNC__
1120 #define __FUNC__ "MatScale_SeqSBAIJ"
1121 int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1122 {
1123   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1124   int         one = 1,totalnz = a->bs2*a->s_nz;
1125 
1126   PetscFunctionBegin;
1127   BLscal_(&totalnz,alpha,a->a,&one);
1128   PLogFlops(totalnz);
1129   PetscFunctionReturn(0);
1130 }
1131 
1132 #undef __FUNC__
1133 #define __FUNC__ "MatNorm_SeqSBAIJ"
1134 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1135 {
1136   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1137   MatScalar   *v = a->a;
1138   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1139   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1140   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1141 
1142   PetscFunctionBegin;
1143   if (type == NORM_FROBENIUS) {
1144     for (k=0; k<mbs; k++){
1145       jmin = a->i[k]; jmax = a->i[k+1];
1146       col  = aj + jmin;
1147       if (*col == k){         /* diagonal block */
1148         for (i=0; i<bs2; i++){
1149 #if defined(PETSC_USE_COMPLEX)
1150           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1151 #else
1152           sum_diag += (*v)*(*v); v++;
1153 #endif
1154         }
1155         jmin++;
1156       }
1157       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1158         for (i=0; i<bs2; i++){
1159 #if defined(PETSC_USE_COMPLEX)
1160           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1161 #else
1162           sum_off += (*v)*(*v); v++;
1163 #endif
1164         }
1165       }
1166     }
1167     *norm = sqrt(sum_diag + 2*sum_off);
1168 
1169   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1170     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1171     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1172     sum = (PetscReal*)PetscMalloc(bs*sizeof(PetscReal));CHKPTRQ(sum);
1173     for (i=0; i<mbs; i++) {
1174       jl[i] = mbs; il[0] = 0;
1175     }
1176 
1177     *norm = 0.0;
1178     for (k=0; k<mbs; k++) { /* k_th block row */
1179       for (j=0; j<bs; j++) sum[j]=0.0;
1180 
1181       /*-- col sum --*/
1182       i = jl[k]; /* first |A(i,k)| to be added */
1183       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1184                   at step k */
1185       while (i<mbs){
1186         nexti = jl[i];  /* next block row to be added */
1187         ik    = il[i];  /* block index of A(i,k) in the array a */
1188         for (j=0; j<bs; j++){
1189           v = a->a + ik*bs2 + j*bs;
1190           for (k1=0; k1<bs; k1++) {
1191             sum[j] += PetscAbsScalar(*v); v++;
1192           }
1193         }
1194         /* update il, jl */
1195         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1196         jmax = a->i[i+1];
1197         if (jmin < jmax){
1198           il[i] = jmin;
1199           j   = a->j[jmin];
1200           jl[i] = jl[j]; jl[j]=i;
1201         }
1202         i = nexti;
1203       }
1204 
1205       /*-- row sum --*/
1206       jmin = a->i[k]; jmax = a->i[k+1];
1207       for (i=jmin; i<jmax; i++) {
1208         for (j=0; j<bs; j++){
1209           v = a->a + i*bs2 + j;
1210           for (k1=0; k1<bs; k1++){
1211             sum[j] += PetscAbsScalar(*v);
1212             v   += bs;
1213           }
1214         }
1215       }
1216       /* add k_th block row to il, jl */
1217       col = aj+jmin;
1218       if (*col == k) jmin++;
1219       if (jmin < jmax){
1220         il[k] = jmin;
1221         j   = a->j[jmin];
1222         jl[k] = jl[j]; jl[j] = k;
1223       }
1224       for (j=0; j<bs; j++){
1225         if (sum[j] > *norm) *norm = sum[j];
1226       }
1227     }
1228     ierr = PetscFree(il);CHKERRQ(ierr);
1229     ierr = PetscFree(jl);CHKERRQ(ierr);
1230     ierr = PetscFree(sum);CHKERRQ(ierr);
1231   } else {
1232     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1233   }
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 #undef __FUNC__
1238 #define __FUNC__ "MatEqual_SeqSBAIJ"
1239 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1240 {
1241   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1242   int         ierr;
1243 
1244   PetscFunctionBegin;
1245   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1246 
1247   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1248   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1249     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1250   }
1251 
1252   /* if the a->i are the same */
1253   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1254   if (*flg == PETSC_FALSE) {
1255     PetscFunctionReturn(0);
1256   }
1257 
1258   /* if a->j are the same */
1259   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1260   if (*flg == PETSC_FALSE) {
1261     PetscFunctionReturn(0);
1262   }
1263   /* if a->a are the same */
1264   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 
1267 }
1268 
1269 #undef __FUNC__
1270 #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1271 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1272 {
1273   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1274   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1275   Scalar      *x,zero = 0.0;
1276   MatScalar   *aa,*aa_j;
1277 
1278   PetscFunctionBegin;
1279   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1280   bs   = a->bs;
1281   aa   = a->a;
1282   ai   = a->i;
1283   aj   = a->j;
1284   ambs = a->mbs;
1285   bs2  = a->bs2;
1286 
1287   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1288   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1289   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1290   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1291   for (i=0; i<ambs; i++) {
1292     j=ai[i];
1293     if (aj[j] == i) {             /* if this is a diagonal element */
1294       row  = i*bs;
1295       aa_j = aa + j*bs2;
1296       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1297     }
1298   }
1299   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 #undef __FUNC__
1304 #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1305 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1306 {
1307   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1308   Scalar      *l,*r,x,*li,*ri;
1309   MatScalar   *aa,*v;
1310   int         ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1311 
1312   PetscFunctionBegin;
1313   ai  = a->i;
1314   aj  = a->j;
1315   aa  = a->a;
1316   m   = a->m;
1317   bs  = a->bs;
1318   mbs = a->mbs;
1319   bs2 = a->bs2;
1320 
1321   if (ll != rr) {
1322     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1323   }
1324   if (ll) {
1325     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1326     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1327     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1328     for (i=0; i<mbs; i++) { /* for each block row */
1329       M  = ai[i+1] - ai[i];
1330       li = l + i*bs;
1331       v  = aa + bs2*ai[i];
1332       for (j=0; j<M; j++) { /* for each block */
1333         for (k=0; k<bs2; k++) {
1334           (*v++) *= li[k%bs];
1335         }
1336 #ifdef CONT
1337         /* will be used to replace the above loop */
1338         ri = l + bs*aj[ai[i]+j];
1339         for (k=0; k<bs; k++) { /* column value */
1340           x = ri[k];
1341           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1342         }
1343 #endif
1344 
1345       }
1346     }
1347     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1348     PLogFlops(2*a->s_nz);
1349   }
1350   /* will be deleted */
1351   if (rr) {
1352     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1353     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1354     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1355     for (i=0; i<mbs; i++) { /* for each block row */
1356       M  = ai[i+1] - ai[i];
1357       v  = aa + bs2*ai[i];
1358       for (j=0; j<M; j++) { /* for each block */
1359         ri = r + bs*aj[ai[i]+j];
1360         for (k=0; k<bs; k++) {
1361           x = ri[k];
1362           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1363         }
1364       }
1365     }
1366     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1367     PLogFlops(a->s_nz);
1368   }
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 #undef __FUNC__
1373 #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1374 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1375 {
1376   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1377 
1378   PetscFunctionBegin;
1379   info->rows_global    = (double)a->m;
1380   info->columns_global = (double)a->m;
1381   info->rows_local     = (double)a->m;
1382   info->columns_local  = (double)a->m;
1383   info->block_size     = a->bs2;
1384   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1385   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1386   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1387   info->assemblies   = A->num_ass;
1388   info->mallocs      = a->reallocs;
1389   info->memory       = A->mem;
1390   if (A->factor) {
1391     info->fill_ratio_given  = A->info.fill_ratio_given;
1392     info->fill_ratio_needed = A->info.fill_ratio_needed;
1393     info->factor_mallocs    = A->info.factor_mallocs;
1394   } else {
1395     info->fill_ratio_given  = 0;
1396     info->fill_ratio_needed = 0;
1397     info->factor_mallocs    = 0;
1398   }
1399   PetscFunctionReturn(0);
1400 }
1401 
1402 
1403 #undef __FUNC__
1404 #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1405 int MatZeroEntries_SeqSBAIJ(Mat A)
1406 {
1407   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1408   int         ierr;
1409 
1410   PetscFunctionBegin;
1411   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 #undef __FUNC__
1416 #define __FUNC__ "MatGetRowMax_SeqSBAIJ"
1417 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1418 {
1419   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1420   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1421   PetscReal    *x,zero = 0.0,atmp;
1422   MatScalar    *aa;
1423   int          ncols,brow,bcol,krow,kcol;
1424 
1425   PetscFunctionBegin;
1426   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1427   bs   = a->bs;
1428   aa   = a->a;
1429   ai   = a->i;
1430   aj   = a->j;
1431   mbs = a->mbs;
1432 
1433   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1434   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1435   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1436   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1437   for (i=0; i<mbs; i++) {
1438     ncols = ai[1] - ai[0]; ai++;
1439     brow  = bs*i;
1440     for (j=0; j<ncols; j++){
1441       bcol = bs*(*aj);
1442       for (kcol=0; kcol<bs; kcol++){
1443         col = bcol + kcol;      /* col index */
1444         for (krow=0; krow<bs; krow++){
1445           atmp = PetscAbsScalar(*aa); aa++;
1446           row = brow + krow;    /* row index */
1447           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1448           if (x[row] < atmp) x[row] = atmp;
1449           if (*aj > i && x[col] < atmp) x[col] = atmp;
1450         }
1451       }
1452       aj++;
1453     }
1454   }
1455   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1456   PetscFunctionReturn(0);
1457 }
1458