xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision b2da817d11b8f98ec4a15fcfa65f2fa8cfedc826)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   PetscFunctionBegin;
14   SETERRQ(1,"Function not yet written for SBAIJ format");
15   /* PetscFunctionReturn(0); */
16 }
17 
18 #undef __FUNCT__
19 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
20 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
21 {
22   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
23   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
24   int          row,mat_i,*mat_j,tcol,*mat_ilen;
25   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
26   int          *aj = a->j,*ai = a->i;
27   MatScalar    *mat_a;
28   Mat          C;
29   PetscTruth   flag;
30 
31   PetscFunctionBegin;
32 
33   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
34   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
35   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
36 
37   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
38   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
39 
40   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
41   ssmap = smap;
42   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
43   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
44   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
45   /* determine lens of each row */
46   for (i=0; i<nrows; i++) {
47     kstart  = ai[irow[i]];
48     kend    = kstart + a->ilen[irow[i]];
49     lens[i] = 0;
50       for (k=kstart; k<kend; k++) {
51         if (ssmap[aj[k]]) {
52           lens[i]++;
53         }
54       }
55     }
56   /* Create and fill new matrix */
57   if (scall == MAT_REUSE_MATRIX) {
58     c = (Mat_SeqSBAIJ *)((*B)->data);
59 
60     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
61     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
62     if (flag == PETSC_FALSE) {
63       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
64     }
65     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
66     C = *B;
67   } else {
68     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
69   }
70   c = (Mat_SeqSBAIJ *)(C->data);
71   for (i=0; i<nrows; i++) {
72     row    = irow[i];
73     kstart = ai[row];
74     kend   = kstart + a->ilen[row];
75     mat_i  = c->i[i];
76     mat_j  = c->j + mat_i;
77     mat_a  = c->a + mat_i*bs2;
78     mat_ilen = c->ilen + i;
79     for (k=kstart; k<kend; k++) {
80       if ((tcol=ssmap[a->j[k]])) {
81         *mat_j++ = tcol - 1;
82         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
83         mat_a   += bs2;
84         (*mat_ilen)++;
85       }
86     }
87   }
88 
89   /* Free work space */
90   ierr = PetscFree(smap);CHKERRQ(ierr);
91   ierr = PetscFree(lens);CHKERRQ(ierr);
92   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
93   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
94 
95   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
96   *B = C;
97   PetscFunctionReturn(0);
98 }
99 
100 #undef __FUNCT__
101 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
102 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
103 {
104   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
105   IS          is1;
106   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
110 
111   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
112   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
113 
114   /* Verify if the indices corespond to each element in a block
115    and form the IS with compressed IS */
116   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
117   iary = vary + a->mbs;
118   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
120 
121   count = 0;
122   for (i=0; i<a->mbs; i++) {
123     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
124     if (vary[i]==bs) iary[count++] = i;
125   }
126   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
127 
128   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
129   ierr = PetscFree(vary);CHKERRQ(ierr);
130 
131   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
132   ISDestroy(is1);
133   PetscFunctionReturn(0);
134 }
135 
136 #undef __FUNCT__
137 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
138 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
139 {
140   int ierr,i;
141 
142   PetscFunctionBegin;
143   if (scall == MAT_INITIAL_MATRIX) {
144     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
145   }
146 
147   for (i=0; i<n; i++) {
148     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
149   }
150   PetscFunctionReturn(0);
151 }
152 
153 /* -------------------------------------------------------*/
154 /* Should check that shapes of vectors and matrices match */
155 /* -------------------------------------------------------*/
156 #include "petscblaslapack.h"
157 
158 #undef __FUNCT__
159 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
160 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
161 {
162   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
163   PetscScalar     *x,*z,*xb,x1,zero=0.0;
164   MatScalar       *v;
165   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
166 
167   PetscFunctionBegin;
168   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
169   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
170   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
171 
172   v  = a->a;
173   xb = x;
174 
175   for (i=0; i<mbs; i++) {
176     n  = ai[1] - ai[0];  /* length of i_th row of A */
177     x1 = xb[0];
178     ib = aj + *ai;
179     jmin = 0;
180     if (*ib == i) {      /* (diag of A)*x */
181       z[i] += *v++ * x[*ib++];
182       jmin++;
183     }
184     for (j=jmin; j<n; j++) {
185       cval    = *ib;
186       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
187       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
188     }
189     xb++; ai++;
190   }
191 
192   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
193   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
194   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
195   PetscFunctionReturn(0);
196 }
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
200 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
201 {
202   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
203   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
204   MatScalar        *v;
205   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
206 
207 
208   PetscFunctionBegin;
209   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
210   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
211   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
212 
213   v     = a->a;
214   xb = x;
215 
216   for (i=0; i<mbs; i++) {
217     n  = ai[1] - ai[0]; /* length of i_th block row of A */
218     x1 = xb[0]; x2 = xb[1];
219     ib = aj + *ai;
220     jmin = 0;
221     if (*ib == i){     /* (diag of A)*x */
222       z[2*i]   += v[0]*x1 + v[2]*x2;
223       z[2*i+1] += v[2]*x1 + v[3]*x2;
224       v += 4; jmin++;
225     }
226     for (j=jmin; j<n; j++) {
227       /* (strict lower triangular part of A)*x  */
228       cval       = ib[j]*2;
229       z[cval]     += v[0]*x1 + v[1]*x2;
230       z[cval+1]   += v[2]*x1 + v[3]*x2;
231       /* (strict upper triangular part of A)*x  */
232       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
233       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
234       v  += 4;
235     }
236     xb +=2; ai++;
237   }
238 
239   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
240   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
241   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
242   PetscFunctionReturn(0);
243 }
244 
245 #undef __FUNCT__
246 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
247 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
248 {
249   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
250   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
251   MatScalar     *v;
252   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
253 
254 
255   PetscFunctionBegin;
256   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
257   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
258   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
259 
260   v     = a->a;
261   xb = x;
262 
263   for (i=0; i<mbs; i++) {
264     n  = ai[1] - ai[0]; /* length of i_th block row of A */
265     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
266     ib = aj + *ai;
267     jmin = 0;
268     if (*ib == i){     /* (diag of A)*x */
269       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
270       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
271       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
272       v += 9; jmin++;
273     }
274     for (j=jmin; j<n; j++) {
275       /* (strict lower triangular part of A)*x  */
276       cval       = ib[j]*3;
277       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
278       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
279       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
280       /* (strict upper triangular part of A)*x  */
281       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
282       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
283       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
284       v  += 9;
285     }
286     xb +=3; ai++;
287   }
288 
289   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
290   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
291   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
292   PetscFunctionReturn(0);
293 }
294 
295 #undef __FUNCT__
296 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
297 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
298 {
299   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
300   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
301   MatScalar        *v;
302   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
303 
304   PetscFunctionBegin;
305   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
306   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
307   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
308 
309   v     = a->a;
310   xb = x;
311 
312   for (i=0; i<mbs; i++) {
313     n  = ai[1] - ai[0]; /* length of i_th block row of A */
314     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
315     ib = aj + *ai;
316     jmin = 0;
317     if (*ib == i){     /* (diag of A)*x */
318       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
319       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
320       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
321       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
322       v += 16; jmin++;
323     }
324     for (j=jmin; j<n; j++) {
325       /* (strict lower triangular part of A)*x  */
326       cval       = ib[j]*4;
327       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
328       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
329       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
330       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
331       /* (strict upper triangular part of A)*x  */
332       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
333       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
334       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
335       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
336       v  += 16;
337     }
338     xb +=4; ai++;
339   }
340 
341   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
342   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
343   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
349 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
350 {
351   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
352   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
353   MatScalar       *v;
354   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
355 
356   PetscFunctionBegin;
357   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
358   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
359   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
360 
361   v     = a->a;
362   xb = x;
363 
364   for (i=0; i<mbs; i++) {
365     n  = ai[1] - ai[0]; /* length of i_th block row of A */
366     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
367     ib = aj + *ai;
368     jmin = 0;
369     if (*ib == i){      /* (diag of A)*x */
370       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
371       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
372       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
373       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
374       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
375       v += 25; jmin++;
376     }
377     for (j=jmin; j<n; j++) {
378       /* (strict lower triangular part of A)*x  */
379       cval       = ib[j]*5;
380       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
381       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
382       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
383       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
384       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
385       /* (strict upper triangular part of A)*x  */
386       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
387       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
388       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
389       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
390       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
391       v  += 25;
392     }
393     xb +=5; ai++;
394   }
395 
396   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
397   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
398   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
399   PetscFunctionReturn(0);
400 }
401 
402 
403 #undef __FUNCT__
404 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
405 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
406 {
407   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
408   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
409   MatScalar       *v;
410   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
411 
412   PetscFunctionBegin;
413   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
414   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
416 
417   v     = a->a;
418   xb = x;
419 
420   for (i=0; i<mbs; i++) {
421     n  = ai[1] - ai[0]; /* length of i_th block row of A */
422     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
423     ib = aj + *ai;
424     jmin = 0;
425     if (*ib == i){      /* (diag of A)*x */
426       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
427       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
428       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
429       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
430       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
431       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
432       v += 36; jmin++;
433     }
434     for (j=jmin; j<n; j++) {
435       /* (strict lower triangular part of A)*x  */
436       cval       = ib[j]*6;
437       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
438       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
439       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
440       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
441       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
442       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
443       /* (strict upper triangular part of A)*x  */
444       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
445       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
446       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
447       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
448       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
449       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
450       v  += 36;
451     }
452     xb +=6; ai++;
453   }
454 
455   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
456   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
457   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
458   PetscFunctionReturn(0);
459 }
460 #undef __FUNCT__
461 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
462 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
463 {
464   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
465   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
466   MatScalar       *v;
467   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
468 
469   PetscFunctionBegin;
470   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
471   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
472   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
473 
474   v     = a->a;
475   xb = x;
476 
477   for (i=0; i<mbs; i++) {
478     n  = ai[1] - ai[0]; /* length of i_th block row of A */
479     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
480     ib = aj + *ai;
481     jmin = 0;
482     if (*ib == i){      /* (diag of A)*x */
483       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
484       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
485       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
486       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
487       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
488       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
489       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
490       v += 49; jmin++;
491     }
492     for (j=jmin; j<n; j++) {
493       /* (strict lower triangular part of A)*x  */
494       cval       = ib[j]*7;
495       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
496       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
497       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
498       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
499       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
500       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
501       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
502       /* (strict upper triangular part of A)*x  */
503       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
504       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
505       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
506       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
507       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
508       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
509       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
510       v  += 49;
511     }
512     xb +=7; ai++;
513   }
514   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
515   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
516   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
517   PetscFunctionReturn(0);
518 }
519 
520 /*
521     This will not work with MatScalar == float because it calls the BLAS
522 */
523 #undef __FUNCT__
524 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
525 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
526 {
527   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
528   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
529   MatScalar       *v;
530   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
531   int             ncols,k;
532 
533   PetscFunctionBegin;
534   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
535   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
536   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
537 
538   aj   = a->j;
539   v    = a->a;
540   ii   = a->i;
541 
542   if (!a->mult_work) {
543     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
544   }
545   work = a->mult_work;
546 
547   for (i=0; i<mbs; i++) {
548     n     = ii[1] - ii[0]; ncols = n*bs;
549     workt = work; idx=aj+ii[0];
550 
551     /* upper triangular part */
552     for (j=0; j<n; j++) {
553       xb = x_ptr + bs*(*idx++);
554       for (k=0; k<bs; k++) workt[k] = xb[k];
555       workt += bs;
556     }
557     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
558     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
559 
560     /* strict lower triangular part */
561     idx = aj+ii[0];
562     if (*idx == i){
563       ncols -= bs; v += bs2; idx++; n--;
564     }
565 
566     if (ncols > 0){
567       workt = work;
568       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
569       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
570       for (j=0; j<n; j++) {
571         zb = z_ptr + bs*(*idx++);
572         for (k=0; k<bs; k++) zb[k] += workt[k] ;
573         workt += bs;
574       }
575     }
576     x += bs; v += n*bs2; z += bs; ii++;
577   }
578 
579   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
580   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
581   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
582   PetscFunctionReturn(0);
583 }
584 
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
587 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
588 {
589   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
590   PetscScalar     *x,*y,*z,*xb,x1;
591   MatScalar       *v;
592   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
593 
594   PetscFunctionBegin;
595   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
596   if (yy != xx) {
597     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
598   } else {
599     y = x;
600   }
601   if (zz != yy) {
602     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
603     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
604     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608 
609   v  = a->a;
610   xb = x;
611 
612   for (i=0; i<mbs; i++) {
613     n  = ai[1] - ai[0];  /* length of i_th row of A */
614     x1 = xb[0];
615     ib = aj + *ai;
616     jmin = 0;
617     if (*ib == i) {            /* (diag of A)*x */
618       z[i] += *v++ * x[*ib++]; jmin++;
619     }
620     for (j=jmin; j<n; j++) {
621       cval    = *ib;
622       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
623       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
624     }
625     xb++; ai++;
626   }
627 
628   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
629   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
630   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
631 
632   PetscLogFlops(2*(a->nz*2 - A->m));
633   PetscFunctionReturn(0);
634 }
635 
636 #undef __FUNCT__
637 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
638 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
639 {
640   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
641   PetscScalar     *x,*y,*z,*xb,x1,x2;
642   MatScalar       *v;
643   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
644 
645   PetscFunctionBegin;
646   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
647   if (yy != xx) {
648     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
649   } else {
650     y = x;
651   }
652   if (zz != yy) {
653     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
654     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
655     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
656   } else {
657     z = y;
658   }
659 
660   v     = a->a;
661   xb = x;
662 
663   for (i=0; i<mbs; i++) {
664     n  = ai[1] - ai[0]; /* length of i_th block row of A */
665     x1 = xb[0]; x2 = xb[1];
666     ib = aj + *ai;
667     jmin = 0;
668     if (*ib == i){      /* (diag of A)*x */
669       z[2*i]   += v[0]*x1 + v[2]*x2;
670       z[2*i+1] += v[2]*x1 + v[3]*x2;
671       v += 4; jmin++;
672     }
673     for (j=jmin; j<n; j++) {
674       /* (strict lower triangular part of A)*x  */
675       cval       = ib[j]*2;
676       z[cval]     += v[0]*x1 + v[1]*x2;
677       z[cval+1]   += v[2]*x1 + v[3]*x2;
678       /* (strict upper triangular part of A)*x  */
679       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
680       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
681       v  += 4;
682     }
683     xb +=2; ai++;
684   }
685 
686   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
687   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
688   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
689 
690   PetscLogFlops(4*(a->nz*2 - A->m));
691   PetscFunctionReturn(0);
692 }
693 
694 #undef __FUNCT__
695 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
696 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
697 {
698   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
699   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
700   MatScalar       *v;
701   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
702 
703   PetscFunctionBegin;
704   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
705   if (yy != xx) {
706     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
707   } else {
708     y = x;
709   }
710   if (zz != yy) {
711     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
712     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
713     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
714   } else {
715     z = y;
716   }
717 
718   v     = a->a;
719   xb = x;
720 
721   for (i=0; i<mbs; i++) {
722     n  = ai[1] - ai[0]; /* length of i_th block row of A */
723     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
724     ib = aj + *ai;
725     jmin = 0;
726     if (*ib == i){     /* (diag of A)*x */
727      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
728      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
729      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
730      v += 9; jmin++;
731     }
732     for (j=jmin; j<n; j++) {
733       /* (strict lower triangular part of A)*x  */
734       cval       = ib[j]*3;
735       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
736       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
737       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
738       /* (strict upper triangular part of A)*x  */
739       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
740       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
741       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
742       v  += 9;
743     }
744     xb +=3; ai++;
745   }
746 
747   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
748   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
749   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
750 
751   PetscLogFlops(18*(a->nz*2 - A->m));
752   PetscFunctionReturn(0);
753 }
754 
755 #undef __FUNCT__
756 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
757 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
758 {
759   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
760   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
761   MatScalar       *v;
762   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
763 
764   PetscFunctionBegin;
765   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
766   if (yy != xx) {
767     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
768   } else {
769     y = x;
770   }
771   if (zz != yy) {
772     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
773     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
774     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
775   } else {
776     z = y;
777   }
778 
779   v     = a->a;
780   xb = x;
781 
782   for (i=0; i<mbs; i++) {
783     n  = ai[1] - ai[0]; /* length of i_th block row of A */
784     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
785     ib = aj + *ai;
786     jmin = 0;
787     if (*ib == i){      /* (diag of A)*x */
788       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
789       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
790       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
791       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
792       v += 16; jmin++;
793     }
794     for (j=jmin; j<n; j++) {
795       /* (strict lower triangular part of A)*x  */
796       cval       = ib[j]*4;
797       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
798       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
799       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
800       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
801       /* (strict upper triangular part of A)*x  */
802       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
803       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
804       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
805       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
806       v  += 16;
807     }
808     xb +=4; ai++;
809   }
810 
811   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
812   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
813   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
814 
815   PetscLogFlops(32*(a->nz*2 - A->m));
816   PetscFunctionReturn(0);
817 }
818 
819 #undef __FUNCT__
820 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
821 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
822 {
823   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
824   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
825   MatScalar       *v;
826   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
827 
828   PetscFunctionBegin;
829   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
830   if (yy != xx) {
831     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
832   } else {
833     y = x;
834   }
835   if (zz != yy) {
836     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
837     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
838     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
839   } else {
840     z = y;
841   }
842 
843   v     = a->a;
844   xb = x;
845 
846   for (i=0; i<mbs; i++) {
847     n  = ai[1] - ai[0]; /* length of i_th block row of A */
848     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
849     ib = aj + *ai;
850     jmin = 0;
851     if (*ib == i){      /* (diag of A)*x */
852       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
853       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
854       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
855       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
856       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
857       v += 25; jmin++;
858     }
859     for (j=jmin; j<n; j++) {
860       /* (strict lower triangular part of A)*x  */
861       cval       = ib[j]*5;
862       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
863       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
864       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
865       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
866       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
867       /* (strict upper triangular part of A)*x  */
868       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
869       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
870       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
871       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
872       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
873       v  += 25;
874     }
875     xb +=5; ai++;
876   }
877 
878   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
879   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
880   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
881 
882   PetscLogFlops(50*(a->nz*2 - A->m));
883   PetscFunctionReturn(0);
884 }
885 #undef __FUNCT__
886 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
887 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
888 {
889   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
890   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
891   MatScalar       *v;
892   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
893 
894   PetscFunctionBegin;
895   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
896   if (yy != xx) {
897     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
898   } else {
899     y = x;
900   }
901   if (zz != yy) {
902     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
903     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
904     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
905   } else {
906     z = y;
907   }
908 
909   v     = a->a;
910   xb = x;
911 
912   for (i=0; i<mbs; i++) {
913     n  = ai[1] - ai[0]; /* length of i_th block row of A */
914     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
915     ib = aj + *ai;
916     jmin = 0;
917     if (*ib == i){     /* (diag of A)*x */
918       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
919       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
920       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
921       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
922       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
923       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
924       v += 36; jmin++;
925     }
926     for (j=jmin; j<n; j++) {
927       /* (strict lower triangular part of A)*x  */
928       cval       = ib[j]*6;
929       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
930       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
931       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
932       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
933       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
934       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
935       /* (strict upper triangular part of A)*x  */
936       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
937       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
938       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
939       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
940       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
941       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
942       v  += 36;
943     }
944     xb +=6; ai++;
945   }
946 
947   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
948   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
949   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
950 
951   PetscLogFlops(72*(a->nz*2 - A->m));
952   PetscFunctionReturn(0);
953 }
954 
955 #undef __FUNCT__
956 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
957 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
958 {
959   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
960   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
961   MatScalar       *v;
962   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
963 
964   PetscFunctionBegin;
965   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
966   if (yy != xx) {
967     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
968   } else {
969     y = x;
970   }
971   if (zz != yy) {
972     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
973     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
974     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
975   } else {
976     z = y;
977   }
978 
979   v     = a->a;
980   xb = x;
981 
982   for (i=0; i<mbs; i++) {
983     n  = ai[1] - ai[0]; /* length of i_th block row of A */
984     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
985     ib = aj + *ai;
986     jmin = 0;
987     if (*ib == i){     /* (diag of A)*x */
988       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
989       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
990       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
991       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
992       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
993       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
994       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
995       v += 49; jmin++;
996     }
997     for (j=jmin; j<n; j++) {
998       /* (strict lower triangular part of A)*x  */
999       cval       = ib[j]*7;
1000       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1001       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1002       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1003       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1004       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1005       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1006       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1007       /* (strict upper triangular part of A)*x  */
1008       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1009       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1010       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1011       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1012       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1013       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1014       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1015       v  += 49;
1016     }
1017     xb +=7; ai++;
1018   }
1019 
1020   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1021   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1022   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1023 
1024   PetscLogFlops(98*(a->nz*2 - A->m));
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 #undef __FUNCT__
1029 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1030 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1031 {
1032   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1033   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1034   MatScalar       *v;
1035   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1036   int             ncols,k;
1037 
1038   PetscFunctionBegin;
1039   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
1040   if (yy != xx) {
1041     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1042   } else {
1043     y = x;
1044   }
1045   if (zz != yy) {
1046     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1047     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
1048     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1049   } else {
1050     z = y;
1051   }
1052 
1053   aj   = a->j;
1054   v    = a->a;
1055   ii   = a->i;
1056 
1057   if (!a->mult_work) {
1058     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1059   }
1060   work = a->mult_work;
1061 
1062 
1063   for (i=0; i<mbs; i++) {
1064     n     = ii[1] - ii[0]; ncols = n*bs;
1065     workt = work; idx=aj+ii[0];
1066 
1067     /* upper triangular part */
1068     for (j=0; j<n; j++) {
1069       xb = x_ptr + bs*(*idx++);
1070       for (k=0; k<bs; k++) workt[k] = xb[k];
1071       workt += bs;
1072     }
1073     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1074     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1075 
1076     /* strict lower triangular part */
1077     idx = aj+ii[0];
1078     if (*idx == i){
1079       ncols -= bs; v += bs2; idx++; n--;
1080     }
1081     if (ncols > 0){
1082       workt = work;
1083       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1084       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1085       for (j=0; j<n; j++) {
1086         zb = z_ptr + bs*(*idx++);
1087         /* idx++; */
1088         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1089         workt += bs;
1090       }
1091     }
1092 
1093     x += bs; v += n*bs2; z += bs; ii++;
1094   }
1095 
1096   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1097   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1098   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1099 
1100   PetscLogFlops(2*(a->nz*2 - A->m));
1101   PetscFunctionReturn(0);
1102 }
1103 
1104 #undef __FUNCT__
1105 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1106 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1107 {
1108   int ierr;
1109 
1110   PetscFunctionBegin;
1111   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 #undef __FUNCT__
1116 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1117 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1118 
1119 {
1120   int ierr;
1121 
1122   PetscFunctionBegin;
1123   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1124   PetscFunctionReturn(0);
1125 }
1126 
1127 #undef __FUNCT__
1128 #define __FUNCT__ "MatScale_SeqSBAIJ"
1129 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1130 {
1131   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1132   int         one = 1,totalnz = a->bs2*a->nz;
1133 
1134   PetscFunctionBegin;
1135   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1136   PetscLogFlops(totalnz);
1137   PetscFunctionReturn(0);
1138 }
1139 
1140 #undef __FUNCT__
1141 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1142 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1143 {
1144   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1145   MatScalar   *v = a->a;
1146   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1147   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1148   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1149 
1150   PetscFunctionBegin;
1151   if (type == NORM_FROBENIUS) {
1152     for (k=0; k<mbs; k++){
1153       jmin = a->i[k]; jmax = a->i[k+1];
1154       col  = aj + jmin;
1155       if (*col == k){         /* diagonal block */
1156         for (i=0; i<bs2; i++){
1157 #if defined(PETSC_USE_COMPLEX)
1158           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1159 #else
1160           sum_diag += (*v)*(*v); v++;
1161 #endif
1162         }
1163         jmin++;
1164       }
1165       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1166         for (i=0; i<bs2; i++){
1167 #if defined(PETSC_USE_COMPLEX)
1168           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1169 #else
1170           sum_off += (*v)*(*v); v++;
1171 #endif
1172         }
1173       }
1174     }
1175     *norm = sqrt(sum_diag + 2*sum_off);
1176 
1177   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1178     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1179     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1180     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1181     for (i=0; i<mbs; i++) {
1182       jl[i] = mbs; il[0] = 0;
1183     }
1184 
1185     *norm = 0.0;
1186     for (k=0; k<mbs; k++) { /* k_th block row */
1187       for (j=0; j<bs; j++) sum[j]=0.0;
1188 
1189       /*-- col sum --*/
1190       i = jl[k]; /* first |A(i,k)| to be added */
1191       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1192                   at step k */
1193       while (i<mbs){
1194         nexti = jl[i];  /* next block row to be added */
1195         ik    = il[i];  /* block index of A(i,k) in the array a */
1196         for (j=0; j<bs; j++){
1197           v = a->a + ik*bs2 + j*bs;
1198           for (k1=0; k1<bs; k1++) {
1199             sum[j] += PetscAbsScalar(*v); v++;
1200           }
1201         }
1202         /* update il, jl */
1203         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1204         jmax = a->i[i+1];
1205         if (jmin < jmax){
1206           il[i] = jmin;
1207           j   = a->j[jmin];
1208           jl[i] = jl[j]; jl[j]=i;
1209         }
1210         i = nexti;
1211       }
1212 
1213       /*-- row sum --*/
1214       jmin = a->i[k]; jmax = a->i[k+1];
1215       for (i=jmin; i<jmax; i++) {
1216         for (j=0; j<bs; j++){
1217           v = a->a + i*bs2 + j;
1218           for (k1=0; k1<bs; k1++){
1219             sum[j] += PetscAbsScalar(*v);
1220             v   += bs;
1221           }
1222         }
1223       }
1224       /* add k_th block row to il, jl */
1225       col = aj+jmin;
1226       if (*col == k) jmin++;
1227       if (jmin < jmax){
1228         il[k] = jmin;
1229         j   = a->j[jmin];
1230         jl[k] = jl[j]; jl[j] = k;
1231       }
1232       for (j=0; j<bs; j++){
1233         if (sum[j] > *norm) *norm = sum[j];
1234       }
1235     }
1236     ierr = PetscFree(il);CHKERRQ(ierr);
1237     ierr = PetscFree(jl);CHKERRQ(ierr);
1238     ierr = PetscFree(sum);CHKERRQ(ierr);
1239   } else {
1240     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1241   }
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNCT__
1246 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1247 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1248 {
1249   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1250   int          ierr;
1251 
1252   PetscFunctionBegin;
1253 
1254   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1255   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1256     *flg = PETSC_FALSE;
1257     PetscFunctionReturn(0);
1258   }
1259 
1260   /* if the a->i are the same */
1261   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1262   if (*flg == PETSC_FALSE) {
1263     PetscFunctionReturn(0);
1264   }
1265 
1266   /* if a->j are the same */
1267   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1268   if (*flg == PETSC_FALSE) {
1269     PetscFunctionReturn(0);
1270   }
1271   /* if a->a are the same */
1272   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1273 
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1279 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1280 {
1281   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1282   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1283   PetscScalar  *x,zero = 0.0;
1284   MatScalar    *aa,*aa_j;
1285 
1286   PetscFunctionBegin;
1287   bs   = a->bs;
1288   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1289 
1290   aa   = a->a;
1291   ai   = a->i;
1292   aj   = a->j;
1293   ambs = a->mbs;
1294   bs2  = a->bs2;
1295 
1296   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1297   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1298   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1299   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1300   for (i=0; i<ambs; i++) {
1301     j=ai[i];
1302     if (aj[j] == i) {             /* if this is a diagonal element */
1303       row  = i*bs;
1304       aa_j = aa + j*bs2;
1305       if (A->factor && bs==1){
1306         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1307       } else {
1308         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1309       }
1310     }
1311   }
1312 
1313   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 #undef __FUNCT__
1318 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1319 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1320 {
1321   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1322   PetscScalar  *l,*r,x,*li,*ri;
1323   MatScalar    *aa,*v;
1324   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1325 
1326   PetscFunctionBegin;
1327   ai  = a->i;
1328   aj  = a->j;
1329   aa  = a->a;
1330   m   = A->m;
1331   bs  = a->bs;
1332   mbs = a->mbs;
1333   bs2 = a->bs2;
1334 
1335   if (ll != rr) {
1336     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1337   }
1338   if (ll) {
1339     ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr);
1340     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1341     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1342     for (i=0; i<mbs; i++) { /* for each block row */
1343       M  = ai[i+1] - ai[i];
1344       li = l + i*bs;
1345       v  = aa + bs2*ai[i];
1346       for (j=0; j<M; j++) { /* for each block */
1347         for (k=0; k<bs2; k++) {
1348           (*v++) *= li[k%bs];
1349         }
1350 #ifdef CONT
1351         /* will be used to replace the above loop */
1352         ri = l + bs*aj[ai[i]+j];
1353         for (k=0; k<bs; k++) { /* column value */
1354           x = ri[k];
1355           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1356         }
1357 #endif
1358 
1359       }
1360     }
1361     ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr);
1362     PetscLogFlops(2*a->nz);
1363   }
1364   /* will be deleted */
1365   if (rr) {
1366     ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr);
1367     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1368     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1369     for (i=0; i<mbs; i++) { /* for each block row */
1370       M  = ai[i+1] - ai[i];
1371       v  = aa + bs2*ai[i];
1372       for (j=0; j<M; j++) { /* for each block */
1373         ri = r + bs*aj[ai[i]+j];
1374         for (k=0; k<bs; k++) {
1375           x = ri[k];
1376           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1377         }
1378       }
1379     }
1380     ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr);
1381     PetscLogFlops(a->nz);
1382   }
1383   PetscFunctionReturn(0);
1384 }
1385 
1386 #undef __FUNCT__
1387 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1388 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1389 {
1390   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1391 
1392   PetscFunctionBegin;
1393   info->rows_global    = (double)A->m;
1394   info->columns_global = (double)A->m;
1395   info->rows_local     = (double)A->m;
1396   info->columns_local  = (double)A->m;
1397   info->block_size     = a->bs2;
1398   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1399   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1400   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1401   info->assemblies   = A->num_ass;
1402   info->mallocs      = a->reallocs;
1403   info->memory       = A->mem;
1404   if (A->factor) {
1405     info->fill_ratio_given  = A->info.fill_ratio_given;
1406     info->fill_ratio_needed = A->info.fill_ratio_needed;
1407     info->factor_mallocs    = A->info.factor_mallocs;
1408   } else {
1409     info->fill_ratio_given  = 0;
1410     info->fill_ratio_needed = 0;
1411     info->factor_mallocs    = 0;
1412   }
1413   PetscFunctionReturn(0);
1414 }
1415 
1416 
1417 #undef __FUNCT__
1418 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1419 int MatZeroEntries_SeqSBAIJ(Mat A)
1420 {
1421   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1422   int         ierr;
1423 
1424   PetscFunctionBegin;
1425   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 #undef __FUNCT__
1430 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1431 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1432 {
1433   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1434   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1435   PetscReal    atmp;
1436   MatScalar    *aa;
1437   PetscScalar  zero = 0.0,*x;
1438   int          ncols,brow,bcol,krow,kcol;
1439 
1440   PetscFunctionBegin;
1441   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1442   bs   = a->bs;
1443   aa   = a->a;
1444   ai   = a->i;
1445   aj   = a->j;
1446   mbs = a->mbs;
1447 
1448   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1449   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1450   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1451   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1452   for (i=0; i<mbs; i++) {
1453     ncols = ai[1] - ai[0]; ai++;
1454     brow  = bs*i;
1455     for (j=0; j<ncols; j++){
1456       bcol = bs*(*aj);
1457       for (kcol=0; kcol<bs; kcol++){
1458         col = bcol + kcol;      /* col index */
1459         for (krow=0; krow<bs; krow++){
1460           atmp = PetscAbsScalar(*aa); aa++;
1461           row = brow + krow;    /* row index */
1462           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1463           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1464           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1465         }
1466       }
1467       aj++;
1468     }
1469   }
1470   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1471   PetscFunctionReturn(0);
1472 }
1473