xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 94b7f48cc472a54ea2ce57edf1fe19e8a254237c)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/vec/vecimpl.h"
5 #include "src/inline/spops.h"
6 #include "src/inline/ilu.h"
7 #include "petscbt.h"
8 #include "src/mat/impls/sbaij/seq/sbaij.h"
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
12 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
13 {
14   PetscFunctionBegin;
15   SETERRQ(1,"Function not yet written for SBAIJ format");
16   /* PetscFunctionReturn(0); */
17 }
18 
19 #undef __FUNCT__
20 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
21 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
22 {
23   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
24   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
25   int          row,mat_i,*mat_j,tcol,*mat_ilen;
26   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
27   int          *aj = a->j,*ai = a->i;
28   MatScalar    *mat_a;
29   Mat          C;
30   PetscTruth   flag;
31 
32   PetscFunctionBegin;
33 
34   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
35   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
36   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
37 
38   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
39   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
40 
41   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
42   ssmap = smap;
43   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
44   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
45   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
46   /* determine lens of each row */
47   for (i=0; i<nrows; i++) {
48     kstart  = ai[irow[i]];
49     kend    = kstart + a->ilen[irow[i]];
50     lens[i] = 0;
51       for (k=kstart; k<kend; k++) {
52         if (ssmap[aj[k]]) {
53           lens[i]++;
54         }
55       }
56     }
57   /* Create and fill new matrix */
58   if (scall == MAT_REUSE_MATRIX) {
59     c = (Mat_SeqSBAIJ *)((*B)->data);
60 
61     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
62     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
63     if (flag == PETSC_FALSE) {
64       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
65     }
66     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
67     C = *B;
68   } else {
69     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
70   }
71   c = (Mat_SeqSBAIJ *)(C->data);
72   for (i=0; i<nrows; i++) {
73     row    = irow[i];
74     kstart = ai[row];
75     kend   = kstart + a->ilen[row];
76     mat_i  = c->i[i];
77     mat_j  = c->j + mat_i;
78     mat_a  = c->a + mat_i*bs2;
79     mat_ilen = c->ilen + i;
80     for (k=kstart; k<kend; k++) {
81       if ((tcol=ssmap[a->j[k]])) {
82         *mat_j++ = tcol - 1;
83         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
84         mat_a   += bs2;
85         (*mat_ilen)++;
86       }
87     }
88   }
89 
90   /* Free work space */
91   ierr = PetscFree(smap);CHKERRQ(ierr);
92   ierr = PetscFree(lens);CHKERRQ(ierr);
93   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
94   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95 
96   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
97   *B = C;
98   PetscFunctionReturn(0);
99 }
100 
101 #undef __FUNCT__
102 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
103 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
104 {
105   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
106   IS          is1;
107   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   /* Verify if the indices corespond to each element in a block
116    and form the IS with compressed IS */
117   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
118   iary = vary + a->mbs;
119   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
121 
122   count = 0;
123   for (i=0; i<a->mbs; i++) {
124     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
125     if (vary[i]==bs) iary[count++] = i;
126   }
127   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
128 
129   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
130   ierr = PetscFree(vary);CHKERRQ(ierr);
131 
132   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
133   ISDestroy(is1);
134   PetscFunctionReturn(0);
135 }
136 
137 #undef __FUNCT__
138 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
139 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
140 {
141   int ierr,i;
142 
143   PetscFunctionBegin;
144   if (scall == MAT_INITIAL_MATRIX) {
145     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
146   }
147 
148   for (i=0; i<n; i++) {
149     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
150   }
151   PetscFunctionReturn(0);
152 }
153 
154 /* -------------------------------------------------------*/
155 /* Should check that shapes of vectors and matrices match */
156 /* -------------------------------------------------------*/
157 #include "petscblaslapack.h"
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
161 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
162 {
163   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
164   PetscScalar     *x,*z,*xb,x1,zero=0.0;
165   MatScalar       *v;
166   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
167 
168   PetscFunctionBegin;
169   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
171   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
172 
173   v  = a->a;
174   xb = x;
175 
176   for (i=0; i<mbs; i++) {
177     n  = ai[1] - ai[0];  /* length of i_th row of A */
178     x1 = xb[0];
179     ib = aj + *ai;
180     jmin = 0;
181     if (*ib == i) {      /* (diag of A)*x */
182       z[i] += *v++ * x[*ib++];
183       jmin++;
184     }
185     for (j=jmin; j<n; j++) {
186       cval    = *ib;
187       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
188       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
189     }
190     xb++; ai++;
191   }
192 
193   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
194   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
195   PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m);  /* s_nz = (nz+m)/2 */
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
201 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
202 {
203   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
204   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
205   MatScalar        *v;
206   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
207 
208 
209   PetscFunctionBegin;
210   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
211   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
212   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
213 
214   v     = a->a;
215   xb = x;
216 
217   for (i=0; i<mbs; i++) {
218     n  = ai[1] - ai[0]; /* length of i_th block row of A */
219     x1 = xb[0]; x2 = xb[1];
220     ib = aj + *ai;
221     jmin = 0;
222     if (*ib == i){     /* (diag of A)*x */
223       z[2*i]   += v[0]*x1 + v[2]*x2;
224       z[2*i+1] += v[2]*x1 + v[3]*x2;
225       v += 4; jmin++;
226     }
227     for (j=jmin; j<n; j++) {
228       /* (strict lower triangular part of A)*x  */
229       cval       = ib[j]*2;
230       z[cval]     += v[0]*x1 + v[1]*x2;
231       z[cval+1]   += v[2]*x1 + v[3]*x2;
232       /* (strict upper triangular part of A)*x  */
233       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
234       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
235       v  += 4;
236     }
237     xb +=2; ai++;
238   }
239 
240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
241   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
242   PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
248 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
249 {
250   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
251   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
252   MatScalar     *v;
253   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
254 
255 
256   PetscFunctionBegin;
257   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
258   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
259   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
260 
261   v     = a->a;
262   xb = x;
263 
264   for (i=0; i<mbs; i++) {
265     n  = ai[1] - ai[0]; /* length of i_th block row of A */
266     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
267     ib = aj + *ai;
268     jmin = 0;
269     if (*ib == i){     /* (diag of A)*x */
270       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
271       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
272       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
273       v += 9; jmin++;
274     }
275     for (j=jmin; j<n; j++) {
276       /* (strict lower triangular part of A)*x  */
277       cval       = ib[j]*3;
278       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
279       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
280       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
281       /* (strict upper triangular part of A)*x  */
282       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
283       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
284       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
285       v  += 9;
286     }
287     xb +=3; ai++;
288   }
289 
290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
292   PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
293   PetscFunctionReturn(0);
294 }
295 
296 #undef __FUNCT__
297 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
298 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
299 {
300   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
301   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
302   MatScalar        *v;
303   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
304 
305   PetscFunctionBegin;
306   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
307   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
309 
310   v     = a->a;
311   xb = x;
312 
313   for (i=0; i<mbs; i++) {
314     n  = ai[1] - ai[0]; /* length of i_th block row of A */
315     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
316     ib = aj + *ai;
317     jmin = 0;
318     if (*ib == i){     /* (diag of A)*x */
319       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
320       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
321       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
322       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
323       v += 16; jmin++;
324     }
325     for (j=jmin; j<n; j++) {
326       /* (strict lower triangular part of A)*x  */
327       cval       = ib[j]*4;
328       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
329       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
330       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
331       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
332       /* (strict upper triangular part of A)*x  */
333       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
334       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
335       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
336       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
337       v  += 16;
338     }
339     xb +=4; ai++;
340   }
341 
342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
350 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
351 {
352   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
353   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
354   MatScalar       *v;
355   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
356 
357   PetscFunctionBegin;
358   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
361 
362   v     = a->a;
363   xb = x;
364 
365   for (i=0; i<mbs; i++) {
366     n  = ai[1] - ai[0]; /* length of i_th block row of A */
367     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
368     ib = aj + *ai;
369     jmin = 0;
370     if (*ib == i){      /* (diag of A)*x */
371       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
372       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
373       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
374       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
375       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
376       v += 25; jmin++;
377     }
378     for (j=jmin; j<n; j++) {
379       /* (strict lower triangular part of A)*x  */
380       cval       = ib[j]*5;
381       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
382       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
383       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
384       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
385       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
386       /* (strict upper triangular part of A)*x  */
387       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
388       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
389       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
390       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
391       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
392       v  += 25;
393     }
394     xb +=5; ai++;
395   }
396 
397   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
398   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
399   PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
400   PetscFunctionReturn(0);
401 }
402 
403 
404 #undef __FUNCT__
405 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
406 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
407 {
408   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
409   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
410   MatScalar       *v;
411   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
412 
413   PetscFunctionBegin;
414   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
415   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
417 
418   v     = a->a;
419   xb = x;
420 
421   for (i=0; i<mbs; i++) {
422     n  = ai[1] - ai[0]; /* length of i_th block row of A */
423     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
424     ib = aj + *ai;
425     jmin = 0;
426     if (*ib == i){      /* (diag of A)*x */
427       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
428       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
429       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
430       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
431       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
432       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
433       v += 36; jmin++;
434     }
435     for (j=jmin; j<n; j++) {
436       /* (strict lower triangular part of A)*x  */
437       cval       = ib[j]*6;
438       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
439       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
440       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
441       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
442       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
443       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
444       /* (strict upper triangular part of A)*x  */
445       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
446       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
447       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
448       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
449       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
450       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
451       v  += 36;
452     }
453     xb +=6; ai++;
454   }
455 
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
458   PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
459   PetscFunctionReturn(0);
460 }
461 #undef __FUNCT__
462 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
463 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
464 {
465   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
466   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
467   MatScalar       *v;
468   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
469 
470   PetscFunctionBegin;
471   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
472   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
473   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
474 
475   v     = a->a;
476   xb = x;
477 
478   for (i=0; i<mbs; i++) {
479     n  = ai[1] - ai[0]; /* length of i_th block row of A */
480     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
481     ib = aj + *ai;
482     jmin = 0;
483     if (*ib == i){      /* (diag of A)*x */
484       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
485       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
486       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
487       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
488       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
489       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
490       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
491       v += 49; jmin++;
492     }
493     for (j=jmin; j<n; j++) {
494       /* (strict lower triangular part of A)*x  */
495       cval       = ib[j]*7;
496       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503       /* (strict upper triangular part of A)*x  */
504       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511       v  += 49;
512     }
513     xb +=7; ai++;
514   }
515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
516   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
517   PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
518   PetscFunctionReturn(0);
519 }
520 
521 /*
522     This will not work with MatScalar == float because it calls the BLAS
523 */
524 #undef __FUNCT__
525 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
526 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527 {
528   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
529   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
530   MatScalar       *v;
531   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
532   int             ncols,k;
533 
534   PetscFunctionBegin;
535   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
536   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
537   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
538 
539   aj   = a->j;
540   v    = a->a;
541   ii   = a->i;
542 
543   if (!a->mult_work) {
544     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
545   }
546   work = a->mult_work;
547 
548   for (i=0; i<mbs; i++) {
549     n     = ii[1] - ii[0]; ncols = n*bs;
550     workt = work; idx=aj+ii[0];
551 
552     /* upper triangular part */
553     for (j=0; j<n; j++) {
554       xb = x_ptr + bs*(*idx++);
555       for (k=0; k<bs; k++) workt[k] = xb[k];
556       workt += bs;
557     }
558     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
559     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
560 
561     /* strict lower triangular part */
562     idx = aj+ii[0];
563     if (*idx == i){
564       ncols -= bs; v += bs2; idx++; n--;
565     }
566 
567     if (ncols > 0){
568       workt = work;
569       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
570       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
571       for (j=0; j<n; j++) {
572         zb = z_ptr + bs*(*idx++);
573         for (k=0; k<bs; k++) zb[k] += workt[k] ;
574         workt += bs;
575       }
576     }
577     x += bs; v += n*bs2; z += bs; ii++;
578   }
579 
580   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
581   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
582   PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
583   PetscFunctionReturn(0);
584 }
585 
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
588 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
589 {
590   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
591   PetscScalar     *x,*y,*z,*xb,x1;
592   MatScalar       *v;
593   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
594 
595   PetscFunctionBegin;
596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
597   if (yy != xx) {
598     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   } else {
600     y = x;
601   }
602   if (zz != yy) {
603     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
606   } else {
607     z = y;
608   }
609 
610   v  = a->a;
611   xb = x;
612 
613   for (i=0; i<mbs; i++) {
614     n  = ai[1] - ai[0];  /* length of i_th row of A */
615     x1 = xb[0];
616     ib = aj + *ai;
617     jmin = 0;
618     if (*ib == i) {            /* (diag of A)*x */
619       z[i] += *v++ * x[*ib++]; jmin++;
620     }
621     for (j=jmin; j<n; j++) {
622       cval    = *ib;
623       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
624       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
625     }
626     xb++; ai++;
627   }
628 
629   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
630   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
631   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
632 
633   PetscLogFlops(2*(a->s_nz*2 - A->m));
634   PetscFunctionReturn(0);
635 }
636 
637 #undef __FUNCT__
638 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
639 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
640 {
641   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
642   PetscScalar     *x,*y,*z,*xb,x1,x2;
643   MatScalar       *v;
644   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
645 
646   PetscFunctionBegin;
647   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
648   if (yy != xx) {
649     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
650   } else {
651     y = x;
652   }
653   if (zz != yy) {
654     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
655     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
656     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
657   } else {
658     z = y;
659   }
660 
661   v     = a->a;
662   xb = x;
663 
664   for (i=0; i<mbs; i++) {
665     n  = ai[1] - ai[0]; /* length of i_th block row of A */
666     x1 = xb[0]; x2 = xb[1];
667     ib = aj + *ai;
668     jmin = 0;
669     if (*ib == i){      /* (diag of A)*x */
670       z[2*i]   += v[0]*x1 + v[2]*x2;
671       z[2*i+1] += v[2]*x1 + v[3]*x2;
672       v += 4; jmin++;
673     }
674     for (j=jmin; j<n; j++) {
675       /* (strict lower triangular part of A)*x  */
676       cval       = ib[j]*2;
677       z[cval]     += v[0]*x1 + v[1]*x2;
678       z[cval+1]   += v[2]*x1 + v[3]*x2;
679       /* (strict upper triangular part of A)*x  */
680       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
681       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
682       v  += 4;
683     }
684     xb +=2; ai++;
685   }
686 
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
689   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
690 
691   PetscLogFlops(4*(a->s_nz*2 - A->m));
692   PetscFunctionReturn(0);
693 }
694 
695 #undef __FUNCT__
696 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
697 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
698 {
699   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
700   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
701   MatScalar       *v;
702   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
703 
704   PetscFunctionBegin;
705   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
706   if (yy != xx) {
707     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
708   } else {
709     y = x;
710   }
711   if (zz != yy) {
712     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
713     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
714     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
715   } else {
716     z = y;
717   }
718 
719   v     = a->a;
720   xb = x;
721 
722   for (i=0; i<mbs; i++) {
723     n  = ai[1] - ai[0]; /* length of i_th block row of A */
724     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
725     ib = aj + *ai;
726     jmin = 0;
727     if (*ib == i){     /* (diag of A)*x */
728      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
729      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
730      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
731      v += 9; jmin++;
732     }
733     for (j=jmin; j<n; j++) {
734       /* (strict lower triangular part of A)*x  */
735       cval       = ib[j]*3;
736       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
737       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
738       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
739       /* (strict upper triangular part of A)*x  */
740       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
741       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
742       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
743       v  += 9;
744     }
745     xb +=3; ai++;
746   }
747 
748   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
749   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
750   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
751 
752   PetscLogFlops(18*(a->s_nz*2 - A->m));
753   PetscFunctionReturn(0);
754 }
755 
756 #undef __FUNCT__
757 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
758 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
759 {
760   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
761   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
762   MatScalar       *v;
763   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
764 
765   PetscFunctionBegin;
766   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
767   if (yy != xx) {
768     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
769   } else {
770     y = x;
771   }
772   if (zz != yy) {
773     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
774     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
775     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
776   } else {
777     z = y;
778   }
779 
780   v     = a->a;
781   xb = x;
782 
783   for (i=0; i<mbs; i++) {
784     n  = ai[1] - ai[0]; /* length of i_th block row of A */
785     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
786     ib = aj + *ai;
787     jmin = 0;
788     if (*ib == i){      /* (diag of A)*x */
789       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
790       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
791       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
792       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
793       v += 16; jmin++;
794     }
795     for (j=jmin; j<n; j++) {
796       /* (strict lower triangular part of A)*x  */
797       cval       = ib[j]*4;
798       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
799       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
800       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
801       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
802       /* (strict upper triangular part of A)*x  */
803       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
804       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
805       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
806       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
807       v  += 16;
808     }
809     xb +=4; ai++;
810   }
811 
812   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
813   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
814   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
815 
816   PetscLogFlops(32*(a->s_nz*2 - A->m));
817   PetscFunctionReturn(0);
818 }
819 
820 #undef __FUNCT__
821 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
822 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
823 {
824   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
825   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
826   MatScalar       *v;
827   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
828 
829   PetscFunctionBegin;
830   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
831   if (yy != xx) {
832     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
833   } else {
834     y = x;
835   }
836   if (zz != yy) {
837     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
838     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
839     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
840   } else {
841     z = y;
842   }
843 
844   v     = a->a;
845   xb = x;
846 
847   for (i=0; i<mbs; i++) {
848     n  = ai[1] - ai[0]; /* length of i_th block row of A */
849     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
850     ib = aj + *ai;
851     jmin = 0;
852     if (*ib == i){      /* (diag of A)*x */
853       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
854       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
855       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
856       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
857       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
858       v += 25; jmin++;
859     }
860     for (j=jmin; j<n; j++) {
861       /* (strict lower triangular part of A)*x  */
862       cval       = ib[j]*5;
863       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
864       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
865       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
866       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
867       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
868       /* (strict upper triangular part of A)*x  */
869       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
870       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
871       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
872       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
873       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
874       v  += 25;
875     }
876     xb +=5; ai++;
877   }
878 
879   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
880   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
881   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
882 
883   PetscLogFlops(50*(a->s_nz*2 - A->m));
884   PetscFunctionReturn(0);
885 }
886 #undef __FUNCT__
887 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
888 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
889 {
890   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
891   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
892   MatScalar       *v;
893   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
894 
895   PetscFunctionBegin;
896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
897   if (yy != xx) {
898     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
899   } else {
900     y = x;
901   }
902   if (zz != yy) {
903     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
904     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
905     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
906   } else {
907     z = y;
908   }
909 
910   v     = a->a;
911   xb = x;
912 
913   for (i=0; i<mbs; i++) {
914     n  = ai[1] - ai[0]; /* length of i_th block row of A */
915     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
916     ib = aj + *ai;
917     jmin = 0;
918     if (*ib == i){     /* (diag of A)*x */
919       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
920       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
921       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
922       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
923       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
924       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
925       v += 36; jmin++;
926     }
927     for (j=jmin; j<n; j++) {
928       /* (strict lower triangular part of A)*x  */
929       cval       = ib[j]*6;
930       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
931       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
932       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
933       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
934       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
935       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
936       /* (strict upper triangular part of A)*x  */
937       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
938       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
939       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
940       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
941       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
942       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
943       v  += 36;
944     }
945     xb +=6; ai++;
946   }
947 
948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
949   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
950   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
951 
952   PetscLogFlops(72*(a->s_nz*2 - A->m));
953   PetscFunctionReturn(0);
954 }
955 
956 #undef __FUNCT__
957 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
958 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
959 {
960   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
961   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
962   MatScalar       *v;
963   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
964 
965   PetscFunctionBegin;
966   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
967   if (yy != xx) {
968     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
969   } else {
970     y = x;
971   }
972   if (zz != yy) {
973     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
974     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
975     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
976   } else {
977     z = y;
978   }
979 
980   v     = a->a;
981   xb = x;
982 
983   for (i=0; i<mbs; i++) {
984     n  = ai[1] - ai[0]; /* length of i_th block row of A */
985     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
986     ib = aj + *ai;
987     jmin = 0;
988     if (*ib == i){     /* (diag of A)*x */
989       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
990       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
991       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
992       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
993       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
994       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
995       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
996       v += 49; jmin++;
997     }
998     for (j=jmin; j<n; j++) {
999       /* (strict lower triangular part of A)*x  */
1000       cval       = ib[j]*7;
1001       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1002       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1003       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1004       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1005       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1006       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1007       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1008       /* (strict upper triangular part of A)*x  */
1009       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1010       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1011       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1012       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1013       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1014       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1015       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1016       v  += 49;
1017     }
1018     xb +=7; ai++;
1019   }
1020 
1021   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1022   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1023   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1024 
1025   PetscLogFlops(98*(a->s_nz*2 - A->m));
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 #undef __FUNCT__
1030 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1031 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1032 {
1033   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1034   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1035   MatScalar       *v;
1036   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1037   int             ncols,k;
1038 
1039   PetscFunctionBegin;
1040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1041   if (yy != xx) {
1042     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1043   } else {
1044     y = x;
1045   }
1046   if (zz != yy) {
1047     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1048     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1049     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1050   } else {
1051     z = y;
1052   }
1053 
1054   aj   = a->j;
1055   v    = a->a;
1056   ii   = a->i;
1057 
1058   if (!a->mult_work) {
1059     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1060   }
1061   work = a->mult_work;
1062 
1063 
1064   for (i=0; i<mbs; i++) {
1065     n     = ii[1] - ii[0]; ncols = n*bs;
1066     workt = work; idx=aj+ii[0];
1067 
1068     /* upper triangular part */
1069     for (j=0; j<n; j++) {
1070       xb = x_ptr + bs*(*idx++);
1071       for (k=0; k<bs; k++) workt[k] = xb[k];
1072       workt += bs;
1073     }
1074     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1075     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1076 
1077     /* strict lower triangular part */
1078     idx = aj+ii[0];
1079     if (*idx == i){
1080       ncols -= bs; v += bs2; idx++; n--;
1081     }
1082     if (ncols > 0){
1083       workt = work;
1084       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1085       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1086       for (j=0; j<n; j++) {
1087         zb = z_ptr + bs*(*idx++);
1088         /* idx++; */
1089         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1090         workt += bs;
1091       }
1092     }
1093 
1094     x += bs; v += n*bs2; z += bs; ii++;
1095   }
1096 
1097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1098   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1099   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1100 
1101   PetscLogFlops(2*(a->s_nz*2 - A->m));
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 #undef __FUNCT__
1106 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1107 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1108 {
1109   int ierr;
1110 
1111   PetscFunctionBegin;
1112   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1118 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1119 
1120 {
1121   int ierr;
1122 
1123   PetscFunctionBegin;
1124   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1125   PetscFunctionReturn(0);
1126 }
1127 
1128 #undef __FUNCT__
1129 #define __FUNCT__ "MatScale_SeqSBAIJ"
1130 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1131 {
1132   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1133   int         one = 1,totalnz = a->bs2*a->s_nz;
1134 
1135   PetscFunctionBegin;
1136   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1137   PetscLogFlops(totalnz);
1138   PetscFunctionReturn(0);
1139 }
1140 
1141 #undef __FUNCT__
1142 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1143 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1144 {
1145   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1146   MatScalar   *v = a->a;
1147   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1148   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1149   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1150 
1151   PetscFunctionBegin;
1152   if (type == NORM_FROBENIUS) {
1153     for (k=0; k<mbs; k++){
1154       jmin = a->i[k]; jmax = a->i[k+1];
1155       col  = aj + jmin;
1156       if (*col == k){         /* diagonal block */
1157         for (i=0; i<bs2; i++){
1158 #if defined(PETSC_USE_COMPLEX)
1159           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1160 #else
1161           sum_diag += (*v)*(*v); v++;
1162 #endif
1163         }
1164         jmin++;
1165       }
1166       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1167         for (i=0; i<bs2; i++){
1168 #if defined(PETSC_USE_COMPLEX)
1169           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1170 #else
1171           sum_off += (*v)*(*v); v++;
1172 #endif
1173         }
1174       }
1175     }
1176     *norm = sqrt(sum_diag + 2*sum_off);
1177 
1178   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1179     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1180     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1181     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1182     for (i=0; i<mbs; i++) {
1183       jl[i] = mbs; il[0] = 0;
1184     }
1185 
1186     *norm = 0.0;
1187     for (k=0; k<mbs; k++) { /* k_th block row */
1188       for (j=0; j<bs; j++) sum[j]=0.0;
1189 
1190       /*-- col sum --*/
1191       i = jl[k]; /* first |A(i,k)| to be added */
1192       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1193                   at step k */
1194       while (i<mbs){
1195         nexti = jl[i];  /* next block row to be added */
1196         ik    = il[i];  /* block index of A(i,k) in the array a */
1197         for (j=0; j<bs; j++){
1198           v = a->a + ik*bs2 + j*bs;
1199           for (k1=0; k1<bs; k1++) {
1200             sum[j] += PetscAbsScalar(*v); v++;
1201           }
1202         }
1203         /* update il, jl */
1204         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1205         jmax = a->i[i+1];
1206         if (jmin < jmax){
1207           il[i] = jmin;
1208           j   = a->j[jmin];
1209           jl[i] = jl[j]; jl[j]=i;
1210         }
1211         i = nexti;
1212       }
1213 
1214       /*-- row sum --*/
1215       jmin = a->i[k]; jmax = a->i[k+1];
1216       for (i=jmin; i<jmax; i++) {
1217         for (j=0; j<bs; j++){
1218           v = a->a + i*bs2 + j;
1219           for (k1=0; k1<bs; k1++){
1220             sum[j] += PetscAbsScalar(*v);
1221             v   += bs;
1222           }
1223         }
1224       }
1225       /* add k_th block row to il, jl */
1226       col = aj+jmin;
1227       if (*col == k) jmin++;
1228       if (jmin < jmax){
1229         il[k] = jmin;
1230         j   = a->j[jmin];
1231         jl[k] = jl[j]; jl[j] = k;
1232       }
1233       for (j=0; j<bs; j++){
1234         if (sum[j] > *norm) *norm = sum[j];
1235       }
1236     }
1237     ierr = PetscFree(il);CHKERRQ(ierr);
1238     ierr = PetscFree(jl);CHKERRQ(ierr);
1239     ierr = PetscFree(sum);CHKERRQ(ierr);
1240   } else {
1241     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1242   }
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNCT__
1247 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1248 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1249 {
1250   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1251   int          ierr;
1252 
1253   PetscFunctionBegin;
1254 
1255   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1256   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1257     *flg = PETSC_FALSE;
1258     PetscFunctionReturn(0);
1259   }
1260 
1261   /* if the a->i are the same */
1262   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1263   if (*flg == PETSC_FALSE) {
1264     PetscFunctionReturn(0);
1265   }
1266 
1267   /* if a->j are the same */
1268   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1269   if (*flg == PETSC_FALSE) {
1270     PetscFunctionReturn(0);
1271   }
1272   /* if a->a are the same */
1273   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1274 
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1280 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1281 {
1282   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1283   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1284   PetscScalar  *x,zero = 0.0;
1285   MatScalar    *aa,*aa_j;
1286 
1287   PetscFunctionBegin;
1288   bs   = a->bs;
1289   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1290 
1291   aa   = a->a;
1292   ai   = a->i;
1293   aj   = a->j;
1294   ambs = a->mbs;
1295   bs2  = a->bs2;
1296 
1297   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1298   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1299   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1300   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1301   for (i=0; i<ambs; i++) {
1302     j=ai[i];
1303     if (aj[j] == i) {             /* if this is a diagonal element */
1304       row  = i*bs;
1305       aa_j = aa + j*bs2;
1306       if (A->factor && bs==1){
1307         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1308       } else {
1309         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1310       }
1311     }
1312   }
1313 
1314   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1320 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1321 {
1322   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1323   PetscScalar  *l,*r,x,*li,*ri;
1324   MatScalar    *aa,*v;
1325   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1326 
1327   PetscFunctionBegin;
1328   ai  = a->i;
1329   aj  = a->j;
1330   aa  = a->a;
1331   m   = A->m;
1332   bs  = a->bs;
1333   mbs = a->mbs;
1334   bs2 = a->bs2;
1335 
1336   if (ll != rr) {
1337     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1338   }
1339   if (ll) {
1340     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1341     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1342     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1343     for (i=0; i<mbs; i++) { /* for each block row */
1344       M  = ai[i+1] - ai[i];
1345       li = l + i*bs;
1346       v  = aa + bs2*ai[i];
1347       for (j=0; j<M; j++) { /* for each block */
1348         for (k=0; k<bs2; k++) {
1349           (*v++) *= li[k%bs];
1350         }
1351 #ifdef CONT
1352         /* will be used to replace the above loop */
1353         ri = l + bs*aj[ai[i]+j];
1354         for (k=0; k<bs; k++) { /* column value */
1355           x = ri[k];
1356           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1357         }
1358 #endif
1359 
1360       }
1361     }
1362     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1363     PetscLogFlops(2*a->s_nz);
1364   }
1365   /* will be deleted */
1366   if (rr) {
1367     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1368     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1369     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1370     for (i=0; i<mbs; i++) { /* for each block row */
1371       M  = ai[i+1] - ai[i];
1372       v  = aa + bs2*ai[i];
1373       for (j=0; j<M; j++) { /* for each block */
1374         ri = r + bs*aj[ai[i]+j];
1375         for (k=0; k<bs; k++) {
1376           x = ri[k];
1377           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1378         }
1379       }
1380     }
1381     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1382     PetscLogFlops(a->s_nz);
1383   }
1384   PetscFunctionReturn(0);
1385 }
1386 
1387 #undef __FUNCT__
1388 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1389 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1390 {
1391   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1392 
1393   PetscFunctionBegin;
1394   info->rows_global    = (double)A->m;
1395   info->columns_global = (double)A->m;
1396   info->rows_local     = (double)A->m;
1397   info->columns_local  = (double)A->m;
1398   info->block_size     = a->bs2;
1399   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1400   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1401   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1402   info->assemblies   = A->num_ass;
1403   info->mallocs      = a->reallocs;
1404   info->memory       = A->mem;
1405   if (A->factor) {
1406     info->fill_ratio_given  = A->info.fill_ratio_given;
1407     info->fill_ratio_needed = A->info.fill_ratio_needed;
1408     info->factor_mallocs    = A->info.factor_mallocs;
1409   } else {
1410     info->fill_ratio_given  = 0;
1411     info->fill_ratio_needed = 0;
1412     info->factor_mallocs    = 0;
1413   }
1414   PetscFunctionReturn(0);
1415 }
1416 
1417 
1418 #undef __FUNCT__
1419 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1420 int MatZeroEntries_SeqSBAIJ(Mat A)
1421 {
1422   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1423   int         ierr;
1424 
1425   PetscFunctionBegin;
1426   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 #undef __FUNCT__
1431 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1432 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1433 {
1434   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1435   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1436   PetscReal    atmp;
1437   MatScalar    *aa;
1438   PetscScalar  zero = 0.0,*x;
1439   int          ncols,brow,bcol,krow,kcol;
1440 
1441   PetscFunctionBegin;
1442   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1443   bs   = a->bs;
1444   aa   = a->a;
1445   ai   = a->i;
1446   aj   = a->j;
1447   mbs = a->mbs;
1448 
1449   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1450   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1451   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1452   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1453   for (i=0; i<mbs; i++) {
1454     ncols = ai[1] - ai[0]; ai++;
1455     brow  = bs*i;
1456     for (j=0; j<ncols; j++){
1457       bcol = bs*(*aj);
1458       for (kcol=0; kcol<bs; kcol++){
1459         col = bcol + kcol;      /* col index */
1460         for (krow=0; krow<bs; krow++){
1461           atmp = PetscAbsScalar(*aa); aa++;
1462           row = brow + krow;    /* row index */
1463           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1464           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1465           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1466         }
1467       }
1468       aj++;
1469     }
1470   }
1471   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1472   PetscFunctionReturn(0);
1473 }
1474