xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision e2d9671b86a8697a4c2c5fff9ed2002d1fb085ae)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   PetscFunctionBegin;
14   SETERRQ(1,"Function not yet written for SBAIJ format");
15   /* PetscFunctionReturn(0); */
16 }
17 
18 #undef __FUNCT__
19 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
20 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
21 {
22   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
23   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
24   int          row,mat_i,*mat_j,tcol,*mat_ilen;
25   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
26   int          *aj = a->j,*ai = a->i;
27   MatScalar    *mat_a;
28   Mat          C;
29   PetscTruth   flag;
30 
31   PetscFunctionBegin;
32 
33   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
34   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
35   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
36 
37   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
38   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
39 
40   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
41   ssmap = smap;
42   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
43   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
44   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
45   /* determine lens of each row */
46   for (i=0; i<nrows; i++) {
47     kstart  = ai[irow[i]];
48     kend    = kstart + a->ilen[irow[i]];
49     lens[i] = 0;
50       for (k=kstart; k<kend; k++) {
51         if (ssmap[aj[k]]) {
52           lens[i]++;
53         }
54       }
55     }
56   /* Create and fill new matrix */
57   if (scall == MAT_REUSE_MATRIX) {
58     c = (Mat_SeqSBAIJ *)((*B)->data);
59 
60     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
61     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
62     if (flag == PETSC_FALSE) {
63       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
64     }
65     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
66     C = *B;
67   } else {
68     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
69     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
70     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
71   }
72   c = (Mat_SeqSBAIJ *)(C->data);
73   for (i=0; i<nrows; i++) {
74     row    = irow[i];
75     kstart = ai[row];
76     kend   = kstart + a->ilen[row];
77     mat_i  = c->i[i];
78     mat_j  = c->j + mat_i;
79     mat_a  = c->a + mat_i*bs2;
80     mat_ilen = c->ilen + i;
81     for (k=kstart; k<kend; k++) {
82       if ((tcol=ssmap[a->j[k]])) {
83         *mat_j++ = tcol - 1;
84         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
85         mat_a   += bs2;
86         (*mat_ilen)++;
87       }
88     }
89   }
90 
91   /* Free work space */
92   ierr = PetscFree(smap);CHKERRQ(ierr);
93   ierr = PetscFree(lens);CHKERRQ(ierr);
94   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
96 
97   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
98   *B = C;
99   PetscFunctionReturn(0);
100 }
101 
102 #undef __FUNCT__
103 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
105 {
106   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
107   IS          is1;
108   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   /* Verify if the indices corespond to each element in a block
117    and form the IS with compressed IS */
118   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
119   iary = vary + a->mbs;
120   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
122 
123   count = 0;
124   for (i=0; i<a->mbs; i++) {
125     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
126     if (vary[i]==bs) iary[count++] = i;
127   }
128   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
129 
130   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = PetscFree(vary);CHKERRQ(ierr);
132 
133   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
134   ISDestroy(is1);
135   PetscFunctionReturn(0);
136 }
137 
138 #undef __FUNCT__
139 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
141 {
142   int ierr,i;
143 
144   PetscFunctionBegin;
145   if (scall == MAT_INITIAL_MATRIX) {
146     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
147   }
148 
149   for (i=0; i<n; i++) {
150     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 #include "petscblaslapack.h"
159 
160 #undef __FUNCT__
161 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
163 {
164   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
165   PetscScalar     *x,*z,*xb,x1,zero=0.0;
166   MatScalar       *v;
167   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
168 
169   PetscFunctionBegin;
170   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
171   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
172   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
173 
174   v  = a->a;
175   xb = x;
176 
177   for (i=0; i<mbs; i++) {
178     n  = ai[1] - ai[0];  /* length of i_th row of A */
179     x1 = xb[0];
180     ib = aj + *ai;
181     jmin = 0;
182     if (*ib == i) {      /* (diag of A)*x */
183       z[i] += *v++ * x[*ib++];
184       jmin++;
185     }
186     for (j=jmin; j<n; j++) {
187       cval    = *ib;
188       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
189       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
190     }
191     xb++; ai++;
192   }
193 
194   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
195   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
196   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
197   PetscFunctionReturn(0);
198 }
199 
200 #undef __FUNCT__
201 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
202 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
203 {
204   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
205   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
206   MatScalar        *v;
207   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
208 
209 
210   PetscFunctionBegin;
211   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
212   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
213   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
214 
215   v     = a->a;
216   xb = x;
217 
218   for (i=0; i<mbs; i++) {
219     n  = ai[1] - ai[0]; /* length of i_th block row of A */
220     x1 = xb[0]; x2 = xb[1];
221     ib = aj + *ai;
222     jmin = 0;
223     if (*ib == i){     /* (diag of A)*x */
224       z[2*i]   += v[0]*x1 + v[2]*x2;
225       z[2*i+1] += v[2]*x1 + v[3]*x2;
226       v += 4; jmin++;
227     }
228     for (j=jmin; j<n; j++) {
229       /* (strict lower triangular part of A)*x  */
230       cval       = ib[j]*2;
231       z[cval]     += v[0]*x1 + v[1]*x2;
232       z[cval+1]   += v[2]*x1 + v[3]*x2;
233       /* (strict upper triangular part of A)*x  */
234       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
235       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
236       v  += 4;
237     }
238     xb +=2; ai++;
239   }
240 
241   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
242   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
243   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
244   PetscFunctionReturn(0);
245 }
246 
247 #undef __FUNCT__
248 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
249 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
250 {
251   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
252   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
253   MatScalar     *v;
254   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
255 
256 
257   PetscFunctionBegin;
258   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
259   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
261 
262   v     = a->a;
263   xb = x;
264 
265   for (i=0; i<mbs; i++) {
266     n  = ai[1] - ai[0]; /* length of i_th block row of A */
267     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
268     ib = aj + *ai;
269     jmin = 0;
270     if (*ib == i){     /* (diag of A)*x */
271       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
272       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
273       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
274       v += 9; jmin++;
275     }
276     for (j=jmin; j<n; j++) {
277       /* (strict lower triangular part of A)*x  */
278       cval       = ib[j]*3;
279       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
280       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
281       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
282       /* (strict upper triangular part of A)*x  */
283       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
284       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
285       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
286       v  += 9;
287     }
288     xb +=3; ai++;
289   }
290 
291   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
292   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
293   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
294   PetscFunctionReturn(0);
295 }
296 
297 #undef __FUNCT__
298 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
299 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
300 {
301   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
302   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
303   MatScalar        *v;
304   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
305 
306   PetscFunctionBegin;
307   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
308   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
309   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
310 
311   v     = a->a;
312   xb = x;
313 
314   for (i=0; i<mbs; i++) {
315     n  = ai[1] - ai[0]; /* length of i_th block row of A */
316     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
317     ib = aj + *ai;
318     jmin = 0;
319     if (*ib == i){     /* (diag of A)*x */
320       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
321       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
322       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
323       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
324       v += 16; jmin++;
325     }
326     for (j=jmin; j<n; j++) {
327       /* (strict lower triangular part of A)*x  */
328       cval       = ib[j]*4;
329       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
330       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
331       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
332       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
333       /* (strict upper triangular part of A)*x  */
334       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
335       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
336       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
337       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
338       v  += 16;
339     }
340     xb +=4; ai++;
341   }
342 
343   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
345   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
346   PetscFunctionReturn(0);
347 }
348 
349 #undef __FUNCT__
350 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
351 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
352 {
353   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
354   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
355   MatScalar       *v;
356   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
357 
358   PetscFunctionBegin;
359   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
360   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
361   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
362 
363   v     = a->a;
364   xb = x;
365 
366   for (i=0; i<mbs; i++) {
367     n  = ai[1] - ai[0]; /* length of i_th block row of A */
368     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
369     ib = aj + *ai;
370     jmin = 0;
371     if (*ib == i){      /* (diag of A)*x */
372       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
373       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
374       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
375       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
376       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
377       v += 25; jmin++;
378     }
379     for (j=jmin; j<n; j++) {
380       /* (strict lower triangular part of A)*x  */
381       cval       = ib[j]*5;
382       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
383       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
384       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
385       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
386       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
387       /* (strict upper triangular part of A)*x  */
388       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
389       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
390       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
391       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
392       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
393       v  += 25;
394     }
395     xb +=5; ai++;
396   }
397 
398   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
399   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
400   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
401   PetscFunctionReturn(0);
402 }
403 
404 
405 #undef __FUNCT__
406 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
407 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
408 {
409   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
410   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
411   MatScalar       *v;
412   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
413 
414   PetscFunctionBegin;
415   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
416   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
417   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
418 
419   v     = a->a;
420   xb = x;
421 
422   for (i=0; i<mbs; i++) {
423     n  = ai[1] - ai[0]; /* length of i_th block row of A */
424     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
425     ib = aj + *ai;
426     jmin = 0;
427     if (*ib == i){      /* (diag of A)*x */
428       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
429       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
430       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
431       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
432       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
433       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
434       v += 36; jmin++;
435     }
436     for (j=jmin; j<n; j++) {
437       /* (strict lower triangular part of A)*x  */
438       cval       = ib[j]*6;
439       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
440       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
441       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
442       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
443       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
444       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
445       /* (strict upper triangular part of A)*x  */
446       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
447       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
448       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
449       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
450       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
451       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
452       v  += 36;
453     }
454     xb +=6; ai++;
455   }
456 
457   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
458   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
459   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
460   PetscFunctionReturn(0);
461 }
462 #undef __FUNCT__
463 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
464 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
465 {
466   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
467   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
468   MatScalar       *v;
469   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
470 
471   PetscFunctionBegin;
472   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
473   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
475 
476   v     = a->a;
477   xb = x;
478 
479   for (i=0; i<mbs; i++) {
480     n  = ai[1] - ai[0]; /* length of i_th block row of A */
481     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
482     ib = aj + *ai;
483     jmin = 0;
484     if (*ib == i){      /* (diag of A)*x */
485       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
486       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
487       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
488       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
489       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
490       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
491       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
492       v += 49; jmin++;
493     }
494     for (j=jmin; j<n; j++) {
495       /* (strict lower triangular part of A)*x  */
496       cval       = ib[j]*7;
497       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
498       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
499       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
500       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
501       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
502       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
503       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
504       /* (strict upper triangular part of A)*x  */
505       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
506       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
507       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
508       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
509       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
510       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
511       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
512       v  += 49;
513     }
514     xb +=7; ai++;
515   }
516   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
517   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
518   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
519   PetscFunctionReturn(0);
520 }
521 
522 /*
523     This will not work with MatScalar == float because it calls the BLAS
524 */
525 #undef __FUNCT__
526 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
527 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
528 {
529   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
530   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
531   MatScalar       *v;
532   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
533   int             ncols,k;
534 
535   PetscFunctionBegin;
536   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
537   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
538   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
539 
540   aj   = a->j;
541   v    = a->a;
542   ii   = a->i;
543 
544   if (!a->mult_work) {
545     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
546   }
547   work = a->mult_work;
548 
549   for (i=0; i<mbs; i++) {
550     n     = ii[1] - ii[0]; ncols = n*bs;
551     workt = work; idx=aj+ii[0];
552 
553     /* upper triangular part */
554     for (j=0; j<n; j++) {
555       xb = x_ptr + bs*(*idx++);
556       for (k=0; k<bs; k++) workt[k] = xb[k];
557       workt += bs;
558     }
559     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
560     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
561 
562     /* strict lower triangular part */
563     idx = aj+ii[0];
564     if (*idx == i){
565       ncols -= bs; v += bs2; idx++; n--;
566     }
567 
568     if (ncols > 0){
569       workt = work;
570       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
571       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
572       for (j=0; j<n; j++) {
573         zb = z_ptr + bs*(*idx++);
574         for (k=0; k<bs; k++) zb[k] += workt[k] ;
575         workt += bs;
576       }
577     }
578     x += bs; v += n*bs2; z += bs; ii++;
579   }
580 
581   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
582   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
583   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
584   PetscFunctionReturn(0);
585 }
586 
587 #undef __FUNCT__
588 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
589 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
590 {
591   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
592   PetscScalar     *x,*y,*z,*xb,x1;
593   MatScalar       *v;
594   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
595 
596   PetscFunctionBegin;
597   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
598   if (yy != xx) {
599     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
600   } else {
601     y = x;
602   }
603   if (zz != yy) {
604     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
605     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
606     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
607   } else {
608     z = y;
609   }
610 
611   v  = a->a;
612   xb = x;
613 
614   for (i=0; i<mbs; i++) {
615     n  = ai[1] - ai[0];  /* length of i_th row of A */
616     x1 = xb[0];
617     ib = aj + *ai;
618     jmin = 0;
619     if (*ib == i) {            /* (diag of A)*x */
620       z[i] += *v++ * x[*ib++]; jmin++;
621     }
622     for (j=jmin; j<n; j++) {
623       cval    = *ib;
624       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
625       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
626     }
627     xb++; ai++;
628   }
629 
630   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
631   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
632   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
633 
634   PetscLogFlops(2*(a->nz*2 - A->m));
635   PetscFunctionReturn(0);
636 }
637 
638 #undef __FUNCT__
639 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
640 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
641 {
642   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
643   PetscScalar     *x,*y,*z,*xb,x1,x2;
644   MatScalar       *v;
645   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
646 
647   PetscFunctionBegin;
648   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
649   if (yy != xx) {
650     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
651   } else {
652     y = x;
653   }
654   if (zz != yy) {
655     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
656     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
657     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
658   } else {
659     z = y;
660   }
661 
662   v     = a->a;
663   xb = x;
664 
665   for (i=0; i<mbs; i++) {
666     n  = ai[1] - ai[0]; /* length of i_th block row of A */
667     x1 = xb[0]; x2 = xb[1];
668     ib = aj + *ai;
669     jmin = 0;
670     if (*ib == i){      /* (diag of A)*x */
671       z[2*i]   += v[0]*x1 + v[2]*x2;
672       z[2*i+1] += v[2]*x1 + v[3]*x2;
673       v += 4; jmin++;
674     }
675     for (j=jmin; j<n; j++) {
676       /* (strict lower triangular part of A)*x  */
677       cval       = ib[j]*2;
678       z[cval]     += v[0]*x1 + v[1]*x2;
679       z[cval+1]   += v[2]*x1 + v[3]*x2;
680       /* (strict upper triangular part of A)*x  */
681       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
682       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
683       v  += 4;
684     }
685     xb +=2; ai++;
686   }
687 
688   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
689   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
690   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
691 
692   PetscLogFlops(4*(a->nz*2 - A->m));
693   PetscFunctionReturn(0);
694 }
695 
696 #undef __FUNCT__
697 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
698 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
699 {
700   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
701   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
702   MatScalar       *v;
703   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
704 
705   PetscFunctionBegin;
706   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
707   if (yy != xx) {
708     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
709   } else {
710     y = x;
711   }
712   if (zz != yy) {
713     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
714     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
715     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
716   } else {
717     z = y;
718   }
719 
720   v     = a->a;
721   xb = x;
722 
723   for (i=0; i<mbs; i++) {
724     n  = ai[1] - ai[0]; /* length of i_th block row of A */
725     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
726     ib = aj + *ai;
727     jmin = 0;
728     if (*ib == i){     /* (diag of A)*x */
729      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
730      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
731      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
732      v += 9; jmin++;
733     }
734     for (j=jmin; j<n; j++) {
735       /* (strict lower triangular part of A)*x  */
736       cval       = ib[j]*3;
737       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
738       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
739       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
740       /* (strict upper triangular part of A)*x  */
741       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
742       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
743       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
744       v  += 9;
745     }
746     xb +=3; ai++;
747   }
748 
749   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
750   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
751   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
752 
753   PetscLogFlops(18*(a->nz*2 - A->m));
754   PetscFunctionReturn(0);
755 }
756 
757 #undef __FUNCT__
758 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
759 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
760 {
761   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
762   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
763   MatScalar       *v;
764   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
765 
766   PetscFunctionBegin;
767   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
768   if (yy != xx) {
769     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
770   } else {
771     y = x;
772   }
773   if (zz != yy) {
774     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
775     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
776     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
777   } else {
778     z = y;
779   }
780 
781   v     = a->a;
782   xb = x;
783 
784   for (i=0; i<mbs; i++) {
785     n  = ai[1] - ai[0]; /* length of i_th block row of A */
786     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
787     ib = aj + *ai;
788     jmin = 0;
789     if (*ib == i){      /* (diag of A)*x */
790       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
791       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
792       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
793       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
794       v += 16; jmin++;
795     }
796     for (j=jmin; j<n; j++) {
797       /* (strict lower triangular part of A)*x  */
798       cval       = ib[j]*4;
799       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
800       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
801       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
802       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
803       /* (strict upper triangular part of A)*x  */
804       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
805       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
806       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
807       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
808       v  += 16;
809     }
810     xb +=4; ai++;
811   }
812 
813   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
814   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
815   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
816 
817   PetscLogFlops(32*(a->nz*2 - A->m));
818   PetscFunctionReturn(0);
819 }
820 
821 #undef __FUNCT__
822 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
823 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
824 {
825   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
826   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
827   MatScalar       *v;
828   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
829 
830   PetscFunctionBegin;
831   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
832   if (yy != xx) {
833     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
834   } else {
835     y = x;
836   }
837   if (zz != yy) {
838     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
839     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
840     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
841   } else {
842     z = y;
843   }
844 
845   v     = a->a;
846   xb = x;
847 
848   for (i=0; i<mbs; i++) {
849     n  = ai[1] - ai[0]; /* length of i_th block row of A */
850     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
851     ib = aj + *ai;
852     jmin = 0;
853     if (*ib == i){      /* (diag of A)*x */
854       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
855       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
856       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
857       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
858       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
859       v += 25; jmin++;
860     }
861     for (j=jmin; j<n; j++) {
862       /* (strict lower triangular part of A)*x  */
863       cval       = ib[j]*5;
864       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
865       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
866       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
867       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
868       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
869       /* (strict upper triangular part of A)*x  */
870       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
871       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
872       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
873       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
874       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
875       v  += 25;
876     }
877     xb +=5; ai++;
878   }
879 
880   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
881   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
882   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
883 
884   PetscLogFlops(50*(a->nz*2 - A->m));
885   PetscFunctionReturn(0);
886 }
887 #undef __FUNCT__
888 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
889 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
890 {
891   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
892   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
893   MatScalar       *v;
894   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
895 
896   PetscFunctionBegin;
897   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
898   if (yy != xx) {
899     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
900   } else {
901     y = x;
902   }
903   if (zz != yy) {
904     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
905     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
906     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
907   } else {
908     z = y;
909   }
910 
911   v     = a->a;
912   xb = x;
913 
914   for (i=0; i<mbs; i++) {
915     n  = ai[1] - ai[0]; /* length of i_th block row of A */
916     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
917     ib = aj + *ai;
918     jmin = 0;
919     if (*ib == i){     /* (diag of A)*x */
920       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
921       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
922       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
923       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
924       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
925       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
926       v += 36; jmin++;
927     }
928     for (j=jmin; j<n; j++) {
929       /* (strict lower triangular part of A)*x  */
930       cval       = ib[j]*6;
931       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
932       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
933       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
934       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
935       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
936       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
937       /* (strict upper triangular part of A)*x  */
938       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
939       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
940       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
941       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
942       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
943       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
944       v  += 36;
945     }
946     xb +=6; ai++;
947   }
948 
949   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
950   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
951   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
952 
953   PetscLogFlops(72*(a->nz*2 - A->m));
954   PetscFunctionReturn(0);
955 }
956 
957 #undef __FUNCT__
958 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
959 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
960 {
961   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
962   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
963   MatScalar       *v;
964   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
965 
966   PetscFunctionBegin;
967   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
968   if (yy != xx) {
969     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
970   } else {
971     y = x;
972   }
973   if (zz != yy) {
974     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
975     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
976     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
977   } else {
978     z = y;
979   }
980 
981   v     = a->a;
982   xb = x;
983 
984   for (i=0; i<mbs; i++) {
985     n  = ai[1] - ai[0]; /* length of i_th block row of A */
986     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
987     ib = aj + *ai;
988     jmin = 0;
989     if (*ib == i){     /* (diag of A)*x */
990       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
991       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
992       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
993       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
994       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
995       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
996       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
997       v += 49; jmin++;
998     }
999     for (j=jmin; j<n; j++) {
1000       /* (strict lower triangular part of A)*x  */
1001       cval       = ib[j]*7;
1002       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1003       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1004       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1005       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1006       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1007       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1008       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1009       /* (strict upper triangular part of A)*x  */
1010       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1011       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1012       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1013       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1014       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1015       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1016       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1017       v  += 49;
1018     }
1019     xb +=7; ai++;
1020   }
1021 
1022   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1023   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1024   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1025 
1026   PetscLogFlops(98*(a->nz*2 - A->m));
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 #undef __FUNCT__
1031 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1032 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1033 {
1034   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1035   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1036   MatScalar       *v;
1037   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1038   int             ncols,k;
1039 
1040   PetscFunctionBegin;
1041   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
1042   if (yy != xx) {
1043     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1044   } else {
1045     y = x;
1046   }
1047   if (zz != yy) {
1048     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1049     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
1050     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1051   } else {
1052     z = y;
1053   }
1054 
1055   aj   = a->j;
1056   v    = a->a;
1057   ii   = a->i;
1058 
1059   if (!a->mult_work) {
1060     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1061   }
1062   work = a->mult_work;
1063 
1064 
1065   for (i=0; i<mbs; i++) {
1066     n     = ii[1] - ii[0]; ncols = n*bs;
1067     workt = work; idx=aj+ii[0];
1068 
1069     /* upper triangular part */
1070     for (j=0; j<n; j++) {
1071       xb = x_ptr + bs*(*idx++);
1072       for (k=0; k<bs; k++) workt[k] = xb[k];
1073       workt += bs;
1074     }
1075     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1076     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1077 
1078     /* strict lower triangular part */
1079     idx = aj+ii[0];
1080     if (*idx == i){
1081       ncols -= bs; v += bs2; idx++; n--;
1082     }
1083     if (ncols > 0){
1084       workt = work;
1085       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1086       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1087       for (j=0; j<n; j++) {
1088         zb = z_ptr + bs*(*idx++);
1089         /* idx++; */
1090         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1091         workt += bs;
1092       }
1093     }
1094 
1095     x += bs; v += n*bs2; z += bs; ii++;
1096   }
1097 
1098   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1099   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1100   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1101 
1102   PetscLogFlops(2*(a->nz*2 - A->m));
1103   PetscFunctionReturn(0);
1104 }
1105 
1106 #undef __FUNCT__
1107 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1108 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1109 {
1110   int ierr;
1111 
1112   PetscFunctionBegin;
1113   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1114   PetscFunctionReturn(0);
1115 }
1116 
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1119 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1120 
1121 {
1122   int ierr;
1123 
1124   PetscFunctionBegin;
1125   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 #undef __FUNCT__
1130 #define __FUNCT__ "MatScale_SeqSBAIJ"
1131 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1132 {
1133   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1134   int         one = 1,totalnz = a->bs2*a->nz;
1135 
1136   PetscFunctionBegin;
1137   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1138   PetscLogFlops(totalnz);
1139   PetscFunctionReturn(0);
1140 }
1141 
1142 #undef __FUNCT__
1143 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1144 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1145 {
1146   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1147   MatScalar   *v = a->a;
1148   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1149   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1150   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1151 
1152   PetscFunctionBegin;
1153   if (type == NORM_FROBENIUS) {
1154     for (k=0; k<mbs; k++){
1155       jmin = a->i[k]; jmax = a->i[k+1];
1156       col  = aj + jmin;
1157       if (*col == k){         /* diagonal block */
1158         for (i=0; i<bs2; i++){
1159 #if defined(PETSC_USE_COMPLEX)
1160           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1161 #else
1162           sum_diag += (*v)*(*v); v++;
1163 #endif
1164         }
1165         jmin++;
1166       }
1167       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1168         for (i=0; i<bs2; i++){
1169 #if defined(PETSC_USE_COMPLEX)
1170           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1171 #else
1172           sum_off += (*v)*(*v); v++;
1173 #endif
1174         }
1175       }
1176     }
1177     *norm = sqrt(sum_diag + 2*sum_off);
1178 
1179   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1180     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1181     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1182     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1183     for (i=0; i<mbs; i++) {
1184       jl[i] = mbs; il[0] = 0;
1185     }
1186 
1187     *norm = 0.0;
1188     for (k=0; k<mbs; k++) { /* k_th block row */
1189       for (j=0; j<bs; j++) sum[j]=0.0;
1190 
1191       /*-- col sum --*/
1192       i = jl[k]; /* first |A(i,k)| to be added */
1193       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1194                   at step k */
1195       while (i<mbs){
1196         nexti = jl[i];  /* next block row to be added */
1197         ik    = il[i];  /* block index of A(i,k) in the array a */
1198         for (j=0; j<bs; j++){
1199           v = a->a + ik*bs2 + j*bs;
1200           for (k1=0; k1<bs; k1++) {
1201             sum[j] += PetscAbsScalar(*v); v++;
1202           }
1203         }
1204         /* update il, jl */
1205         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1206         jmax = a->i[i+1];
1207         if (jmin < jmax){
1208           il[i] = jmin;
1209           j   = a->j[jmin];
1210           jl[i] = jl[j]; jl[j]=i;
1211         }
1212         i = nexti;
1213       }
1214 
1215       /*-- row sum --*/
1216       jmin = a->i[k]; jmax = a->i[k+1];
1217       for (i=jmin; i<jmax; i++) {
1218         for (j=0; j<bs; j++){
1219           v = a->a + i*bs2 + j;
1220           for (k1=0; k1<bs; k1++){
1221             sum[j] += PetscAbsScalar(*v);
1222             v   += bs;
1223           }
1224         }
1225       }
1226       /* add k_th block row to il, jl */
1227       col = aj+jmin;
1228       if (*col == k) jmin++;
1229       if (jmin < jmax){
1230         il[k] = jmin;
1231         j   = a->j[jmin];
1232         jl[k] = jl[j]; jl[j] = k;
1233       }
1234       for (j=0; j<bs; j++){
1235         if (sum[j] > *norm) *norm = sum[j];
1236       }
1237     }
1238     ierr = PetscFree(il);CHKERRQ(ierr);
1239     ierr = PetscFree(jl);CHKERRQ(ierr);
1240     ierr = PetscFree(sum);CHKERRQ(ierr);
1241   } else {
1242     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1243   }
1244   PetscFunctionReturn(0);
1245 }
1246 
1247 #undef __FUNCT__
1248 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1249 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1250 {
1251   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1252   int          ierr;
1253 
1254   PetscFunctionBegin;
1255 
1256   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1257   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1258     *flg = PETSC_FALSE;
1259     PetscFunctionReturn(0);
1260   }
1261 
1262   /* if the a->i are the same */
1263   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1264   if (*flg == PETSC_FALSE) {
1265     PetscFunctionReturn(0);
1266   }
1267 
1268   /* if a->j are the same */
1269   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1270   if (*flg == PETSC_FALSE) {
1271     PetscFunctionReturn(0);
1272   }
1273   /* if a->a are the same */
1274   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1275 
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNCT__
1280 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1281 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1282 {
1283   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1284   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1285   PetscScalar  *x,zero = 0.0;
1286   MatScalar    *aa,*aa_j;
1287 
1288   PetscFunctionBegin;
1289   bs   = a->bs;
1290   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1291 
1292   aa   = a->a;
1293   ai   = a->i;
1294   aj   = a->j;
1295   ambs = a->mbs;
1296   bs2  = a->bs2;
1297 
1298   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1299   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1300   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1301   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1302   for (i=0; i<ambs; i++) {
1303     j=ai[i];
1304     if (aj[j] == i) {             /* if this is a diagonal element */
1305       row  = i*bs;
1306       aa_j = aa + j*bs2;
1307       if (A->factor && bs==1){
1308         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1309       } else {
1310         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1311       }
1312     }
1313   }
1314 
1315   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1316   PetscFunctionReturn(0);
1317 }
1318 
1319 #undef __FUNCT__
1320 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1321 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1322 {
1323   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1324   PetscScalar  *l,*r,x,*li,*ri;
1325   MatScalar    *aa,*v;
1326   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1327 
1328   PetscFunctionBegin;
1329   ai  = a->i;
1330   aj  = a->j;
1331   aa  = a->a;
1332   m   = A->m;
1333   bs  = a->bs;
1334   mbs = a->mbs;
1335   bs2 = a->bs2;
1336 
1337   if (ll != rr) {
1338     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1339   }
1340   if (ll) {
1341     ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr);
1342     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1343     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1344     for (i=0; i<mbs; i++) { /* for each block row */
1345       M  = ai[i+1] - ai[i];
1346       li = l + i*bs;
1347       v  = aa + bs2*ai[i];
1348       for (j=0; j<M; j++) { /* for each block */
1349         for (k=0; k<bs2; k++) {
1350           (*v++) *= li[k%bs];
1351         }
1352 #ifdef CONT
1353         /* will be used to replace the above loop */
1354         ri = l + bs*aj[ai[i]+j];
1355         for (k=0; k<bs; k++) { /* column value */
1356           x = ri[k];
1357           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1358         }
1359 #endif
1360 
1361       }
1362     }
1363     ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr);
1364     PetscLogFlops(2*a->nz);
1365   }
1366   /* will be deleted */
1367   if (rr) {
1368     ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr);
1369     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1370     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1371     for (i=0; i<mbs; i++) { /* for each block row */
1372       M  = ai[i+1] - ai[i];
1373       v  = aa + bs2*ai[i];
1374       for (j=0; j<M; j++) { /* for each block */
1375         ri = r + bs*aj[ai[i]+j];
1376         for (k=0; k<bs; k++) {
1377           x = ri[k];
1378           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1379         }
1380       }
1381     }
1382     ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr);
1383     PetscLogFlops(a->nz);
1384   }
1385   PetscFunctionReturn(0);
1386 }
1387 
1388 #undef __FUNCT__
1389 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1390 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1391 {
1392   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1393 
1394   PetscFunctionBegin;
1395   info->rows_global    = (double)A->m;
1396   info->columns_global = (double)A->m;
1397   info->rows_local     = (double)A->m;
1398   info->columns_local  = (double)A->m;
1399   info->block_size     = a->bs2;
1400   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1401   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1402   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1403   info->assemblies   = A->num_ass;
1404   info->mallocs      = a->reallocs;
1405   info->memory       = A->mem;
1406   if (A->factor) {
1407     info->fill_ratio_given  = A->info.fill_ratio_given;
1408     info->fill_ratio_needed = A->info.fill_ratio_needed;
1409     info->factor_mallocs    = A->info.factor_mallocs;
1410   } else {
1411     info->fill_ratio_given  = 0;
1412     info->fill_ratio_needed = 0;
1413     info->factor_mallocs    = 0;
1414   }
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 
1419 #undef __FUNCT__
1420 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1421 int MatZeroEntries_SeqSBAIJ(Mat A)
1422 {
1423   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1424   int         ierr;
1425 
1426   PetscFunctionBegin;
1427   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 #undef __FUNCT__
1432 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1433 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1434 {
1435   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1436   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1437   PetscReal    atmp;
1438   MatScalar    *aa;
1439   PetscScalar  zero = 0.0,*x;
1440   int          ncols,brow,bcol,krow,kcol;
1441 
1442   PetscFunctionBegin;
1443   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1444   bs   = a->bs;
1445   aa   = a->a;
1446   ai   = a->i;
1447   aj   = a->j;
1448   mbs = a->mbs;
1449 
1450   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1451   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1452   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1453   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1454   for (i=0; i<mbs; i++) {
1455     ncols = ai[1] - ai[0]; ai++;
1456     brow  = bs*i;
1457     for (j=0; j<ncols; j++){
1458       bcol = bs*(*aj);
1459       for (kcol=0; kcol<bs; kcol++){
1460         col = bcol + kcol;      /* col index */
1461         for (krow=0; krow<bs; krow++){
1462           atmp = PetscAbsScalar(*aa); aa++;
1463           row = brow + krow;    /* row index */
1464           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1465           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1466           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1467         }
1468       }
1469       aj++;
1470     }
1471   }
1472   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1473   PetscFunctionReturn(0);
1474 }
1475