xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 0c2fff18d3777e2004c38c700aa5f85e14cc2190)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/vec/vecimpl.h"
5 #include "src/inline/spops.h"
6 #include "src/inline/ilu.h"
7 #include "petscbt.h"
8 #include "src/mat/impls/sbaij/seq/sbaij.h"
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
12 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
13 {
14   PetscFunctionBegin;
15   SETERRQ(1,"Function not yet written for SBAIJ format");
16   /* PetscFunctionReturn(0); */
17 }
18 
19 #undef __FUNCT__
20 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
21 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
22 {
23   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
24   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
25   int          row,mat_i,*mat_j,tcol,*mat_ilen;
26   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
27   int          *aj = a->j,*ai = a->i;
28   MatScalar    *mat_a;
29   Mat          C;
30   PetscTruth   flag;
31 
32   PetscFunctionBegin;
33 
34   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
35   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
36   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
37 
38   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
39   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
40 
41   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
42   ssmap = smap;
43   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
44   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
45   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
46   /* determine lens of each row */
47   for (i=0; i<nrows; i++) {
48     kstart  = ai[irow[i]];
49     kend    = kstart + a->ilen[irow[i]];
50     lens[i] = 0;
51       for (k=kstart; k<kend; k++) {
52         if (ssmap[aj[k]]) {
53           lens[i]++;
54         }
55       }
56     }
57   /* Create and fill new matrix */
58   if (scall == MAT_REUSE_MATRIX) {
59     c = (Mat_SeqSBAIJ *)((*B)->data);
60 
61     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
62     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
63     if (flag == PETSC_FALSE) {
64       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
65     }
66     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
67     C = *B;
68   } else {
69     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
70   }
71   c = (Mat_SeqSBAIJ *)(C->data);
72   for (i=0; i<nrows; i++) {
73     row    = irow[i];
74     kstart = ai[row];
75     kend   = kstart + a->ilen[row];
76     mat_i  = c->i[i];
77     mat_j  = c->j + mat_i;
78     mat_a  = c->a + mat_i*bs2;
79     mat_ilen = c->ilen + i;
80     for (k=kstart; k<kend; k++) {
81       if ((tcol=ssmap[a->j[k]])) {
82         *mat_j++ = tcol - 1;
83         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
84         mat_a   += bs2;
85         (*mat_ilen)++;
86       }
87     }
88   }
89 
90   /* Free work space */
91   ierr = PetscFree(smap);CHKERRQ(ierr);
92   ierr = PetscFree(lens);CHKERRQ(ierr);
93   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
94   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95 
96   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
97   *B = C;
98   PetscFunctionReturn(0);
99 }
100 
101 #undef __FUNCT__
102 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
103 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
104 {
105   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
106   IS          is1;
107   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   /* Verify if the indices corespond to each element in a block
116    and form the IS with compressed IS */
117   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
118   iary = vary + a->mbs;
119   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
121 
122   count = 0;
123   for (i=0; i<a->mbs; i++) {
124     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
125     if (vary[i]==bs) iary[count++] = i;
126   }
127   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
128 
129   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
130   ierr = PetscFree(vary);CHKERRQ(ierr);
131 
132   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
133   ISDestroy(is1);
134   PetscFunctionReturn(0);
135 }
136 
137 #undef __FUNCT__
138 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
139 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
140 {
141   int ierr,i;
142 
143   PetscFunctionBegin;
144   if (scall == MAT_INITIAL_MATRIX) {
145     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
146   }
147 
148   for (i=0; i<n; i++) {
149     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
150   }
151   PetscFunctionReturn(0);
152 }
153 
154 /* -------------------------------------------------------*/
155 /* Should check that shapes of vectors and matrices match */
156 /* -------------------------------------------------------*/
157 #include "petscblaslapack.h"
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
161 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
162 {
163   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
164   PetscScalar     *x,*z,*xb,x1,zero=0.0;
165   MatScalar       *v;
166   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
167 
168   PetscFunctionBegin;
169   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
171   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
172 
173   v  = a->a;
174   xb = x;
175 
176   for (i=0; i<mbs; i++) {
177     n  = ai[1] - ai[0];  /* length of i_th row of A */
178     x1 = xb[0];
179     ib = aj + *ai;
180     jmin = 0;
181     if (*ib == i) {      /* (diag of A)*x */
182       z[i] += *v++ * x[*ib++];
183       jmin++;
184     }
185     for (j=jmin; j<n; j++) {
186       cval    = *ib;
187       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
188       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
189     }
190     xb++; ai++;
191   }
192 
193   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
194   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
195   PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m);  /* s_nz = (nz+m)/2 */
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
201 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
202 {
203   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
204   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
205   MatScalar        *v;
206   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
207 
208 
209   PetscFunctionBegin;
210   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
211   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
212   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
213 
214   v     = a->a;
215   xb = x;
216 
217   for (i=0; i<mbs; i++) {
218     n  = ai[1] - ai[0]; /* length of i_th block row of A */
219     x1 = xb[0]; x2 = xb[1];
220     ib = aj + *ai;
221     jmin = 0;
222     if (*ib == i){     /* (diag of A)*x */
223       z[2*i]   += v[0]*x1 + v[2]*x2;
224       z[2*i+1] += v[2]*x1 + v[3]*x2;
225       v += 4; jmin++;
226     }
227     for (j=jmin; j<n; j++) {
228       /* (strict lower triangular part of A)*x  */
229       cval       = ib[j]*2;
230       z[cval]     += v[0]*x1 + v[1]*x2;
231       z[cval+1]   += v[2]*x1 + v[3]*x2;
232       /* (strict upper triangular part of A)*x  */
233       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
234       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
235       v  += 4;
236     }
237     xb +=2; ai++;
238   }
239 
240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
241   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
242   PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
248 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
249 {
250   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
251   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
252   MatScalar     *v;
253   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
254 
255 
256   PetscFunctionBegin;
257   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
258   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
259   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
260 
261   v     = a->a;
262   xb = x;
263 
264   for (i=0; i<mbs; i++) {
265     n  = ai[1] - ai[0]; /* length of i_th block row of A */
266     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
267     ib = aj + *ai;
268     jmin = 0;
269     if (*ib == i){     /* (diag of A)*x */
270       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
271       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
272       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
273       v += 9; jmin++;
274     }
275     for (j=jmin; j<n; j++) {
276       /* (strict lower triangular part of A)*x  */
277       cval       = ib[j]*3;
278       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
279       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
280       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
281       /* (strict upper triangular part of A)*x  */
282       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
283       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
284       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
285       v  += 9;
286     }
287     xb +=3; ai++;
288   }
289 
290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
292   PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
293   PetscFunctionReturn(0);
294 }
295 
296 #undef __FUNCT__
297 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
298 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
299 {
300   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
301   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
302   MatScalar        *v;
303   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
304 
305   PetscFunctionBegin;
306   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
307   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
309 
310   v     = a->a;
311   xb = x;
312 
313   for (i=0; i<mbs; i++) {
314     n  = ai[1] - ai[0]; /* length of i_th block row of A */
315     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
316     ib = aj + *ai;
317     jmin = 0;
318     if (*ib == i){     /* (diag of A)*x */
319       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
320       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
321       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
322       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
323       v += 16; jmin++;
324     }
325     for (j=jmin; j<n; j++) {
326       /* (strict lower triangular part of A)*x  */
327       cval       = ib[j]*4;
328       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
329       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
330       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
331       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
332       /* (strict upper triangular part of A)*x  */
333       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
334       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
335       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
336       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
337       v  += 16;
338     }
339     xb +=4; ai++;
340   }
341 
342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
350 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
351 {
352   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
353   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
354   MatScalar       *v;
355   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
356 
357   PetscFunctionBegin;
358   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
361 
362   v     = a->a;
363   xb = x;
364 
365   for (i=0; i<mbs; i++) {
366     n  = ai[1] - ai[0]; /* length of i_th block row of A */
367     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
368     ib = aj + *ai;
369     jmin = 0;
370     if (*ib == i){      /* (diag of A)*x */
371       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
372       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
373       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
374       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
375       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
376       v += 25; jmin++;
377     }
378     for (j=jmin; j<n; j++) {
379       /* (strict lower triangular part of A)*x  */
380       cval       = ib[j]*5;
381       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
382       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
383       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
384       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
385       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
386       /* (strict upper triangular part of A)*x  */
387       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
388       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
389       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
390       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
391       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
392       v  += 25;
393     }
394     xb +=5; ai++;
395   }
396 
397   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
398   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
399   PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
400   PetscFunctionReturn(0);
401 }
402 
403 
404 #undef __FUNCT__
405 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
406 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
407 {
408   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
409   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
410   MatScalar       *v;
411   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
412 
413   PetscFunctionBegin;
414   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
415   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
417 
418   v     = a->a;
419   xb = x;
420 
421   for (i=0; i<mbs; i++) {
422     n  = ai[1] - ai[0]; /* length of i_th block row of A */
423     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
424     ib = aj + *ai;
425     jmin = 0;
426     if (*ib == i){      /* (diag of A)*x */
427       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
428       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
429       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
430       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
431       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
432       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
433       v += 36; jmin++;
434     }
435     for (j=jmin; j<n; j++) {
436       /* (strict lower triangular part of A)*x  */
437       cval       = ib[j]*6;
438       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
439       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
440       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
441       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
442       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
443       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
444       /* (strict upper triangular part of A)*x  */
445       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
446       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
447       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
448       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
449       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
450       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
451       v  += 36;
452     }
453     xb +=6; ai++;
454   }
455 
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
458   PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
459   PetscFunctionReturn(0);
460 }
461 #undef __FUNCT__
462 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
463 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
464 {
465   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
466   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
467   MatScalar       *v;
468   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
469 
470   PetscFunctionBegin;
471   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
472   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
473   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
474 
475   v     = a->a;
476   xb = x;
477 
478   for (i=0; i<mbs; i++) {
479     n  = ai[1] - ai[0]; /* length of i_th block row of A */
480     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
481     ib = aj + *ai;
482     jmin = 0;
483     if (*ib == i){      /* (diag of A)*x */
484       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
485       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
486       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
487       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
488       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
489       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
490       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
491       v += 49; jmin++;
492     }
493     for (j=jmin; j<n; j++) {
494       /* (strict lower triangular part of A)*x  */
495       cval       = ib[j]*7;
496       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503       /* (strict upper triangular part of A)*x  */
504       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511       v  += 49;
512     }
513     xb +=7; ai++;
514   }
515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
516   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
517   PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
518   PetscFunctionReturn(0);
519 }
520 
521 /*
522     This will not work with MatScalar == float because it calls the BLAS
523 */
524 #undef __FUNCT__
525 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
526 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527 {
528   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
529   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
530   MatScalar       *v;
531   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
532   int             ncols,k;
533 
534   PetscFunctionBegin;
535   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
536   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
537   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
538 
539   aj   = a->j;
540   v    = a->a;
541   ii   = a->i;
542 
543   if (!a->mult_work) {
544     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
545   }
546   work = a->mult_work;
547 
548   for (i=0; i<mbs; i++) {
549     n     = ii[1] - ii[0]; ncols = n*bs;
550     workt = work; idx=aj+ii[0];
551 
552     /* upper triangular part */
553     for (j=0; j<n; j++) {
554       xb = x_ptr + bs*(*idx++);
555       for (k=0; k<bs; k++) workt[k] = xb[k];
556       workt += bs;
557     }
558     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
559     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
560 
561     /* strict lower triangular part */
562     idx = aj+ii[0];
563     if (*idx == i){
564       ncols -= bs; v += bs2; idx++; n--;
565     }
566 
567     if (ncols > 0){
568       workt = work;
569       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
570       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
571       for (j=0; j<n; j++) {
572         zb = z_ptr + bs*(*idx++);
573         for (k=0; k<bs; k++) zb[k] += workt[k] ;
574         workt += bs;
575       }
576     }
577     x += bs; v += n*bs2; z += bs; ii++;
578   }
579 
580   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
581   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
582   PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
583   PetscFunctionReturn(0);
584 }
585 
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
588 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
589 {
590   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
591   PetscScalar     *x,*y,*z,*xb,x1;
592   MatScalar       *v;
593   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
594 
595   PetscFunctionBegin;
596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
597   if (yy != xx) {
598     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   } else {
600     y = x;
601   }
602   if (zz != yy) {
603     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608 
609   v  = a->a;
610   xb = x;
611 
612   for (i=0; i<mbs; i++) {
613     n  = ai[1] - ai[0];  /* length of i_th row of A */
614     x1 = xb[0];
615     ib = aj + *ai;
616     jmin = 0;
617     if (*ib == i) {            /* (diag of A)*x */
618       z[i] += *v++ * x[*ib++]; jmin++;
619     }
620     for (j=jmin; j<n; j++) {
621       cval    = *ib;
622       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
623       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
624     }
625     xb++; ai++;
626   }
627 
628   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
629   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
630   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
631 
632   PetscLogFlops(2*(a->s_nz*2 - A->m));
633   PetscFunctionReturn(0);
634 }
635 
636 #undef __FUNCT__
637 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
638 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
639 {
640   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
641   PetscScalar     *x,*y,*z,*xb,x1,x2;
642   MatScalar       *v;
643   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
644 
645   PetscFunctionBegin;
646   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
647   if (yy != xx) {
648     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
649   } else {
650     y = x;
651   }
652   if (zz != yy) {
653     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
654     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
655   } else {
656     z = y;
657   }
658 
659   v     = a->a;
660   xb = x;
661 
662   for (i=0; i<mbs; i++) {
663     n  = ai[1] - ai[0]; /* length of i_th block row of A */
664     x1 = xb[0]; x2 = xb[1];
665     ib = aj + *ai;
666     jmin = 0;
667     if (*ib == i){      /* (diag of A)*x */
668       z[2*i]   += v[0]*x1 + v[2]*x2;
669       z[2*i+1] += v[2]*x1 + v[3]*x2;
670       v += 4; jmin++;
671     }
672     for (j=jmin; j<n; j++) {
673       /* (strict lower triangular part of A)*x  */
674       cval       = ib[j]*2;
675       z[cval]     += v[0]*x1 + v[1]*x2;
676       z[cval+1]   += v[2]*x1 + v[3]*x2;
677       /* (strict upper triangular part of A)*x  */
678       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
679       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
680       v  += 4;
681     }
682     xb +=2; ai++;
683   }
684 
685   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
686   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
687   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
688 
689   PetscLogFlops(4*(a->s_nz*2 - A->m));
690   PetscFunctionReturn(0);
691 }
692 
693 #undef __FUNCT__
694 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
695 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
696 {
697   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
698   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
699   MatScalar       *v;
700   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
701 
702   PetscFunctionBegin;
703   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
704   if (yy != xx) {
705     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
706   } else {
707     y = x;
708   }
709   if (zz != yy) {
710     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
711     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
712   } else {
713     z = y;
714   }
715 
716   v     = a->a;
717   xb = x;
718 
719   for (i=0; i<mbs; i++) {
720     n  = ai[1] - ai[0]; /* length of i_th block row of A */
721     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
722     ib = aj + *ai;
723     jmin = 0;
724     if (*ib == i){     /* (diag of A)*x */
725      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
726      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
727      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
728      v += 9; jmin++;
729     }
730     for (j=jmin; j<n; j++) {
731       /* (strict lower triangular part of A)*x  */
732       cval       = ib[j]*3;
733       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
734       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
735       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
736       /* (strict upper triangular part of A)*x  */
737       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
738       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
739       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
740       v  += 9;
741     }
742     xb +=3; ai++;
743   }
744 
745   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
746   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
747   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
748 
749   PetscLogFlops(18*(a->s_nz*2 - A->m));
750   PetscFunctionReturn(0);
751 }
752 
753 #undef __FUNCT__
754 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
755 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
756 {
757   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
758   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
759   MatScalar       *v;
760   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
761 
762   PetscFunctionBegin;
763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
764   if (yy != xx) {
765     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
766   } else {
767     y = x;
768   }
769   if (zz != yy) {
770     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
771     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
772   } else {
773     z = y;
774   }
775 
776   v     = a->a;
777   xb = x;
778 
779   for (i=0; i<mbs; i++) {
780     n  = ai[1] - ai[0]; /* length of i_th block row of A */
781     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
782     ib = aj + *ai;
783     jmin = 0;
784     if (*ib == i){      /* (diag of A)*x */
785       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
786       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
787       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
788       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
789       v += 16; jmin++;
790     }
791     for (j=jmin; j<n; j++) {
792       /* (strict lower triangular part of A)*x  */
793       cval       = ib[j]*4;
794       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
795       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
796       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
797       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
798       /* (strict upper triangular part of A)*x  */
799       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
800       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
801       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
802       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
803       v  += 16;
804     }
805     xb +=4; ai++;
806   }
807 
808   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
809   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
810   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
811 
812   PetscLogFlops(32*(a->s_nz*2 - A->m));
813   PetscFunctionReturn(0);
814 }
815 
816 #undef __FUNCT__
817 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
818 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
819 {
820   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
821   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
822   MatScalar       *v;
823   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
824 
825   PetscFunctionBegin;
826   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
827   if (yy != xx) {
828     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
829   } else {
830     y = x;
831   }
832   if (zz != yy) {
833     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
834     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
835   } else {
836     z = y;
837   }
838 
839   v     = a->a;
840   xb = x;
841 
842   for (i=0; i<mbs; i++) {
843     n  = ai[1] - ai[0]; /* length of i_th block row of A */
844     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
845     ib = aj + *ai;
846     jmin = 0;
847     if (*ib == i){      /* (diag of A)*x */
848       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
849       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
850       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
851       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
852       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
853       v += 25; jmin++;
854     }
855     for (j=jmin; j<n; j++) {
856       /* (strict lower triangular part of A)*x  */
857       cval       = ib[j]*5;
858       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
859       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
860       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
861       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
862       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
863       /* (strict upper triangular part of A)*x  */
864       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
865       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
866       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
867       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
868       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
869       v  += 25;
870     }
871     xb +=5; ai++;
872   }
873 
874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
875   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
876   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
877 
878   PetscLogFlops(50*(a->s_nz*2 - A->m));
879   PetscFunctionReturn(0);
880 }
881 #undef __FUNCT__
882 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
883 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
884 {
885   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
886   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
887   MatScalar       *v;
888   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
889 
890   PetscFunctionBegin;
891   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
892   if (yy != xx) {
893     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
894   } else {
895     y = x;
896   }
897   if (zz != yy) {
898     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
899     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
900   } else {
901     z = y;
902   }
903 
904   v     = a->a;
905   xb = x;
906 
907   for (i=0; i<mbs; i++) {
908     n  = ai[1] - ai[0]; /* length of i_th block row of A */
909     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
910     ib = aj + *ai;
911     jmin = 0;
912     if (*ib == i){     /* (diag of A)*x */
913       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
914       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
915       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
916       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
917       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
918       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
919       v += 36; jmin++;
920     }
921     for (j=jmin; j<n; j++) {
922       /* (strict lower triangular part of A)*x  */
923       cval       = ib[j]*6;
924       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
925       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
926       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
927       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
928       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
929       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
930       /* (strict upper triangular part of A)*x  */
931       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
932       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
933       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
934       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
935       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
936       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
937       v  += 36;
938     }
939     xb +=6; ai++;
940   }
941 
942   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
943   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
944   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
945 
946   PetscLogFlops(72*(a->s_nz*2 - A->m));
947   PetscFunctionReturn(0);
948 }
949 
950 #undef __FUNCT__
951 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
952 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
953 {
954   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
955   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
956   MatScalar       *v;
957   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
958 
959   PetscFunctionBegin;
960   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
961   if (yy != xx) {
962     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
963   } else {
964     y = x;
965   }
966   if (zz != yy) {
967     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
968     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
969   } else {
970     z = y;
971   }
972 
973   v     = a->a;
974   xb = x;
975 
976   for (i=0; i<mbs; i++) {
977     n  = ai[1] - ai[0]; /* length of i_th block row of A */
978     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
979     ib = aj + *ai;
980     jmin = 0;
981     if (*ib == i){     /* (diag of A)*x */
982       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
983       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
984       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
985       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
986       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
987       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
988       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
989       v += 49; jmin++;
990     }
991     for (j=jmin; j<n; j++) {
992       /* (strict lower triangular part of A)*x  */
993       cval       = ib[j]*7;
994       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
995       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
996       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
997       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
998       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
999       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1000       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1001       /* (strict upper triangular part of A)*x  */
1002       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1003       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1004       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1005       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1006       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1007       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1008       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1009       v  += 49;
1010     }
1011     xb +=7; ai++;
1012   }
1013 
1014   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1015   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1016   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1017 
1018   PetscLogFlops(98*(a->s_nz*2 - A->m));
1019   PetscFunctionReturn(0);
1020 }
1021 
1022 #undef __FUNCT__
1023 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1024 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1025 {
1026   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1027   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1028   MatScalar       *v;
1029   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1030   int             ncols,k;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1034   if (yy != xx) {
1035     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1036   } else {
1037     y = x;
1038   }
1039   if (zz != yy) {
1040     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1041     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1042   } else {
1043     z = y;
1044   }
1045 
1046   aj   = a->j;
1047   v    = a->a;
1048   ii   = a->i;
1049 
1050   if (!a->mult_work) {
1051     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1052   }
1053   work = a->mult_work;
1054 
1055 
1056   for (i=0; i<mbs; i++) {
1057     n     = ii[1] - ii[0]; ncols = n*bs;
1058     workt = work; idx=aj+ii[0];
1059 
1060     /* upper triangular part */
1061     for (j=0; j<n; j++) {
1062       xb = x_ptr + bs*(*idx++);
1063       for (k=0; k<bs; k++) workt[k] = xb[k];
1064       workt += bs;
1065     }
1066     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1067     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1068 
1069     /* strict lower triangular part */
1070     idx = aj+ii[0];
1071     if (*idx == i){
1072       ncols -= bs; v += bs2; idx++; n--;
1073     }
1074     if (ncols > 0){
1075       workt = work;
1076       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1077       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1078       for (j=0; j<n; j++) {
1079         zb = z_ptr + bs*(*idx++);
1080         /* idx++; */
1081         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1082         workt += bs;
1083       }
1084     }
1085 
1086     x += bs; v += n*bs2; z += bs; ii++;
1087   }
1088 
1089   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1090   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1091   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1092 
1093   PetscLogFlops(2*(a->s_nz*2 - A->m));
1094   PetscFunctionReturn(0);
1095 }
1096 
1097 #undef __FUNCT__
1098 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1099 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1100 {
1101   PetscFunctionBegin;
1102   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1103   /* PetscFunctionReturn(0); */
1104 }
1105 
1106 #undef __FUNCT__
1107 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1108 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1109 
1110 {
1111   PetscFunctionBegin;
1112   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1113   /* PetscFunctionReturn(0); */
1114 }
1115 
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatScale_SeqSBAIJ"
1118 int MatScale_SeqSBAIJ(PetscScalar *alpha,Mat inA)
1119 {
1120   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1121   int         one = 1,totalnz = a->bs2*a->s_nz;
1122 
1123   PetscFunctionBegin;
1124   BLscal_(&totalnz,alpha,a->a,&one);
1125   PetscLogFlops(totalnz);
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 #undef __FUNCT__
1130 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1131 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1132 {
1133   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1134   MatScalar   *v = a->a;
1135   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1136   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1137   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1138 
1139   PetscFunctionBegin;
1140   if (type == NORM_FROBENIUS) {
1141     for (k=0; k<mbs; k++){
1142       jmin = a->i[k]; jmax = a->i[k+1];
1143       col  = aj + jmin;
1144       if (*col == k){         /* diagonal block */
1145         for (i=0; i<bs2; i++){
1146 #if defined(PETSC_USE_COMPLEX)
1147           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1148 #else
1149           sum_diag += (*v)*(*v); v++;
1150 #endif
1151         }
1152         jmin++;
1153       }
1154       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1155         for (i=0; i<bs2; i++){
1156 #if defined(PETSC_USE_COMPLEX)
1157           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1158 #else
1159           sum_off += (*v)*(*v); v++;
1160 #endif
1161         }
1162       }
1163     }
1164     *norm = sqrt(sum_diag + 2*sum_off);
1165 
1166   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1167     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1168     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1169     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1170     for (i=0; i<mbs; i++) {
1171       jl[i] = mbs; il[0] = 0;
1172     }
1173 
1174     *norm = 0.0;
1175     for (k=0; k<mbs; k++) { /* k_th block row */
1176       for (j=0; j<bs; j++) sum[j]=0.0;
1177 
1178       /*-- col sum --*/
1179       i = jl[k]; /* first |A(i,k)| to be added */
1180       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1181                   at step k */
1182       while (i<mbs){
1183         nexti = jl[i];  /* next block row to be added */
1184         ik    = il[i];  /* block index of A(i,k) in the array a */
1185         for (j=0; j<bs; j++){
1186           v = a->a + ik*bs2 + j*bs;
1187           for (k1=0; k1<bs; k1++) {
1188             sum[j] += PetscAbsScalar(*v); v++;
1189           }
1190         }
1191         /* update il, jl */
1192         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1193         jmax = a->i[i+1];
1194         if (jmin < jmax){
1195           il[i] = jmin;
1196           j   = a->j[jmin];
1197           jl[i] = jl[j]; jl[j]=i;
1198         }
1199         i = nexti;
1200       }
1201 
1202       /*-- row sum --*/
1203       jmin = a->i[k]; jmax = a->i[k+1];
1204       for (i=jmin; i<jmax; i++) {
1205         for (j=0; j<bs; j++){
1206           v = a->a + i*bs2 + j;
1207           for (k1=0; k1<bs; k1++){
1208             sum[j] += PetscAbsScalar(*v);
1209             v   += bs;
1210           }
1211         }
1212       }
1213       /* add k_th block row to il, jl */
1214       col = aj+jmin;
1215       if (*col == k) jmin++;
1216       if (jmin < jmax){
1217         il[k] = jmin;
1218         j   = a->j[jmin];
1219         jl[k] = jl[j]; jl[j] = k;
1220       }
1221       for (j=0; j<bs; j++){
1222         if (sum[j] > *norm) *norm = sum[j];
1223       }
1224     }
1225     ierr = PetscFree(il);CHKERRQ(ierr);
1226     ierr = PetscFree(jl);CHKERRQ(ierr);
1227     ierr = PetscFree(sum);CHKERRQ(ierr);
1228   } else {
1229     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1230   }
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 #undef __FUNCT__
1235 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1236 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1237 {
1238   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1239   int          ierr;
1240   PetscTruth   flag;
1241 
1242   PetscFunctionBegin;
1243   ierr = PetscTypeCompare((PetscObject)A,MATSEQBAIJ,&flag);CHKERRQ(ierr);
1244   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1245 
1246   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1247   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1248     *flg = PETSC_FALSE;
1249     PetscFunctionReturn(0);
1250   }
1251 
1252   /* if the a->i are the same */
1253   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1254   if (*flg == PETSC_FALSE) {
1255     PetscFunctionReturn(0);
1256   }
1257 
1258   /* if a->j are the same */
1259   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1260   if (*flg == PETSC_FALSE) {
1261     PetscFunctionReturn(0);
1262   }
1263   /* if a->a are the same */
1264   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 
1267 }
1268 
1269 #undef __FUNCT__
1270 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1271 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1272 {
1273   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1274   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1275   PetscScalar  *x,zero = 0.0;
1276   MatScalar    *aa,*aa_j;
1277 
1278   PetscFunctionBegin;
1279   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1280   bs   = a->bs;
1281   aa   = a->a;
1282   ai   = a->i;
1283   aj   = a->j;
1284   ambs = a->mbs;
1285   bs2  = a->bs2;
1286 
1287   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1288   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1289   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1290   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1291   for (i=0; i<ambs; i++) {
1292     j=ai[i];
1293     if (aj[j] == i) {             /* if this is a diagonal element */
1294       row  = i*bs;
1295       aa_j = aa + j*bs2;
1296       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1297     }
1298   }
1299   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 #undef __FUNCT__
1304 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1305 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1306 {
1307   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1308   PetscScalar  *l,*r,x,*li,*ri;
1309   MatScalar    *aa,*v;
1310   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1311 
1312   PetscFunctionBegin;
1313   ai  = a->i;
1314   aj  = a->j;
1315   aa  = a->a;
1316   m   = A->m;
1317   bs  = a->bs;
1318   mbs = a->mbs;
1319   bs2 = a->bs2;
1320 
1321   if (ll != rr) {
1322     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1323   }
1324   if (ll) {
1325     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1326     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1327     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1328     for (i=0; i<mbs; i++) { /* for each block row */
1329       M  = ai[i+1] - ai[i];
1330       li = l + i*bs;
1331       v  = aa + bs2*ai[i];
1332       for (j=0; j<M; j++) { /* for each block */
1333         for (k=0; k<bs2; k++) {
1334           (*v++) *= li[k%bs];
1335         }
1336 #ifdef CONT
1337         /* will be used to replace the above loop */
1338         ri = l + bs*aj[ai[i]+j];
1339         for (k=0; k<bs; k++) { /* column value */
1340           x = ri[k];
1341           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1342         }
1343 #endif
1344 
1345       }
1346     }
1347     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1348     PetscLogFlops(2*a->s_nz);
1349   }
1350   /* will be deleted */
1351   if (rr) {
1352     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1353     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1354     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1355     for (i=0; i<mbs; i++) { /* for each block row */
1356       M  = ai[i+1] - ai[i];
1357       v  = aa + bs2*ai[i];
1358       for (j=0; j<M; j++) { /* for each block */
1359         ri = r + bs*aj[ai[i]+j];
1360         for (k=0; k<bs; k++) {
1361           x = ri[k];
1362           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1363         }
1364       }
1365     }
1366     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1367     PetscLogFlops(a->s_nz);
1368   }
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 #undef __FUNCT__
1373 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1374 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1375 {
1376   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1377 
1378   PetscFunctionBegin;
1379   info->rows_global    = (double)A->m;
1380   info->columns_global = (double)A->m;
1381   info->rows_local     = (double)A->m;
1382   info->columns_local  = (double)A->m;
1383   info->block_size     = a->bs2;
1384   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1385   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1386   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1387   info->assemblies   = A->num_ass;
1388   info->mallocs      = a->reallocs;
1389   info->memory       = A->mem;
1390   if (A->factor) {
1391     info->fill_ratio_given  = A->info.fill_ratio_given;
1392     info->fill_ratio_needed = A->info.fill_ratio_needed;
1393     info->factor_mallocs    = A->info.factor_mallocs;
1394   } else {
1395     info->fill_ratio_given  = 0;
1396     info->fill_ratio_needed = 0;
1397     info->factor_mallocs    = 0;
1398   }
1399   PetscFunctionReturn(0);
1400 }
1401 
1402 
1403 #undef __FUNCT__
1404 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1405 int MatZeroEntries_SeqSBAIJ(Mat A)
1406 {
1407   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1408   int         ierr;
1409 
1410   PetscFunctionBegin;
1411   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 #undef __FUNCT__
1416 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1417 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1418 {
1419   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1420   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1421   PetscReal    atmp;
1422   MatScalar    *aa;
1423   PetscScalar  zero = 0.0,*x;
1424   int          ncols,brow,bcol,krow,kcol;
1425 
1426   PetscFunctionBegin;
1427   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1428   bs   = a->bs;
1429   aa   = a->a;
1430   ai   = a->i;
1431   aj   = a->j;
1432   mbs = a->mbs;
1433 
1434   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1435   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1436   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1437   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1438   for (i=0; i<mbs; i++) {
1439     ncols = ai[1] - ai[0]; ai++;
1440     brow  = bs*i;
1441     for (j=0; j<ncols; j++){
1442       bcol = bs*(*aj);
1443       for (kcol=0; kcol<bs; kcol++){
1444         col = bcol + kcol;      /* col index */
1445         for (krow=0; krow<bs; krow++){
1446           atmp = PetscAbsScalar(*aa); aa++;
1447           row = brow + krow;    /* row index */
1448           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1449           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1450           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1451         }
1452       }
1453       aj++;
1454     }
1455   }
1456   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1457   PetscFunctionReturn(0);
1458 }
1459