xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 5968eb51c36a0eee2a17bcd341216da8f7bdf0eb)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,bcol_max,
15                start,end,*ai,*aj,bs,*nidx2;
16   PetscBT      table;
17   PetscBT      table0;
18 
19   PetscFunctionBegin;
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = a->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
30   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) { /* for each is */
33     isz  = 0;
34     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
41     bcol_max = 0;
42     for (j=0; j<n ; ++j){
43       brow = idx[j]/bs; /* convert the indices into block indices */
44       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if(!PetscBTLookupSet(table,brow)) {
46         nidx[isz++] = brow;
47         if (bcol_max < brow) bcol_max = brow;
48       }
49     }
50     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
52 
53     k = 0;
54     for (j=0; j<ov; j++){ /* for each overlap */
55       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
56       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
57       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
58 
59       n = isz;  /* length of the updated is[i] */
60       for (brow=0; brow<mbs; brow++){
61         start = ai[brow]; end   = ai[brow+1];
62         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
63           for (l = start; l<end ; l++){
64             bcol = aj[l];
65             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end ; l++){
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table0,bcol)){
74               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++)
85         nidx2[j*bs+k] = nidx[j]*bs+k;
86     }
87     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
88   }
89   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
90   ierr = PetscFree(nidx);CHKERRQ(ierr);
91   ierr = PetscFree(nidx2);CHKERRQ(ierr);
92   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
93   PetscFunctionReturn(0);
94 }
95 
96 #undef __FUNCT__
97 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
98 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
99 {
100   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
101   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
102   int          row,mat_i,*mat_j,tcol,*mat_ilen;
103   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
104   int          *aj = a->j,*ai = a->i;
105   MatScalar    *mat_a;
106   Mat          C;
107   PetscTruth   flag;
108 
109   PetscFunctionBegin;
110 
111   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
112   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
113   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
114 
115   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
116   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
117 
118   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
119   ssmap = smap;
120   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
121   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
122   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
123   /* determine lens of each row */
124   for (i=0; i<nrows; i++) {
125     kstart  = ai[irow[i]];
126     kend    = kstart + a->ilen[irow[i]];
127     lens[i] = 0;
128       for (k=kstart; k<kend; k++) {
129         if (ssmap[aj[k]]) {
130           lens[i]++;
131         }
132       }
133     }
134   /* Create and fill new matrix */
135   if (scall == MAT_REUSE_MATRIX) {
136     c = (Mat_SeqSBAIJ *)((*B)->data);
137 
138     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
139     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
140     if (flag == PETSC_FALSE) {
141       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
142     }
143     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
144     C = *B;
145   } else {
146     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
147     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
148     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
149   }
150   c = (Mat_SeqSBAIJ *)(C->data);
151   for (i=0; i<nrows; i++) {
152     row    = irow[i];
153     kstart = ai[row];
154     kend   = kstart + a->ilen[row];
155     mat_i  = c->i[i];
156     mat_j  = c->j + mat_i;
157     mat_a  = c->a + mat_i*bs2;
158     mat_ilen = c->ilen + i;
159     for (k=kstart; k<kend; k++) {
160       if ((tcol=ssmap[a->j[k]])) {
161         *mat_j++ = tcol - 1;
162         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
163         mat_a   += bs2;
164         (*mat_ilen)++;
165       }
166     }
167   }
168 
169   /* Free work space */
170   ierr = PetscFree(smap);CHKERRQ(ierr);
171   ierr = PetscFree(lens);CHKERRQ(ierr);
172   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
174 
175   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
176   *B = C;
177   PetscFunctionReturn(0);
178 }
179 
180 #undef __FUNCT__
181 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
182 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
183 {
184   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
185   IS          is1;
186   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
197   iary = vary + a->mbs;
198   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207 
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = PetscFree(vary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
212   ISDestroy(is1);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   int ierr,i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 #include "petscblaslapack.h"
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
240 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar     *x,*z,*xb,x1,zero=0.0;
244   MatScalar       *v;
245   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
246 
247   PetscFunctionBegin;
248   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
249   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
251 
252   v  = a->a;
253   xb = x;
254 
255   for (i=0; i<mbs; i++) {
256     n  = ai[1] - ai[0];  /* length of i_th row of A */
257     x1 = xb[0];
258     ib = aj + *ai;
259     jmin = 0;
260     if (*ib == i) {      /* (diag of A)*x */
261       z[i] += *v++ * x[*ib++];
262       jmin++;
263     }
264     for (j=jmin; j<n; j++) {
265       cval    = *ib;
266       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
267       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
268     }
269     xb++; ai++;
270   }
271 
272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
274   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
275   PetscFunctionReturn(0);
276 }
277 
278 #undef __FUNCT__
279 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
280 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
283   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
284   MatScalar        *v;
285   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
286 
287 
288   PetscFunctionBegin;
289   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
292 
293   v     = a->a;
294   xb = x;
295 
296   for (i=0; i<mbs; i++) {
297     n  = ai[1] - ai[0]; /* length of i_th block row of A */
298     x1 = xb[0]; x2 = xb[1];
299     ib = aj + *ai;
300     jmin = 0;
301     if (*ib == i){     /* (diag of A)*x */
302       z[2*i]   += v[0]*x1 + v[2]*x2;
303       z[2*i+1] += v[2]*x1 + v[3]*x2;
304       v += 4; jmin++;
305     }
306     for (j=jmin; j<n; j++) {
307       /* (strict lower triangular part of A)*x  */
308       cval       = ib[j]*2;
309       z[cval]     += v[0]*x1 + v[1]*x2;
310       z[cval+1]   += v[2]*x1 + v[3]*x2;
311       /* (strict upper triangular part of A)*x  */
312       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
313       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
314       v  += 4;
315     }
316     xb +=2; ai++;
317   }
318 
319   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
322   PetscFunctionReturn(0);
323 }
324 
325 #undef __FUNCT__
326 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
327 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
328 {
329   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
330   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
331   MatScalar     *v;
332   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
333 
334 
335   PetscFunctionBegin;
336   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
337   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
339 
340   v     = a->a;
341   xb = x;
342 
343   for (i=0; i<mbs; i++) {
344     n  = ai[1] - ai[0]; /* length of i_th block row of A */
345     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
346     ib = aj + *ai;
347     jmin = 0;
348     if (*ib == i){     /* (diag of A)*x */
349       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
350       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
351       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
352       v += 9; jmin++;
353     }
354     for (j=jmin; j<n; j++) {
355       /* (strict lower triangular part of A)*x  */
356       cval       = ib[j]*3;
357       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
358       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
359       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
360       /* (strict upper triangular part of A)*x  */
361       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
362       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
363       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
364       v  += 9;
365     }
366     xb +=3; ai++;
367   }
368 
369   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
370   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
371   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
377 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
378 {
379   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
380   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
381   MatScalar        *v;
382   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
383 
384   PetscFunctionBegin;
385   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
388 
389   v     = a->a;
390   xb = x;
391 
392   for (i=0; i<mbs; i++) {
393     n  = ai[1] - ai[0]; /* length of i_th block row of A */
394     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
395     ib = aj + *ai;
396     jmin = 0;
397     if (*ib == i){     /* (diag of A)*x */
398       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
399       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
400       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
401       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
402       v += 16; jmin++;
403     }
404     for (j=jmin; j<n; j++) {
405       /* (strict lower triangular part of A)*x  */
406       cval       = ib[j]*4;
407       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
408       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
409       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
410       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
411       /* (strict upper triangular part of A)*x  */
412       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
413       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
414       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
415       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
416       v  += 16;
417     }
418     xb +=4; ai++;
419   }
420 
421   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
422   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
423   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
424   PetscFunctionReturn(0);
425 }
426 
427 #undef __FUNCT__
428 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
429 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
430 {
431   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
432   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
433   MatScalar       *v;
434   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
435 
436   PetscFunctionBegin;
437   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
438   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
439   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
440 
441   v     = a->a;
442   xb = x;
443 
444   for (i=0; i<mbs; i++) {
445     n  = ai[1] - ai[0]; /* length of i_th block row of A */
446     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
447     ib = aj + *ai;
448     jmin = 0;
449     if (*ib == i){      /* (diag of A)*x */
450       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
451       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
452       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
453       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
454       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
455       v += 25; jmin++;
456     }
457     for (j=jmin; j<n; j++) {
458       /* (strict lower triangular part of A)*x  */
459       cval       = ib[j]*5;
460       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
461       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
462       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
463       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
464       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
465       /* (strict upper triangular part of A)*x  */
466       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
467       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
468       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
469       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
470       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
471       v  += 25;
472     }
473     xb +=5; ai++;
474   }
475 
476   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
477   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
478   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
479   PetscFunctionReturn(0);
480 }
481 
482 
483 #undef __FUNCT__
484 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
485 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
486 {
487   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
488   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
489   MatScalar       *v;
490   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
491 
492   PetscFunctionBegin;
493   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
494   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
495   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
496 
497   v     = a->a;
498   xb = x;
499 
500   for (i=0; i<mbs; i++) {
501     n  = ai[1] - ai[0]; /* length of i_th block row of A */
502     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
503     ib = aj + *ai;
504     jmin = 0;
505     if (*ib == i){      /* (diag of A)*x */
506       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
507       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
508       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
509       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
510       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
511       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
512       v += 36; jmin++;
513     }
514     for (j=jmin; j<n; j++) {
515       /* (strict lower triangular part of A)*x  */
516       cval       = ib[j]*6;
517       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
518       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
519       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
520       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
521       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
522       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
523       /* (strict upper triangular part of A)*x  */
524       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
525       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
526       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
527       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
528       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
529       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
530       v  += 36;
531     }
532     xb +=6; ai++;
533   }
534 
535   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
536   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
537   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
538   PetscFunctionReturn(0);
539 }
540 #undef __FUNCT__
541 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
542 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
545   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
546   MatScalar       *v;
547   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
548 
549   PetscFunctionBegin;
550   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
551   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
552   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
553 
554   v     = a->a;
555   xb = x;
556 
557   for (i=0; i<mbs; i++) {
558     n  = ai[1] - ai[0]; /* length of i_th block row of A */
559     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
560     ib = aj + *ai;
561     jmin = 0;
562     if (*ib == i){      /* (diag of A)*x */
563       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
564       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
565       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
566       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
567       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
568       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
569       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
570       v += 49; jmin++;
571     }
572     for (j=jmin; j<n; j++) {
573       /* (strict lower triangular part of A)*x  */
574       cval       = ib[j]*7;
575       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
576       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
577       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
578       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
579       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
580       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
581       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
582       /* (strict upper triangular part of A)*x  */
583       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
584       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
585       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
586       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
587       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
588       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
589       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
590       v  += 49;
591     }
592     xb +=7; ai++;
593   }
594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
596   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
597   PetscFunctionReturn(0);
598 }
599 
600 /*
601     This will not work with MatScalar == float because it calls the BLAS
602 */
603 #undef __FUNCT__
604 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
605 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
606 {
607   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
608   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
609   MatScalar       *v;
610   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
611   int             ncols,k;
612 
613   PetscFunctionBegin;
614   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
615   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
616   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
617 
618   aj   = a->j;
619   v    = a->a;
620   ii   = a->i;
621 
622   if (!a->mult_work) {
623     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
624   }
625   work = a->mult_work;
626 
627   for (i=0; i<mbs; i++) {
628     n     = ii[1] - ii[0]; ncols = n*bs;
629     workt = work; idx=aj+ii[0];
630 
631     /* upper triangular part */
632     for (j=0; j<n; j++) {
633       xb = x_ptr + bs*(*idx++);
634       for (k=0; k<bs; k++) workt[k] = xb[k];
635       workt += bs;
636     }
637     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
638     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
639 
640     /* strict lower triangular part */
641     idx = aj+ii[0];
642     if (*idx == i){
643       ncols -= bs; v += bs2; idx++; n--;
644     }
645 
646     if (ncols > 0){
647       workt = work;
648       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
649       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
650       for (j=0; j<n; j++) {
651         zb = z_ptr + bs*(*idx++);
652         for (k=0; k<bs; k++) zb[k] += workt[k] ;
653         workt += bs;
654       }
655     }
656     x += bs; v += n*bs2; z += bs; ii++;
657   }
658 
659   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
661   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
662   PetscFunctionReturn(0);
663 }
664 
665 #undef __FUNCT__
666 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
667 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
668 {
669   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
670   PetscScalar     *x,*y,*z,*xb,x1;
671   MatScalar       *v;
672   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
673 
674   PetscFunctionBegin;
675   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
676   if (yy != xx) {
677     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
678   } else {
679     y = x;
680   }
681   if (zz != yy) {
682     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
683     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
684     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
685   } else {
686     z = y;
687   }
688 
689   v  = a->a;
690   xb = x;
691 
692   for (i=0; i<mbs; i++) {
693     n  = ai[1] - ai[0];  /* length of i_th row of A */
694     x1 = xb[0];
695     ib = aj + *ai;
696     jmin = 0;
697     if (*ib == i) {            /* (diag of A)*x */
698       z[i] += *v++ * x[*ib++]; jmin++;
699     }
700     for (j=jmin; j<n; j++) {
701       cval    = *ib;
702       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
703       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
704     }
705     xb++; ai++;
706   }
707 
708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
709   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
710   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711 
712   PetscLogFlops(2*(a->nz*2 - A->m));
713   PetscFunctionReturn(0);
714 }
715 
716 #undef __FUNCT__
717 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
718 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
719 {
720   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
721   PetscScalar     *x,*y,*z,*xb,x1,x2;
722   MatScalar       *v;
723   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
724 
725   PetscFunctionBegin;
726   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
727   if (yy != xx) {
728     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
729   } else {
730     y = x;
731   }
732   if (zz != yy) {
733     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
734     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
735     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
736   } else {
737     z = y;
738   }
739 
740   v     = a->a;
741   xb = x;
742 
743   for (i=0; i<mbs; i++) {
744     n  = ai[1] - ai[0]; /* length of i_th block row of A */
745     x1 = xb[0]; x2 = xb[1];
746     ib = aj + *ai;
747     jmin = 0;
748     if (*ib == i){      /* (diag of A)*x */
749       z[2*i]   += v[0]*x1 + v[2]*x2;
750       z[2*i+1] += v[2]*x1 + v[3]*x2;
751       v += 4; jmin++;
752     }
753     for (j=jmin; j<n; j++) {
754       /* (strict lower triangular part of A)*x  */
755       cval       = ib[j]*2;
756       z[cval]     += v[0]*x1 + v[1]*x2;
757       z[cval+1]   += v[2]*x1 + v[3]*x2;
758       /* (strict upper triangular part of A)*x  */
759       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
760       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
761       v  += 4;
762     }
763     xb +=2; ai++;
764   }
765 
766   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
767   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
769 
770   PetscLogFlops(4*(a->nz*2 - A->m));
771   PetscFunctionReturn(0);
772 }
773 
774 #undef __FUNCT__
775 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
776 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
777 {
778   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
779   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
780   MatScalar       *v;
781   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
782 
783   PetscFunctionBegin;
784   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
785   if (yy != xx) {
786     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
787   } else {
788     y = x;
789   }
790   if (zz != yy) {
791     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
792     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
793     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
794   } else {
795     z = y;
796   }
797 
798   v     = a->a;
799   xb = x;
800 
801   for (i=0; i<mbs; i++) {
802     n  = ai[1] - ai[0]; /* length of i_th block row of A */
803     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
804     ib = aj + *ai;
805     jmin = 0;
806     if (*ib == i){     /* (diag of A)*x */
807      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
808      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
809      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810      v += 9; jmin++;
811     }
812     for (j=jmin; j<n; j++) {
813       /* (strict lower triangular part of A)*x  */
814       cval       = ib[j]*3;
815       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
816       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
817       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
818       /* (strict upper triangular part of A)*x  */
819       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
820       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
821       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
822       v  += 9;
823     }
824     xb +=3; ai++;
825   }
826 
827   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
828   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
829   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
830 
831   PetscLogFlops(18*(a->nz*2 - A->m));
832   PetscFunctionReturn(0);
833 }
834 
835 #undef __FUNCT__
836 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
837 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
838 {
839   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
840   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
841   MatScalar       *v;
842   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
843 
844   PetscFunctionBegin;
845   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
846   if (yy != xx) {
847     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
848   } else {
849     y = x;
850   }
851   if (zz != yy) {
852     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
853     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
854     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
855   } else {
856     z = y;
857   }
858 
859   v     = a->a;
860   xb = x;
861 
862   for (i=0; i<mbs; i++) {
863     n  = ai[1] - ai[0]; /* length of i_th block row of A */
864     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
865     ib = aj + *ai;
866     jmin = 0;
867     if (*ib == i){      /* (diag of A)*x */
868       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
869       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
870       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
871       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
872       v += 16; jmin++;
873     }
874     for (j=jmin; j<n; j++) {
875       /* (strict lower triangular part of A)*x  */
876       cval       = ib[j]*4;
877       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
878       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
879       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
880       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
881       /* (strict upper triangular part of A)*x  */
882       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
883       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
884       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
885       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
886       v  += 16;
887     }
888     xb +=4; ai++;
889   }
890 
891   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
892   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
893   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
894 
895   PetscLogFlops(32*(a->nz*2 - A->m));
896   PetscFunctionReturn(0);
897 }
898 
899 #undef __FUNCT__
900 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
901 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
904   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
905   MatScalar       *v;
906   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
907 
908   PetscFunctionBegin;
909   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
910   if (yy != xx) {
911     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
912   } else {
913     y = x;
914   }
915   if (zz != yy) {
916     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
917     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
918     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
919   } else {
920     z = y;
921   }
922 
923   v     = a->a;
924   xb = x;
925 
926   for (i=0; i<mbs; i++) {
927     n  = ai[1] - ai[0]; /* length of i_th block row of A */
928     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
929     ib = aj + *ai;
930     jmin = 0;
931     if (*ib == i){      /* (diag of A)*x */
932       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
933       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
934       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
935       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
936       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
937       v += 25; jmin++;
938     }
939     for (j=jmin; j<n; j++) {
940       /* (strict lower triangular part of A)*x  */
941       cval       = ib[j]*5;
942       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
943       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
944       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
945       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
946       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
947       /* (strict upper triangular part of A)*x  */
948       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
949       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
950       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
951       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
952       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
953       v  += 25;
954     }
955     xb +=5; ai++;
956   }
957 
958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
959   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
960   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
961 
962   PetscLogFlops(50*(a->nz*2 - A->m));
963   PetscFunctionReturn(0);
964 }
965 #undef __FUNCT__
966 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
967 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
968 {
969   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
970   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
971   MatScalar       *v;
972   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
973 
974   PetscFunctionBegin;
975   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
976   if (yy != xx) {
977     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
978   } else {
979     y = x;
980   }
981   if (zz != yy) {
982     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
983     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
984     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
985   } else {
986     z = y;
987   }
988 
989   v     = a->a;
990   xb = x;
991 
992   for (i=0; i<mbs; i++) {
993     n  = ai[1] - ai[0]; /* length of i_th block row of A */
994     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
995     ib = aj + *ai;
996     jmin = 0;
997     if (*ib == i){     /* (diag of A)*x */
998       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
999       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1000       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1001       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1002       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1003       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1004       v += 36; jmin++;
1005     }
1006     for (j=jmin; j<n; j++) {
1007       /* (strict lower triangular part of A)*x  */
1008       cval       = ib[j]*6;
1009       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1010       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1011       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1012       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1013       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1014       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015       /* (strict upper triangular part of A)*x  */
1016       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1017       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1018       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1019       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1020       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1021       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1022       v  += 36;
1023     }
1024     xb +=6; ai++;
1025   }
1026 
1027   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1028   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1029   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1030 
1031   PetscLogFlops(72*(a->nz*2 - A->m));
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 #undef __FUNCT__
1036 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1037 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1038 {
1039   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1040   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1041   MatScalar       *v;
1042   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1043 
1044   PetscFunctionBegin;
1045   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1046   if (yy != xx) {
1047     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1048   } else {
1049     y = x;
1050   }
1051   if (zz != yy) {
1052     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1053     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1054     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1055   } else {
1056     z = y;
1057   }
1058 
1059   v     = a->a;
1060   xb = x;
1061 
1062   for (i=0; i<mbs; i++) {
1063     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1064     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1065     ib = aj + *ai;
1066     jmin = 0;
1067     if (*ib == i){     /* (diag of A)*x */
1068       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1069       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1070       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1071       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1072       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1073       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1074       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1075       v += 49; jmin++;
1076     }
1077     for (j=jmin; j<n; j++) {
1078       /* (strict lower triangular part of A)*x  */
1079       cval       = ib[j]*7;
1080       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1081       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1082       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1083       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1084       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1085       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1086       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1087       /* (strict upper triangular part of A)*x  */
1088       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1089       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1090       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1091       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1092       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1093       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1094       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1095       v  += 49;
1096     }
1097     xb +=7; ai++;
1098   }
1099 
1100   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1101   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1102   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1103 
1104   PetscLogFlops(98*(a->nz*2 - A->m));
1105   PetscFunctionReturn(0);
1106 }
1107 
1108 #undef __FUNCT__
1109 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1110 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1111 {
1112   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1113   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1114   MatScalar       *v;
1115   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1116   int             ncols,k;
1117 
1118   PetscFunctionBegin;
1119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1120   if (yy != xx) {
1121     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1122   } else {
1123     y = x;
1124   }
1125   if (zz != yy) {
1126     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1127     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1128     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1129   } else {
1130     z = y;
1131   }
1132 
1133   aj   = a->j;
1134   v    = a->a;
1135   ii   = a->i;
1136 
1137   if (!a->mult_work) {
1138     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1139   }
1140   work = a->mult_work;
1141 
1142 
1143   for (i=0; i<mbs; i++) {
1144     n     = ii[1] - ii[0]; ncols = n*bs;
1145     workt = work; idx=aj+ii[0];
1146 
1147     /* upper triangular part */
1148     for (j=0; j<n; j++) {
1149       xb = x_ptr + bs*(*idx++);
1150       for (k=0; k<bs; k++) workt[k] = xb[k];
1151       workt += bs;
1152     }
1153     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1154     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1155 
1156     /* strict lower triangular part */
1157     idx = aj+ii[0];
1158     if (*idx == i){
1159       ncols -= bs; v += bs2; idx++; n--;
1160     }
1161     if (ncols > 0){
1162       workt = work;
1163       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1164       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1165       for (j=0; j<n; j++) {
1166         zb = z_ptr + bs*(*idx++);
1167         /* idx++; */
1168         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1169         workt += bs;
1170       }
1171     }
1172 
1173     x += bs; v += n*bs2; z += bs; ii++;
1174   }
1175 
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1178   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1179 
1180   PetscLogFlops(2*(a->nz*2 - A->m));
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 #undef __FUNCT__
1185 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1186 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1187 {
1188   int ierr;
1189 
1190   PetscFunctionBegin;
1191   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 #undef __FUNCT__
1196 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1197 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1198 
1199 {
1200   int ierr;
1201 
1202   PetscFunctionBegin;
1203   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 #undef __FUNCT__
1208 #define __FUNCT__ "MatScale_SeqSBAIJ"
1209 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1210 {
1211   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1212   int         one = 1,totalnz = a->bs2*a->nz;
1213 
1214   PetscFunctionBegin;
1215   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1216   PetscLogFlops(totalnz);
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 #undef __FUNCT__
1221 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1222 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1223 {
1224   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1225   MatScalar   *v = a->a;
1226   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1227   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1228   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1229 
1230   PetscFunctionBegin;
1231   if (type == NORM_FROBENIUS) {
1232     for (k=0; k<mbs; k++){
1233       jmin = a->i[k]; jmax = a->i[k+1];
1234       col  = aj + jmin;
1235       if (*col == k){         /* diagonal block */
1236         for (i=0; i<bs2; i++){
1237 #if defined(PETSC_USE_COMPLEX)
1238           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1239 #else
1240           sum_diag += (*v)*(*v); v++;
1241 #endif
1242         }
1243         jmin++;
1244       }
1245       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1246         for (i=0; i<bs2; i++){
1247 #if defined(PETSC_USE_COMPLEX)
1248           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1249 #else
1250           sum_off += (*v)*(*v); v++;
1251 #endif
1252         }
1253       }
1254     }
1255     *norm = sqrt(sum_diag + 2*sum_off);
1256 
1257   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1258     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1259     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1260     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1261     for (i=0; i<mbs; i++) {
1262       jl[i] = mbs; il[0] = 0;
1263     }
1264 
1265     *norm = 0.0;
1266     for (k=0; k<mbs; k++) { /* k_th block row */
1267       for (j=0; j<bs; j++) sum[j]=0.0;
1268 
1269       /*-- col sum --*/
1270       i = jl[k]; /* first |A(i,k)| to be added */
1271       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1272                   at step k */
1273       while (i<mbs){
1274         nexti = jl[i];  /* next block row to be added */
1275         ik    = il[i];  /* block index of A(i,k) in the array a */
1276         for (j=0; j<bs; j++){
1277           v = a->a + ik*bs2 + j*bs;
1278           for (k1=0; k1<bs; k1++) {
1279             sum[j] += PetscAbsScalar(*v); v++;
1280           }
1281         }
1282         /* update il, jl */
1283         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1284         jmax = a->i[i+1];
1285         if (jmin < jmax){
1286           il[i] = jmin;
1287           j   = a->j[jmin];
1288           jl[i] = jl[j]; jl[j]=i;
1289         }
1290         i = nexti;
1291       }
1292 
1293       /*-- row sum --*/
1294       jmin = a->i[k]; jmax = a->i[k+1];
1295       for (i=jmin; i<jmax; i++) {
1296         for (j=0; j<bs; j++){
1297           v = a->a + i*bs2 + j;
1298           for (k1=0; k1<bs; k1++){
1299             sum[j] += PetscAbsScalar(*v);
1300             v   += bs;
1301           }
1302         }
1303       }
1304       /* add k_th block row to il, jl */
1305       col = aj+jmin;
1306       if (*col == k) jmin++;
1307       if (jmin < jmax){
1308         il[k] = jmin;
1309         j   = a->j[jmin];
1310         jl[k] = jl[j]; jl[j] = k;
1311       }
1312       for (j=0; j<bs; j++){
1313         if (sum[j] > *norm) *norm = sum[j];
1314       }
1315     }
1316     ierr = PetscFree(il);CHKERRQ(ierr);
1317     ierr = PetscFree(jl);CHKERRQ(ierr);
1318     ierr = PetscFree(sum);CHKERRQ(ierr);
1319   } else {
1320     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1321   }
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 #undef __FUNCT__
1326 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1327 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1328 {
1329   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1330   int          ierr;
1331 
1332   PetscFunctionBegin;
1333 
1334   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1335   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1336     *flg = PETSC_FALSE;
1337     PetscFunctionReturn(0);
1338   }
1339 
1340   /* if the a->i are the same */
1341   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1342   if (*flg == PETSC_FALSE) {
1343     PetscFunctionReturn(0);
1344   }
1345 
1346   /* if a->j are the same */
1347   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1348   if (*flg == PETSC_FALSE) {
1349     PetscFunctionReturn(0);
1350   }
1351   /* if a->a are the same */
1352   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1353 
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 #undef __FUNCT__
1358 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1359 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1360 {
1361   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1362   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1363   PetscScalar  *x,zero = 0.0;
1364   MatScalar    *aa,*aa_j;
1365 
1366   PetscFunctionBegin;
1367   bs   = a->bs;
1368   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1369 
1370   aa   = a->a;
1371   ai   = a->i;
1372   aj   = a->j;
1373   ambs = a->mbs;
1374   bs2  = a->bs2;
1375 
1376   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1377   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1378   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1379   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1380   for (i=0; i<ambs; i++) {
1381     j=ai[i];
1382     if (aj[j] == i) {             /* if this is a diagonal element */
1383       row  = i*bs;
1384       aa_j = aa + j*bs2;
1385       if (A->factor && bs==1){
1386         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1387       } else {
1388         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1389       }
1390     }
1391   }
1392 
1393   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1394   PetscFunctionReturn(0);
1395 }
1396 
1397 #undef __FUNCT__
1398 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1399 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1400 {
1401   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1402   PetscScalar  *l,*r,x,*li,*ri;
1403   MatScalar    *aa,*v;
1404   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1405 
1406   PetscFunctionBegin;
1407   ai  = a->i;
1408   aj  = a->j;
1409   aa  = a->a;
1410   m   = A->m;
1411   bs  = a->bs;
1412   mbs = a->mbs;
1413   bs2 = a->bs2;
1414 
1415   if (ll != rr) {
1416     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1417   }
1418   if (ll) {
1419     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1420     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1421     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1422     for (i=0; i<mbs; i++) { /* for each block row */
1423       M  = ai[i+1] - ai[i];
1424       li = l + i*bs;
1425       v  = aa + bs2*ai[i];
1426       for (j=0; j<M; j++) { /* for each block */
1427         for (k=0; k<bs2; k++) {
1428           (*v++) *= li[k%bs];
1429         }
1430 #ifdef CONT
1431         /* will be used to replace the above loop */
1432         ri = l + bs*aj[ai[i]+j];
1433         for (k=0; k<bs; k++) { /* column value */
1434           x = ri[k];
1435           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1436         }
1437 #endif
1438 
1439       }
1440     }
1441     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1442     PetscLogFlops(2*a->nz);
1443   }
1444   /* will be deleted */
1445   if (rr) {
1446     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1447     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1448     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1449     for (i=0; i<mbs; i++) { /* for each block row */
1450       M  = ai[i+1] - ai[i];
1451       v  = aa + bs2*ai[i];
1452       for (j=0; j<M; j++) { /* for each block */
1453         ri = r + bs*aj[ai[i]+j];
1454         for (k=0; k<bs; k++) {
1455           x = ri[k];
1456           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1457         }
1458       }
1459     }
1460     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1461     PetscLogFlops(a->nz);
1462   }
1463   PetscFunctionReturn(0);
1464 }
1465 
1466 #undef __FUNCT__
1467 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1468 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1469 {
1470   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1471 
1472   PetscFunctionBegin;
1473   info->rows_global    = (double)A->m;
1474   info->columns_global = (double)A->m;
1475   info->rows_local     = (double)A->m;
1476   info->columns_local  = (double)A->m;
1477   info->block_size     = a->bs2;
1478   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1479   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1480   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1481   info->assemblies   = A->num_ass;
1482   info->mallocs      = a->reallocs;
1483   info->memory       = A->mem;
1484   if (A->factor) {
1485     info->fill_ratio_given  = A->info.fill_ratio_given;
1486     info->fill_ratio_needed = A->info.fill_ratio_needed;
1487     info->factor_mallocs    = A->info.factor_mallocs;
1488   } else {
1489     info->fill_ratio_given  = 0;
1490     info->fill_ratio_needed = 0;
1491     info->factor_mallocs    = 0;
1492   }
1493   PetscFunctionReturn(0);
1494 }
1495 
1496 
1497 #undef __FUNCT__
1498 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1499 int MatZeroEntries_SeqSBAIJ(Mat A)
1500 {
1501   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1502   int         ierr;
1503 
1504   PetscFunctionBegin;
1505   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1506   PetscFunctionReturn(0);
1507 }
1508 
1509 #undef __FUNCT__
1510 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1511 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1512 {
1513   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1514   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1515   PetscReal    atmp;
1516   MatScalar    *aa;
1517   PetscScalar  zero = 0.0,*x;
1518   int          ncols,brow,bcol,krow,kcol;
1519 
1520   PetscFunctionBegin;
1521   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1522   bs   = a->bs;
1523   aa   = a->a;
1524   ai   = a->i;
1525   aj   = a->j;
1526   mbs = a->mbs;
1527 
1528   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1529   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1530   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1531   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1532   for (i=0; i<mbs; i++) {
1533     ncols = ai[1] - ai[0]; ai++;
1534     brow  = bs*i;
1535     for (j=0; j<ncols; j++){
1536       bcol = bs*(*aj);
1537       for (kcol=0; kcol<bs; kcol++){
1538         col = bcol + kcol;      /* col index */
1539         for (krow=0; krow<bs; krow++){
1540           atmp = PetscAbsScalar(*aa); aa++;
1541           row = brow + krow;    /* row index */
1542           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1543           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1544           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1545         }
1546       }
1547       aj++;
1548     }
1549   }
1550   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1551   PetscFunctionReturn(0);
1552 }
1553