xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision be1d678a52e6eff2808b2fa31ae986cdbf03c9fe)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2;
102   PetscInt       *aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
110   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
144     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
145     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
146   }
147   c = (Mat_SeqSBAIJ *)(C->data);
148   for (i=0; i<nrows; i++) {
149     row    = irow[i];
150     kstart = ai[row];
151     kend   = kstart + a->ilen[row];
152     mat_i  = c->i[i];
153     mat_j  = c->j + mat_i;
154     mat_a  = c->a + mat_i*bs2;
155     mat_ilen = c->ilen + i;
156     for (k=kstart; k<kend; k++) {
157       if ((tcol=ssmap[a->j[k]])) {
158         *mat_j++ = tcol - 1;
159         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
160         mat_a   += bs2;
161         (*mat_ilen)++;
162       }
163     }
164   }
165 
166   /* Free work space */
167   ierr = PetscFree(smap);CHKERRQ(ierr);
168   ierr = PetscFree(lens);CHKERRQ(ierr);
169   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171 
172   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
173   *B = C;
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
180 {
181   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
182   IS             is1;
183   PetscErrorCode ierr;
184   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->bs,count;
185 
186   PetscFunctionBegin;
187   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
188 
189   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
190   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
191 
192   /* Verify if the indices corespond to each element in a block
193    and form the IS with compressed IS */
194   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
195   iary = vary + a->mbs;
196   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
197   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
198 
199   count = 0;
200   for (i=0; i<a->mbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
202     if (vary[i]==bs) iary[count++] = i;
203   }
204   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
205 
206   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
207   ierr = PetscFree(vary);CHKERRQ(ierr);
208 
209   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
210   ISDestroy(is1);
211   PetscFunctionReturn(0);
212 }
213 
214 #undef __FUNCT__
215 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
216 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
217 {
218   PetscErrorCode ierr;
219   PetscInt       i;
220 
221   PetscFunctionBegin;
222   if (scall == MAT_INITIAL_MATRIX) {
223     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
224   }
225 
226   for (i=0; i<n; i++) {
227     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
228   }
229   PetscFunctionReturn(0);
230 }
231 
232 /* -------------------------------------------------------*/
233 /* Should check that shapes of vectors and matrices match */
234 /* -------------------------------------------------------*/
235 #include "petscblaslapack.h"
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
239 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
240 {
241   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
242   PetscScalar    *x,*z,*xb,x1,zero=0.0;
243   MatScalar      *v;
244   PetscErrorCode ierr;
245   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
246 
247   PetscFunctionBegin;
248   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
249   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
251 
252   v  = a->a;
253   xb = x;
254 
255   for (i=0; i<mbs; i++) {
256     n  = ai[1] - ai[0];  /* length of i_th row of A */
257     x1 = xb[0];
258     ib = aj + *ai;
259     jmin = 0;
260     if (*ib == i) {      /* (diag of A)*x */
261       z[i] += *v++ * x[*ib++];
262       jmin++;
263     }
264     for (j=jmin; j<n; j++) {
265       cval    = *ib;
266       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
267       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
268     }
269     xb++; ai++;
270   }
271 
272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
274   ierr = PetscLogFlops(2*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);  /* nz = (nz+m)/2 */
275   PetscFunctionReturn(0);
276 }
277 
278 #undef __FUNCT__
279 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
280 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
283   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
284   MatScalar      *v;
285   PetscErrorCode ierr;
286   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
287 
288   PetscFunctionBegin;
289   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
292 
293   v     = a->a;
294   xb = x;
295 
296   for (i=0; i<mbs; i++) {
297     n  = ai[1] - ai[0]; /* length of i_th block row of A */
298     x1 = xb[0]; x2 = xb[1];
299     ib = aj + *ai;
300     jmin = 0;
301     if (*ib == i){     /* (diag of A)*x */
302       z[2*i]   += v[0]*x1 + v[2]*x2;
303       z[2*i+1] += v[2]*x1 + v[3]*x2;
304       v += 4; jmin++;
305     }
306     for (j=jmin; j<n; j++) {
307       /* (strict lower triangular part of A)*x  */
308       cval       = ib[j]*2;
309       z[cval]     += v[0]*x1 + v[1]*x2;
310       z[cval+1]   += v[2]*x1 + v[3]*x2;
311       /* (strict upper triangular part of A)*x  */
312       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
313       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
314       v  += 4;
315     }
316     xb +=2; ai++;
317   }
318 
319   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   ierr = PetscLogFlops(8*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
322   PetscFunctionReturn(0);
323 }
324 
325 #undef __FUNCT__
326 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
327 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
328 {
329   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
330   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
331   MatScalar      *v;
332   PetscErrorCode ierr;
333   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
334 
335   PetscFunctionBegin;
336   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
337   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
339 
340   v    = a->a;
341   xb   = x;
342 
343   for (i=0; i<mbs; i++) {
344     n  = ai[1] - ai[0]; /* length of i_th block row of A */
345     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
346     ib = aj + *ai;
347     jmin = 0;
348     if (*ib == i){     /* (diag of A)*x */
349       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
350       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
351       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
352       v += 9; jmin++;
353     }
354     for (j=jmin; j<n; j++) {
355       /* (strict lower triangular part of A)*x  */
356       cval       = ib[j]*3;
357       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
358       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
359       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
360       /* (strict upper triangular part of A)*x  */
361       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
362       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
363       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
364       v  += 9;
365     }
366     xb +=3; ai++;
367   }
368 
369   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
370   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
371   ierr = PetscLogFlops(18*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
377 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
378 {
379   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
380   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
381   MatScalar      *v;
382   PetscErrorCode ierr;
383   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
384 
385   PetscFunctionBegin;
386   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
387   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
388   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
389 
390   v     = a->a;
391   xb = x;
392 
393   for (i=0; i<mbs; i++) {
394     n  = ai[1] - ai[0]; /* length of i_th block row of A */
395     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
396     ib = aj + *ai;
397     jmin = 0;
398     if (*ib == i){     /* (diag of A)*x */
399       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
400       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
401       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
402       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
403       v += 16; jmin++;
404     }
405     for (j=jmin; j<n; j++) {
406       /* (strict lower triangular part of A)*x  */
407       cval       = ib[j]*4;
408       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
409       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
410       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
411       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
412       /* (strict upper triangular part of A)*x  */
413       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
414       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
415       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
416       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
417       v  += 16;
418     }
419     xb +=4; ai++;
420   }
421 
422   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
423   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
424   ierr = PetscLogFlops(32*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
425   PetscFunctionReturn(0);
426 }
427 
428 #undef __FUNCT__
429 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
430 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
431 {
432   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
433   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
434   MatScalar      *v;
435   PetscErrorCode ierr;
436   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
437 
438   PetscFunctionBegin;
439   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
440   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
442 
443   v     = a->a;
444   xb = x;
445 
446   for (i=0; i<mbs; i++) {
447     n  = ai[1] - ai[0]; /* length of i_th block row of A */
448     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
449     ib = aj + *ai;
450     jmin = 0;
451     if (*ib == i){      /* (diag of A)*x */
452       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
453       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
454       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
455       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
456       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
457       v += 25; jmin++;
458     }
459     for (j=jmin; j<n; j++) {
460       /* (strict lower triangular part of A)*x  */
461       cval       = ib[j]*5;
462       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
463       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
464       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
465       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
466       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
467       /* (strict upper triangular part of A)*x  */
468       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
469       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
470       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
471       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
472       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
473       v  += 25;
474     }
475     xb +=5; ai++;
476   }
477 
478   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
480   ierr = PetscLogFlops(50*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
481   PetscFunctionReturn(0);
482 }
483 
484 
485 #undef __FUNCT__
486 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
487 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
490   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
491   MatScalar      *v;
492   PetscErrorCode ierr;
493   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
494 
495   PetscFunctionBegin;
496   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
497   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
498   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
499 
500   v     = a->a;
501   xb = x;
502 
503   for (i=0; i<mbs; i++) {
504     n  = ai[1] - ai[0]; /* length of i_th block row of A */
505     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
506     ib = aj + *ai;
507     jmin = 0;
508     if (*ib == i){      /* (diag of A)*x */
509       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
510       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
511       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
512       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
513       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
514       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
515       v += 36; jmin++;
516     }
517     for (j=jmin; j<n; j++) {
518       /* (strict lower triangular part of A)*x  */
519       cval       = ib[j]*6;
520       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
521       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
522       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
523       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
524       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
525       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
526       /* (strict upper triangular part of A)*x  */
527       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
528       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
529       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
530       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
531       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
532       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
533       v  += 36;
534     }
535     xb +=6; ai++;
536   }
537 
538   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
539   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
540   ierr = PetscLogFlops(72*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
541   PetscFunctionReturn(0);
542 }
543 #undef __FUNCT__
544 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
545 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
546 {
547   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
548   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
549   MatScalar      *v;
550   PetscErrorCode ierr;
551   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
552 
553   PetscFunctionBegin;
554   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
555   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
556   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
557 
558   v     = a->a;
559   xb = x;
560 
561   for (i=0; i<mbs; i++) {
562     n  = ai[1] - ai[0]; /* length of i_th block row of A */
563     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
564     ib = aj + *ai;
565     jmin = 0;
566     if (*ib == i){      /* (diag of A)*x */
567       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
568       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
569       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
570       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
571       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
572       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
573       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
574       v += 49; jmin++;
575     }
576     for (j=jmin; j<n; j++) {
577       /* (strict lower triangular part of A)*x  */
578       cval       = ib[j]*7;
579       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
580       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
581       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
582       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
583       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
584       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
585       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
586       /* (strict upper triangular part of A)*x  */
587       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
588       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
589       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
590       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
591       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
592       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
593       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
594       v  += 49;
595     }
596     xb +=7; ai++;
597   }
598   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
599   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
600   ierr = PetscLogFlops(98*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
601   PetscFunctionReturn(0);
602 }
603 
604 /*
605     This will not work with MatScalar == float because it calls the BLAS
606 */
607 #undef __FUNCT__
608 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
609 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
610 {
611   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
612   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
613   MatScalar      *v;
614   PetscErrorCode ierr;
615   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
616 
617   PetscFunctionBegin;
618   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
619   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
620   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
621 
622   aj   = a->j;
623   v    = a->a;
624   ii   = a->i;
625 
626   if (!a->mult_work) {
627     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
628   }
629   work = a->mult_work;
630 
631   for (i=0; i<mbs; i++) {
632     n     = ii[1] - ii[0]; ncols = n*bs;
633     workt = work; idx=aj+ii[0];
634 
635     /* upper triangular part */
636     for (j=0; j<n; j++) {
637       xb = x_ptr + bs*(*idx++);
638       for (k=0; k<bs; k++) workt[k] = xb[k];
639       workt += bs;
640     }
641     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
642     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
643 
644     /* strict lower triangular part */
645     idx = aj+ii[0];
646     if (*idx == i){
647       ncols -= bs; v += bs2; idx++; n--;
648     }
649 
650     if (ncols > 0){
651       workt = work;
652       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
653       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
654       for (j=0; j<n; j++) {
655         zb = z_ptr + bs*(*idx++);
656         for (k=0; k<bs; k++) zb[k] += workt[k] ;
657         workt += bs;
658       }
659     }
660     x += bs; v += n*bs2; z += bs; ii++;
661   }
662 
663   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
664   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
665   ierr = PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);CHKERRQ(ierr);
666   PetscFunctionReturn(0);
667 }
668 
669 #undef __FUNCT__
670 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
671 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
672 {
673   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
674   PetscScalar    *x,*y,*z,*xb,x1;
675   MatScalar      *v;
676   PetscErrorCode ierr;
677   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
678 
679   PetscFunctionBegin;
680   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
681   if (yy != xx) {
682     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
683   } else {
684     y = x;
685   }
686   if (zz != yy) {
687     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
688     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
689     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
690   } else {
691     z = y;
692   }
693 
694   v  = a->a;
695   xb = x;
696 
697   for (i=0; i<mbs; i++) {
698     n  = ai[1] - ai[0];  /* length of i_th row of A */
699     x1 = xb[0];
700     ib = aj + *ai;
701     jmin = 0;
702     if (*ib == i) {            /* (diag of A)*x */
703       z[i] += *v++ * x[*ib++]; jmin++;
704     }
705     for (j=jmin; j<n; j++) {
706       cval    = *ib;
707       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
708       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
709     }
710     xb++; ai++;
711   }
712 
713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
714   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
715   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
716 
717   ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr);
718   PetscFunctionReturn(0);
719 }
720 
721 #undef __FUNCT__
722 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
723 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
724 {
725   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
726   PetscScalar    *x,*y,*z,*xb,x1,x2;
727   MatScalar      *v;
728   PetscErrorCode ierr;
729   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
730 
731   PetscFunctionBegin;
732   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
733   if (yy != xx) {
734     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
735   } else {
736     y = x;
737   }
738   if (zz != yy) {
739     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
740     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
741     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
742   } else {
743     z = y;
744   }
745 
746   v     = a->a;
747   xb = x;
748 
749   for (i=0; i<mbs; i++) {
750     n  = ai[1] - ai[0]; /* length of i_th block row of A */
751     x1 = xb[0]; x2 = xb[1];
752     ib = aj + *ai;
753     jmin = 0;
754     if (*ib == i){      /* (diag of A)*x */
755       z[2*i]   += v[0]*x1 + v[2]*x2;
756       z[2*i+1] += v[2]*x1 + v[3]*x2;
757       v += 4; jmin++;
758     }
759     for (j=jmin; j<n; j++) {
760       /* (strict lower triangular part of A)*x  */
761       cval       = ib[j]*2;
762       z[cval]     += v[0]*x1 + v[1]*x2;
763       z[cval+1]   += v[2]*x1 + v[3]*x2;
764       /* (strict upper triangular part of A)*x  */
765       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
766       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
767       v  += 4;
768     }
769     xb +=2; ai++;
770   }
771 
772   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
773   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
774   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
775 
776   ierr = PetscLogFlops(4*(a->nz*2 - A->m));CHKERRQ(ierr);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNCT__
781 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
782 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
783 {
784   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
785   PetscScalar    *x,*y,*z,*xb,x1,x2,x3;
786   MatScalar      *v;
787   PetscErrorCode ierr;
788   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
789 
790   PetscFunctionBegin;
791   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
792   if (yy != xx) {
793     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
794   } else {
795     y = x;
796   }
797   if (zz != yy) {
798     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
799     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
800     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
801   } else {
802     z = y;
803   }
804 
805   v     = a->a;
806   xb = x;
807 
808   for (i=0; i<mbs; i++) {
809     n  = ai[1] - ai[0]; /* length of i_th block row of A */
810     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
811     ib = aj + *ai;
812     jmin = 0;
813     if (*ib == i){     /* (diag of A)*x */
814      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
815      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
816      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
817      v += 9; jmin++;
818     }
819     for (j=jmin; j<n; j++) {
820       /* (strict lower triangular part of A)*x  */
821       cval       = ib[j]*3;
822       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
823       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
824       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
825       /* (strict upper triangular part of A)*x  */
826       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
827       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
828       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
829       v  += 9;
830     }
831     xb +=3; ai++;
832   }
833 
834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
835   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
836   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
837 
838   ierr = PetscLogFlops(18*(a->nz*2 - A->m));CHKERRQ(ierr);
839   PetscFunctionReturn(0);
840 }
841 
842 #undef __FUNCT__
843 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
844 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
845 {
846   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
847   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4;
848   MatScalar      *v;
849   PetscErrorCode ierr;
850   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
851 
852   PetscFunctionBegin;
853   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
854   if (yy != xx) {
855     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
856   } else {
857     y = x;
858   }
859   if (zz != yy) {
860     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
861     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
862     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
863   } else {
864     z = y;
865   }
866 
867   v     = a->a;
868   xb = x;
869 
870   for (i=0; i<mbs; i++) {
871     n  = ai[1] - ai[0]; /* length of i_th block row of A */
872     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
873     ib = aj + *ai;
874     jmin = 0;
875     if (*ib == i){      /* (diag of A)*x */
876       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
877       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
878       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
879       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
880       v += 16; jmin++;
881     }
882     for (j=jmin; j<n; j++) {
883       /* (strict lower triangular part of A)*x  */
884       cval       = ib[j]*4;
885       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
886       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
887       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
888       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
889       /* (strict upper triangular part of A)*x  */
890       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
891       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
892       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
893       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
894       v  += 16;
895     }
896     xb +=4; ai++;
897   }
898 
899   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
900   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
901   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
902 
903   ierr = PetscLogFlops(32*(a->nz*2 - A->m));CHKERRQ(ierr);
904   PetscFunctionReturn(0);
905 }
906 
907 #undef __FUNCT__
908 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
909 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
910 {
911   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
912   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5;
913   MatScalar      *v;
914   PetscErrorCode ierr;
915   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
916 
917   PetscFunctionBegin;
918   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
919   if (yy != xx) {
920     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
921   } else {
922     y = x;
923   }
924   if (zz != yy) {
925     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
926     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
927     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
928   } else {
929     z = y;
930   }
931 
932   v     = a->a;
933   xb = x;
934 
935   for (i=0; i<mbs; i++) {
936     n  = ai[1] - ai[0]; /* length of i_th block row of A */
937     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
938     ib = aj + *ai;
939     jmin = 0;
940     if (*ib == i){      /* (diag of A)*x */
941       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
942       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
943       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
944       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
945       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
946       v += 25; jmin++;
947     }
948     for (j=jmin; j<n; j++) {
949       /* (strict lower triangular part of A)*x  */
950       cval       = ib[j]*5;
951       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
952       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
953       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
954       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
955       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
956       /* (strict upper triangular part of A)*x  */
957       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
958       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
959       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
960       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
961       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
962       v  += 25;
963     }
964     xb +=5; ai++;
965   }
966 
967   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
968   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
969   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
970 
971   ierr = PetscLogFlops(50*(a->nz*2 - A->m));CHKERRQ(ierr);
972   PetscFunctionReturn(0);
973 }
974 #undef __FUNCT__
975 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
976 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
977 {
978   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
979   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
980   MatScalar      *v;
981   PetscErrorCode ierr;
982   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
983 
984   PetscFunctionBegin;
985   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
986   if (yy != xx) {
987     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
988   } else {
989     y = x;
990   }
991   if (zz != yy) {
992     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
993     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
994   } else {
995     z = y;
996   }
997 
998   v     = a->a;
999   xb = x;
1000 
1001   for (i=0; i<mbs; i++) {
1002     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1003     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1004     ib = aj + *ai;
1005     jmin = 0;
1006     if (*ib == i){     /* (diag of A)*x */
1007       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1008       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1009       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1010       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1011       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1012       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1013       v += 36; jmin++;
1014     }
1015     for (j=jmin; j<n; j++) {
1016       /* (strict lower triangular part of A)*x  */
1017       cval       = ib[j]*6;
1018       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1019       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1020       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1021       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1022       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1023       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1024       /* (strict upper triangular part of A)*x  */
1025       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1026       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1027       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1028       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1029       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1030       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1031       v  += 36;
1032     }
1033     xb +=6; ai++;
1034   }
1035 
1036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1037   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1038   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1039 
1040   ierr = PetscLogFlops(72*(a->nz*2 - A->m));CHKERRQ(ierr);
1041   PetscFunctionReturn(0);
1042 }
1043 
1044 #undef __FUNCT__
1045 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1046 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1047 {
1048   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1049   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1050   MatScalar      *v;
1051   PetscErrorCode ierr;
1052   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1053 
1054   PetscFunctionBegin;
1055   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1056   if (yy != xx) {
1057     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1058   } else {
1059     y = x;
1060   }
1061   if (zz != yy) {
1062     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1063     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1064   } else {
1065     z = y;
1066   }
1067 
1068   v     = a->a;
1069   xb = x;
1070 
1071   for (i=0; i<mbs; i++) {
1072     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1073     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1074     ib = aj + *ai;
1075     jmin = 0;
1076     if (*ib == i){     /* (diag of A)*x */
1077       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1078       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1079       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1080       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1081       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1082       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1083       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1084       v += 49; jmin++;
1085     }
1086     for (j=jmin; j<n; j++) {
1087       /* (strict lower triangular part of A)*x  */
1088       cval       = ib[j]*7;
1089       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1090       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1091       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1092       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1093       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1094       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1095       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1096       /* (strict upper triangular part of A)*x  */
1097       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1098       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1099       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1100       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1101       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1102       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1103       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1104       v  += 49;
1105     }
1106     xb +=7; ai++;
1107   }
1108 
1109   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1110   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1111   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1112 
1113   ierr = PetscLogFlops(98*(a->nz*2 - A->m));CHKERRQ(ierr);
1114   PetscFunctionReturn(0);
1115 }
1116 
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1119 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1120 {
1121   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1122   PetscScalar    *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1123   MatScalar      *v;
1124   PetscErrorCode ierr;
1125   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
1126 
1127   PetscFunctionBegin;
1128   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1129   if (yy != xx) {
1130     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1131   } else {
1132     y = x;
1133   }
1134   if (zz != yy) {
1135     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1136     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1137   } else {
1138     z = y;
1139   }
1140 
1141   aj   = a->j;
1142   v    = a->a;
1143   ii   = a->i;
1144 
1145   if (!a->mult_work) {
1146     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1147   }
1148   work = a->mult_work;
1149 
1150 
1151   for (i=0; i<mbs; i++) {
1152     n     = ii[1] - ii[0]; ncols = n*bs;
1153     workt = work; idx=aj+ii[0];
1154 
1155     /* upper triangular part */
1156     for (j=0; j<n; j++) {
1157       xb = x_ptr + bs*(*idx++);
1158       for (k=0; k<bs; k++) workt[k] = xb[k];
1159       workt += bs;
1160     }
1161     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1162     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1163 
1164     /* strict lower triangular part */
1165     idx = aj+ii[0];
1166     if (*idx == i){
1167       ncols -= bs; v += bs2; idx++; n--;
1168     }
1169     if (ncols > 0){
1170       workt = work;
1171       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1172       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1173       for (j=0; j<n; j++) {
1174         zb = z_ptr + bs*(*idx++);
1175         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1176         workt += bs;
1177       }
1178     }
1179 
1180     x += bs; v += n*bs2; z += bs; ii++;
1181   }
1182 
1183   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1184   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1185   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1186 
1187   ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 #undef __FUNCT__
1192 #define __FUNCT__ "MatScale_SeqSBAIJ"
1193 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1194 {
1195   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1196   PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1197   PetscErrorCode ierr;
1198 
1199   PetscFunctionBegin;
1200   BLASscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1201   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 #undef __FUNCT__
1206 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1207 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1208 {
1209   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1210   MatScalar      *v = a->a;
1211   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1212   PetscInt       i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1213   PetscErrorCode ierr;
1214   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1215 
1216   PetscFunctionBegin;
1217   if (type == NORM_FROBENIUS) {
1218     for (k=0; k<mbs; k++){
1219       jmin = a->i[k]; jmax = a->i[k+1];
1220       col  = aj + jmin;
1221       if (*col == k){         /* diagonal block */
1222         for (i=0; i<bs2; i++){
1223 #if defined(PETSC_USE_COMPLEX)
1224           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1225 #else
1226           sum_diag += (*v)*(*v); v++;
1227 #endif
1228         }
1229         jmin++;
1230       }
1231       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1232         for (i=0; i<bs2; i++){
1233 #if defined(PETSC_USE_COMPLEX)
1234           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1235 #else
1236           sum_off += (*v)*(*v); v++;
1237 #endif
1238         }
1239       }
1240     }
1241     *norm = sqrt(sum_diag + 2*sum_off);
1242   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1243     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1244     jl   = il + mbs;
1245     sum  = (PetscReal*)(jl + mbs);
1246     for (i=0; i<mbs; i++) jl[i] = mbs;
1247     il[0] = 0;
1248 
1249     *norm = 0.0;
1250     for (k=0; k<mbs; k++) { /* k_th block row */
1251       for (j=0; j<bs; j++) sum[j]=0.0;
1252       /*-- col sum --*/
1253       i = jl[k]; /* first |A(i,k)| to be added */
1254       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1255                   at step k */
1256       while (i<mbs){
1257         nexti = jl[i];  /* next block row to be added */
1258         ik    = il[i];  /* block index of A(i,k) in the array a */
1259         for (j=0; j<bs; j++){
1260           v = a->a + ik*bs2 + j*bs;
1261           for (k1=0; k1<bs; k1++) {
1262             sum[j] += PetscAbsScalar(*v); v++;
1263           }
1264         }
1265         /* update il, jl */
1266         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1267         jmax = a->i[i+1];
1268         if (jmin < jmax){
1269           il[i] = jmin;
1270           j   = a->j[jmin];
1271           jl[i] = jl[j]; jl[j]=i;
1272         }
1273         i = nexti;
1274       }
1275       /*-- row sum --*/
1276       jmin = a->i[k]; jmax = a->i[k+1];
1277       for (i=jmin; i<jmax; i++) {
1278         for (j=0; j<bs; j++){
1279           v = a->a + i*bs2 + j;
1280           for (k1=0; k1<bs; k1++){
1281             sum[j] += PetscAbsScalar(*v); v += bs;
1282           }
1283         }
1284       }
1285       /* add k_th block row to il, jl */
1286       col = aj+jmin;
1287       if (*col == k) jmin++;
1288       if (jmin < jmax){
1289         il[k] = jmin;
1290         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1291       }
1292       for (j=0; j<bs; j++){
1293         if (sum[j] > *norm) *norm = sum[j];
1294       }
1295     }
1296     ierr = PetscFree(il);CHKERRQ(ierr);
1297   } else {
1298     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1299   }
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 #undef __FUNCT__
1304 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1305 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1306 {
1307   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1308   PetscErrorCode ierr;
1309 
1310   PetscFunctionBegin;
1311 
1312   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1313   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1314     *flg = PETSC_FALSE;
1315     PetscFunctionReturn(0);
1316   }
1317 
1318   /* if the a->i are the same */
1319   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1320   if (!*flg) {
1321     PetscFunctionReturn(0);
1322   }
1323 
1324   /* if a->j are the same */
1325   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1326   if (!*flg) {
1327     PetscFunctionReturn(0);
1328   }
1329   /* if a->a are the same */
1330   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1331   PetscFunctionReturn(0);
1332 }
1333 
1334 #undef __FUNCT__
1335 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1336 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1337 {
1338   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1339   PetscErrorCode ierr;
1340   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1341   PetscScalar    *x,zero = 0.0;
1342   MatScalar      *aa,*aa_j;
1343 
1344   PetscFunctionBegin;
1345   bs   = A->bs;
1346   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1347 
1348   aa   = a->a;
1349   ai   = a->i;
1350   aj   = a->j;
1351   ambs = a->mbs;
1352   bs2  = a->bs2;
1353 
1354   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1355   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1356   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1357   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1358   for (i=0; i<ambs; i++) {
1359     j=ai[i];
1360     if (aj[j] == i) {             /* if this is a diagonal element */
1361       row  = i*bs;
1362       aa_j = aa + j*bs2;
1363       if (A->factor && bs==1){
1364         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1365       } else {
1366         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1367       }
1368     }
1369   }
1370 
1371   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 #undef __FUNCT__
1376 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1377 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1378 {
1379   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1380   PetscScalar    *l,x,*li,*ri;
1381   MatScalar      *aa,*v;
1382   PetscErrorCode ierr;
1383   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1384   PetscTruth     flg;
1385 
1386   PetscFunctionBegin;
1387   if (ll != rr){
1388     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1389     if (!flg)
1390       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1391   }
1392   if (!ll) PetscFunctionReturn(0);
1393   ai  = a->i;
1394   aj  = a->j;
1395   aa  = a->a;
1396   m   = A->m;
1397   bs  = A->bs;
1398   mbs = a->mbs;
1399   bs2 = a->bs2;
1400 
1401   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1402   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1403   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1404   for (i=0; i<mbs; i++) { /* for each block row */
1405     M  = ai[i+1] - ai[i];
1406     li = l + i*bs;
1407     v  = aa + bs2*ai[i];
1408     for (j=0; j<M; j++) { /* for each block */
1409       ri = l + bs*aj[ai[i]+j];
1410       for (k=0; k<bs; k++) {
1411         x = ri[k];
1412         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1413       }
1414     }
1415   }
1416   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1417   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 #undef __FUNCT__
1422 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1423 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1424 {
1425   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1426 
1427   PetscFunctionBegin;
1428   info->rows_global    = (double)A->m;
1429   info->columns_global = (double)A->m;
1430   info->rows_local     = (double)A->m;
1431   info->columns_local  = (double)A->m;
1432   info->block_size     = a->bs2;
1433   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1434   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1435   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1436   info->assemblies   = A->num_ass;
1437   info->mallocs      = a->reallocs;
1438   info->memory       = A->mem;
1439   if (A->factor) {
1440     info->fill_ratio_given  = A->info.fill_ratio_given;
1441     info->fill_ratio_needed = A->info.fill_ratio_needed;
1442     info->factor_mallocs    = A->info.factor_mallocs;
1443   } else {
1444     info->fill_ratio_given  = 0;
1445     info->fill_ratio_needed = 0;
1446     info->factor_mallocs    = 0;
1447   }
1448   PetscFunctionReturn(0);
1449 }
1450 
1451 
1452 #undef __FUNCT__
1453 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1454 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1455 {
1456   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1457   PetscErrorCode ierr;
1458 
1459   PetscFunctionBegin;
1460   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1461   PetscFunctionReturn(0);
1462 }
1463 
1464 #undef __FUNCT__
1465 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1466 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1467 {
1468   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1469   PetscErrorCode ierr;
1470   PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1471   PetscReal    atmp;
1472   MatScalar    *aa;
1473   PetscScalar  zero = 0.0,*x;
1474   PetscInt          ncols,brow,bcol,krow,kcol;
1475 
1476   PetscFunctionBegin;
1477   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1478   bs   = A->bs;
1479   aa   = a->a;
1480   ai   = a->i;
1481   aj   = a->j;
1482   mbs = a->mbs;
1483 
1484   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1485   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1486   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1487   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1488   for (i=0; i<mbs; i++) {
1489     ncols = ai[1] - ai[0]; ai++;
1490     brow  = bs*i;
1491     for (j=0; j<ncols; j++){
1492       bcol = bs*(*aj);
1493       for (kcol=0; kcol<bs; kcol++){
1494         col = bcol + kcol;      /* col index */
1495         for (krow=0; krow<bs; krow++){
1496           atmp = PetscAbsScalar(*aa); aa++;
1497           row = brow + krow;    /* row index */
1498           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1499           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1500         }
1501       }
1502       aj++;
1503     }
1504   }
1505   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1506   PetscFunctionReturn(0);
1507 }
1508