xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 72ce74bd30d5d09d1371a3eb387704f4af395d1a)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/inline/spops.h"
4 #include "src/inline/ilu.h"
5 #include "petscbt.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
11 {
12   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
13   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,bcol_max,
14                start,end,*ai,*aj,bs,*nidx2;
15   PetscBT      table;
16   PetscBT      table0;
17 
18   PetscFunctionBegin;
19   mbs = a->mbs;
20   ai  = a->i;
21   aj  = a->j;
22   bs  = a->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
29   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) { /* for each is */
32     isz  = 0;
33     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
40     bcol_max = 0;
41     for (j=0; j<n ; ++j){
42       brow = idx[j]/bs; /* convert the indices into block indices */
43       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,brow)) {
45         nidx[isz++] = brow;
46         if (bcol_max < brow) bcol_max = brow;
47       }
48     }
49     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
50     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
51 
52     k = 0;
53     for (j=0; j<ov; j++){ /* for each overlap */
54       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
55       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
56       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
57 
58       n = isz;  /* length of the updated is[i] */
59       for (brow=0; brow<mbs; brow++){
60         start = ai[brow]; end   = ai[brow+1];
61         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
62           for (l = start; l<end ; l++){
63             bcol = aj[l];
64             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l<end ; l++){
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table0,bcol)){
73               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j=0; j<isz; j++) {
83       for (k=0; k<bs; k++)
84         nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
100   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
101   int          row,mat_i,*mat_j,tcol,*mat_ilen;
102   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
103   int          *aj = a->j,*ai = a->i;
104   MatScalar    *mat_a;
105   Mat          C;
106   PetscTruth   flag;
107 
108   PetscFunctionBegin;
109 
110   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
112   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
120   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127       for (k=kstart; k<kend; k++) {
128         if (ssmap[aj[k]]) {
129           lens[i]++;
130         }
131       }
132     }
133   /* Create and fill new matrix */
134   if (scall == MAT_REUSE_MATRIX) {
135     c = (Mat_SeqSBAIJ *)((*B)->data);
136 
137     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
138     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
139     if (flag == PETSC_FALSE) {
140       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
141     }
142     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
143     C = *B;
144   } else {
145     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
146     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
184   IS          is1;
185   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
196   iary = vary + a->mbs;
197   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206 
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = PetscFree(vary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
211   ISDestroy(is1);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr,i;
220 
221   PetscFunctionBegin;
222   if (scall == MAT_INITIAL_MATRIX) {
223     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
224   }
225 
226   for (i=0; i<n; i++) {
227     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
228   }
229   PetscFunctionReturn(0);
230 }
231 
232 /* -------------------------------------------------------*/
233 /* Should check that shapes of vectors and matrices match */
234 /* -------------------------------------------------------*/
235 #include "petscblaslapack.h"
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
239 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
240 {
241   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
242   PetscScalar     *x,*z,*xb,x1,zero=0.0;
243   MatScalar       *v;
244   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
245 
246   PetscFunctionBegin;
247   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
250 
251   v  = a->a;
252   xb = x;
253 
254   for (i=0; i<mbs; i++) {
255     n  = ai[1] - ai[0];  /* length of i_th row of A */
256     x1 = xb[0];
257     ib = aj + *ai;
258     jmin = 0;
259     if (*ib == i) {      /* (diag of A)*x */
260       z[i] += *v++ * x[*ib++];
261       jmin++;
262     }
263     for (j=jmin; j<n; j++) {
264       cval    = *ib;
265       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
266       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
267     }
268     xb++; ai++;
269   }
270 
271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
279 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
282   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
283   MatScalar        *v;
284   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
285 
286 
287   PetscFunctionBegin;
288   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
291 
292   v     = a->a;
293   xb = x;
294 
295   for (i=0; i<mbs; i++) {
296     n  = ai[1] - ai[0]; /* length of i_th block row of A */
297     x1 = xb[0]; x2 = xb[1];
298     ib = aj + *ai;
299     jmin = 0;
300     if (*ib == i){     /* (diag of A)*x */
301       z[2*i]   += v[0]*x1 + v[2]*x2;
302       z[2*i+1] += v[2]*x1 + v[3]*x2;
303       v += 4; jmin++;
304     }
305     for (j=jmin; j<n; j++) {
306       /* (strict lower triangular part of A)*x  */
307       cval       = ib[j]*2;
308       z[cval]     += v[0]*x1 + v[1]*x2;
309       z[cval+1]   += v[2]*x1 + v[3]*x2;
310       /* (strict upper triangular part of A)*x  */
311       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
312       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
313       v  += 4;
314     }
315     xb +=2; ai++;
316   }
317 
318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
320   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
321   PetscFunctionReturn(0);
322 }
323 
324 #undef __FUNCT__
325 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
326 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
327 {
328   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
329   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
330   MatScalar     *v;
331   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
332 
333 
334   PetscFunctionBegin;
335   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
336   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
337   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
338 
339   v     = a->a;
340   xb = x;
341 
342   for (i=0; i<mbs; i++) {
343     n  = ai[1] - ai[0]; /* length of i_th block row of A */
344     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
345     ib = aj + *ai;
346     jmin = 0;
347     if (*ib == i){     /* (diag of A)*x */
348       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
349       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
350       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
351       v += 9; jmin++;
352     }
353     for (j=jmin; j<n; j++) {
354       /* (strict lower triangular part of A)*x  */
355       cval       = ib[j]*3;
356       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
357       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
358       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
359       /* (strict upper triangular part of A)*x  */
360       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
361       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
362       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
363       v  += 9;
364     }
365     xb +=3; ai++;
366   }
367 
368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
370   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
371   PetscFunctionReturn(0);
372 }
373 
374 #undef __FUNCT__
375 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
376 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
377 {
378   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
379   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
380   MatScalar        *v;
381   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
382 
383   PetscFunctionBegin;
384   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
387 
388   v     = a->a;
389   xb = x;
390 
391   for (i=0; i<mbs; i++) {
392     n  = ai[1] - ai[0]; /* length of i_th block row of A */
393     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
394     ib = aj + *ai;
395     jmin = 0;
396     if (*ib == i){     /* (diag of A)*x */
397       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
398       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
399       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
400       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
401       v += 16; jmin++;
402     }
403     for (j=jmin; j<n; j++) {
404       /* (strict lower triangular part of A)*x  */
405       cval       = ib[j]*4;
406       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
407       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
408       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
409       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
410       /* (strict upper triangular part of A)*x  */
411       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
412       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
413       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
414       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
415       v  += 16;
416     }
417     xb +=4; ai++;
418   }
419 
420   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
421   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
422   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
423   PetscFunctionReturn(0);
424 }
425 
426 #undef __FUNCT__
427 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
428 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
429 {
430   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
431   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
432   MatScalar       *v;
433   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
434 
435   PetscFunctionBegin;
436   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
437   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
438   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
439 
440   v     = a->a;
441   xb = x;
442 
443   for (i=0; i<mbs; i++) {
444     n  = ai[1] - ai[0]; /* length of i_th block row of A */
445     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
446     ib = aj + *ai;
447     jmin = 0;
448     if (*ib == i){      /* (diag of A)*x */
449       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
450       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
451       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
452       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
453       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
454       v += 25; jmin++;
455     }
456     for (j=jmin; j<n; j++) {
457       /* (strict lower triangular part of A)*x  */
458       cval       = ib[j]*5;
459       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
460       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
461       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
462       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
463       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
464       /* (strict upper triangular part of A)*x  */
465       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
466       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
467       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
468       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
469       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
470       v  += 25;
471     }
472     xb +=5; ai++;
473   }
474 
475   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
477   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
478   PetscFunctionReturn(0);
479 }
480 
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
484 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
485 {
486   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
487   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
488   MatScalar       *v;
489   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
490 
491   PetscFunctionBegin;
492   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
493   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
494   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
495 
496   v     = a->a;
497   xb = x;
498 
499   for (i=0; i<mbs; i++) {
500     n  = ai[1] - ai[0]; /* length of i_th block row of A */
501     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
502     ib = aj + *ai;
503     jmin = 0;
504     if (*ib == i){      /* (diag of A)*x */
505       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
506       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
507       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
508       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
509       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
510       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
511       v += 36; jmin++;
512     }
513     for (j=jmin; j<n; j++) {
514       /* (strict lower triangular part of A)*x  */
515       cval       = ib[j]*6;
516       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
517       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
518       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
519       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
520       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
521       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
522       /* (strict upper triangular part of A)*x  */
523       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
524       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
525       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
526       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
527       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
528       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
529       v  += 36;
530     }
531     xb +=6; ai++;
532   }
533 
534   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
536   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
537   PetscFunctionReturn(0);
538 }
539 #undef __FUNCT__
540 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
541 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
542 {
543   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
544   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
545   MatScalar       *v;
546   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
547 
548   PetscFunctionBegin;
549   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
550   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
551   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
552 
553   v     = a->a;
554   xb = x;
555 
556   for (i=0; i<mbs; i++) {
557     n  = ai[1] - ai[0]; /* length of i_th block row of A */
558     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
559     ib = aj + *ai;
560     jmin = 0;
561     if (*ib == i){      /* (diag of A)*x */
562       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
563       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
564       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
565       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
566       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
567       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
568       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569       v += 49; jmin++;
570     }
571     for (j=jmin; j<n; j++) {
572       /* (strict lower triangular part of A)*x  */
573       cval       = ib[j]*7;
574       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
575       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
576       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
577       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
578       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
579       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
580       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
581       /* (strict upper triangular part of A)*x  */
582       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
583       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
584       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
585       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
586       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
587       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
588       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
589       v  += 49;
590     }
591     xb +=7; ai++;
592   }
593   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
594   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
595   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
596   PetscFunctionReturn(0);
597 }
598 
599 /*
600     This will not work with MatScalar == float because it calls the BLAS
601 */
602 #undef __FUNCT__
603 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
604 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
605 {
606   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
607   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
608   MatScalar       *v;
609   PetscErrorCode ierr;
610   int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
611   int             ncols,k;
612 
613   PetscFunctionBegin;
614   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
615   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
616   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
617 
618   aj   = a->j;
619   v    = a->a;
620   ii   = a->i;
621 
622   if (!a->mult_work) {
623     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
624   }
625   work = a->mult_work;
626 
627   for (i=0; i<mbs; i++) {
628     n     = ii[1] - ii[0]; ncols = n*bs;
629     workt = work; idx=aj+ii[0];
630 
631     /* upper triangular part */
632     for (j=0; j<n; j++) {
633       xb = x_ptr + bs*(*idx++);
634       for (k=0; k<bs; k++) workt[k] = xb[k];
635       workt += bs;
636     }
637     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
638     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
639 
640     /* strict lower triangular part */
641     idx = aj+ii[0];
642     if (*idx == i){
643       ncols -= bs; v += bs2; idx++; n--;
644     }
645 
646     if (ncols > 0){
647       workt = work;
648       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
649       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
650       for (j=0; j<n; j++) {
651         zb = z_ptr + bs*(*idx++);
652         for (k=0; k<bs; k++) zb[k] += workt[k] ;
653         workt += bs;
654       }
655     }
656     x += bs; v += n*bs2; z += bs; ii++;
657   }
658 
659   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
661   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
662   PetscFunctionReturn(0);
663 }
664 
665 #undef __FUNCT__
666 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
667 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
668 {
669   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
670   PetscScalar     *x,*y,*z,*xb,x1;
671   MatScalar       *v;
672   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
673 
674   PetscFunctionBegin;
675   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
676   if (yy != xx) {
677     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
678   } else {
679     y = x;
680   }
681   if (zz != yy) {
682     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
683     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
684     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
685   } else {
686     z = y;
687   }
688 
689   v  = a->a;
690   xb = x;
691 
692   for (i=0; i<mbs; i++) {
693     n  = ai[1] - ai[0];  /* length of i_th row of A */
694     x1 = xb[0];
695     ib = aj + *ai;
696     jmin = 0;
697     if (*ib == i) {            /* (diag of A)*x */
698       z[i] += *v++ * x[*ib++]; jmin++;
699     }
700     for (j=jmin; j<n; j++) {
701       cval    = *ib;
702       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
703       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
704     }
705     xb++; ai++;
706   }
707 
708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
709   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
710   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711 
712   PetscLogFlops(2*(a->nz*2 - A->m));
713   PetscFunctionReturn(0);
714 }
715 
716 #undef __FUNCT__
717 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
718 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
719 {
720   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
721   PetscScalar     *x,*y,*z,*xb,x1,x2;
722   MatScalar       *v;
723   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
724 
725   PetscFunctionBegin;
726   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
727   if (yy != xx) {
728     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
729   } else {
730     y = x;
731   }
732   if (zz != yy) {
733     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
734     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
735     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
736   } else {
737     z = y;
738   }
739 
740   v     = a->a;
741   xb = x;
742 
743   for (i=0; i<mbs; i++) {
744     n  = ai[1] - ai[0]; /* length of i_th block row of A */
745     x1 = xb[0]; x2 = xb[1];
746     ib = aj + *ai;
747     jmin = 0;
748     if (*ib == i){      /* (diag of A)*x */
749       z[2*i]   += v[0]*x1 + v[2]*x2;
750       z[2*i+1] += v[2]*x1 + v[3]*x2;
751       v += 4; jmin++;
752     }
753     for (j=jmin; j<n; j++) {
754       /* (strict lower triangular part of A)*x  */
755       cval       = ib[j]*2;
756       z[cval]     += v[0]*x1 + v[1]*x2;
757       z[cval+1]   += v[2]*x1 + v[3]*x2;
758       /* (strict upper triangular part of A)*x  */
759       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
760       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
761       v  += 4;
762     }
763     xb +=2; ai++;
764   }
765 
766   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
767   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
769 
770   PetscLogFlops(4*(a->nz*2 - A->m));
771   PetscFunctionReturn(0);
772 }
773 
774 #undef __FUNCT__
775 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
776 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
777 {
778   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
779   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
780   MatScalar       *v;
781   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
782 
783   PetscFunctionBegin;
784   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
785   if (yy != xx) {
786     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
787   } else {
788     y = x;
789   }
790   if (zz != yy) {
791     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
792     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
793     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
794   } else {
795     z = y;
796   }
797 
798   v     = a->a;
799   xb = x;
800 
801   for (i=0; i<mbs; i++) {
802     n  = ai[1] - ai[0]; /* length of i_th block row of A */
803     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
804     ib = aj + *ai;
805     jmin = 0;
806     if (*ib == i){     /* (diag of A)*x */
807      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
808      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
809      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810      v += 9; jmin++;
811     }
812     for (j=jmin; j<n; j++) {
813       /* (strict lower triangular part of A)*x  */
814       cval       = ib[j]*3;
815       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
816       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
817       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
818       /* (strict upper triangular part of A)*x  */
819       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
820       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
821       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
822       v  += 9;
823     }
824     xb +=3; ai++;
825   }
826 
827   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
828   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
829   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
830 
831   PetscLogFlops(18*(a->nz*2 - A->m));
832   PetscFunctionReturn(0);
833 }
834 
835 #undef __FUNCT__
836 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
837 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
838 {
839   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
840   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
841   MatScalar       *v;
842   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
843 
844   PetscFunctionBegin;
845   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
846   if (yy != xx) {
847     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
848   } else {
849     y = x;
850   }
851   if (zz != yy) {
852     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
853     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
854     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
855   } else {
856     z = y;
857   }
858 
859   v     = a->a;
860   xb = x;
861 
862   for (i=0; i<mbs; i++) {
863     n  = ai[1] - ai[0]; /* length of i_th block row of A */
864     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
865     ib = aj + *ai;
866     jmin = 0;
867     if (*ib == i){      /* (diag of A)*x */
868       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
869       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
870       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
871       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
872       v += 16; jmin++;
873     }
874     for (j=jmin; j<n; j++) {
875       /* (strict lower triangular part of A)*x  */
876       cval       = ib[j]*4;
877       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
878       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
879       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
880       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
881       /* (strict upper triangular part of A)*x  */
882       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
883       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
884       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
885       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
886       v  += 16;
887     }
888     xb +=4; ai++;
889   }
890 
891   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
892   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
893   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
894 
895   PetscLogFlops(32*(a->nz*2 - A->m));
896   PetscFunctionReturn(0);
897 }
898 
899 #undef __FUNCT__
900 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
901 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
904   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
905   MatScalar       *v;
906   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
907 
908   PetscFunctionBegin;
909   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
910   if (yy != xx) {
911     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
912   } else {
913     y = x;
914   }
915   if (zz != yy) {
916     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
917     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
918     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
919   } else {
920     z = y;
921   }
922 
923   v     = a->a;
924   xb = x;
925 
926   for (i=0; i<mbs; i++) {
927     n  = ai[1] - ai[0]; /* length of i_th block row of A */
928     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
929     ib = aj + *ai;
930     jmin = 0;
931     if (*ib == i){      /* (diag of A)*x */
932       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
933       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
934       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
935       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
936       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
937       v += 25; jmin++;
938     }
939     for (j=jmin; j<n; j++) {
940       /* (strict lower triangular part of A)*x  */
941       cval       = ib[j]*5;
942       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
943       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
944       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
945       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
946       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
947       /* (strict upper triangular part of A)*x  */
948       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
949       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
950       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
951       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
952       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
953       v  += 25;
954     }
955     xb +=5; ai++;
956   }
957 
958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
959   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
960   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
961 
962   PetscLogFlops(50*(a->nz*2 - A->m));
963   PetscFunctionReturn(0);
964 }
965 #undef __FUNCT__
966 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
967 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
968 {
969   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
970   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
971   MatScalar       *v;
972   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
973 
974   PetscFunctionBegin;
975   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
976   if (yy != xx) {
977     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
978   } else {
979     y = x;
980   }
981   if (zz != yy) {
982     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
983     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
984     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
985   } else {
986     z = y;
987   }
988 
989   v     = a->a;
990   xb = x;
991 
992   for (i=0; i<mbs; i++) {
993     n  = ai[1] - ai[0]; /* length of i_th block row of A */
994     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
995     ib = aj + *ai;
996     jmin = 0;
997     if (*ib == i){     /* (diag of A)*x */
998       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
999       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1000       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1001       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1002       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1003       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1004       v += 36; jmin++;
1005     }
1006     for (j=jmin; j<n; j++) {
1007       /* (strict lower triangular part of A)*x  */
1008       cval       = ib[j]*6;
1009       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1010       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1011       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1012       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1013       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1014       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015       /* (strict upper triangular part of A)*x  */
1016       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1017       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1018       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1019       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1020       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1021       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1022       v  += 36;
1023     }
1024     xb +=6; ai++;
1025   }
1026 
1027   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1028   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1029   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1030 
1031   PetscLogFlops(72*(a->nz*2 - A->m));
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 #undef __FUNCT__
1036 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1037 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1038 {
1039   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1040   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1041   MatScalar       *v;
1042   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1043 
1044   PetscFunctionBegin;
1045   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1046   if (yy != xx) {
1047     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1048   } else {
1049     y = x;
1050   }
1051   if (zz != yy) {
1052     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1053     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1054     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1055   } else {
1056     z = y;
1057   }
1058 
1059   v     = a->a;
1060   xb = x;
1061 
1062   for (i=0; i<mbs; i++) {
1063     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1064     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1065     ib = aj + *ai;
1066     jmin = 0;
1067     if (*ib == i){     /* (diag of A)*x */
1068       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1069       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1070       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1071       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1072       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1073       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1074       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1075       v += 49; jmin++;
1076     }
1077     for (j=jmin; j<n; j++) {
1078       /* (strict lower triangular part of A)*x  */
1079       cval       = ib[j]*7;
1080       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1081       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1082       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1083       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1084       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1085       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1086       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1087       /* (strict upper triangular part of A)*x  */
1088       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1089       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1090       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1091       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1092       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1093       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1094       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1095       v  += 49;
1096     }
1097     xb +=7; ai++;
1098   }
1099 
1100   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1101   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1102   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1103 
1104   PetscLogFlops(98*(a->nz*2 - A->m));
1105   PetscFunctionReturn(0);
1106 }
1107 
1108 #undef __FUNCT__
1109 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1110 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1111 {
1112   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1113   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1114   MatScalar       *v;
1115   PetscErrorCode ierr;
1116   int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1117   int             ncols,k;
1118 
1119   PetscFunctionBegin;
1120   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1121   if (yy != xx) {
1122     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1123   } else {
1124     y = x;
1125   }
1126   if (zz != yy) {
1127     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1128     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1129     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1130   } else {
1131     z = y;
1132   }
1133 
1134   aj   = a->j;
1135   v    = a->a;
1136   ii   = a->i;
1137 
1138   if (!a->mult_work) {
1139     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1140   }
1141   work = a->mult_work;
1142 
1143 
1144   for (i=0; i<mbs; i++) {
1145     n     = ii[1] - ii[0]; ncols = n*bs;
1146     workt = work; idx=aj+ii[0];
1147 
1148     /* upper triangular part */
1149     for (j=0; j<n; j++) {
1150       xb = x_ptr + bs*(*idx++);
1151       for (k=0; k<bs; k++) workt[k] = xb[k];
1152       workt += bs;
1153     }
1154     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1155     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1156 
1157     /* strict lower triangular part */
1158     idx = aj+ii[0];
1159     if (*idx == i){
1160       ncols -= bs; v += bs2; idx++; n--;
1161     }
1162     if (ncols > 0){
1163       workt = work;
1164       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1165       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1166       for (j=0; j<n; j++) {
1167         zb = z_ptr + bs*(*idx++);
1168         /* idx++; */
1169         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1170         workt += bs;
1171       }
1172     }
1173 
1174     x += bs; v += n*bs2; z += bs; ii++;
1175   }
1176 
1177   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1178   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1179   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1180 
1181   PetscLogFlops(2*(a->nz*2 - A->m));
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1187 PetscErrorCode MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1188 {
1189   PetscErrorCode ierr;
1190 
1191   PetscFunctionBegin;
1192   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 #undef __FUNCT__
1197 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1198 PetscErrorCode MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1199 
1200 {
1201   PetscErrorCode ierr;
1202 
1203   PetscFunctionBegin;
1204   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1205   PetscFunctionReturn(0);
1206 }
1207 
1208 #undef __FUNCT__
1209 #define __FUNCT__ "MatScale_SeqSBAIJ"
1210 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1211 {
1212   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1213   int         one = 1,totalnz = a->bs2*a->nz;
1214 
1215   PetscFunctionBegin;
1216   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1217   PetscLogFlops(totalnz);
1218   PetscFunctionReturn(0);
1219 }
1220 
1221 #undef __FUNCT__
1222 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1223 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1224 {
1225   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1226   MatScalar   *v = a->a;
1227   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1228   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1229   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1230 
1231   PetscFunctionBegin;
1232   if (type == NORM_FROBENIUS) {
1233     for (k=0; k<mbs; k++){
1234       jmin = a->i[k]; jmax = a->i[k+1];
1235       col  = aj + jmin;
1236       if (*col == k){         /* diagonal block */
1237         for (i=0; i<bs2; i++){
1238 #if defined(PETSC_USE_COMPLEX)
1239           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1240 #else
1241           sum_diag += (*v)*(*v); v++;
1242 #endif
1243         }
1244         jmin++;
1245       }
1246       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1247         for (i=0; i<bs2; i++){
1248 #if defined(PETSC_USE_COMPLEX)
1249           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1250 #else
1251           sum_off += (*v)*(*v); v++;
1252 #endif
1253         }
1254       }
1255     }
1256     *norm = sqrt(sum_diag + 2*sum_off);
1257 
1258   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1259     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1260     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1261     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1262     for (i=0; i<mbs; i++) {
1263       jl[i] = mbs; il[0] = 0;
1264     }
1265 
1266     *norm = 0.0;
1267     for (k=0; k<mbs; k++) { /* k_th block row */
1268       for (j=0; j<bs; j++) sum[j]=0.0;
1269 
1270       /*-- col sum --*/
1271       i = jl[k]; /* first |A(i,k)| to be added */
1272       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1273                   at step k */
1274       while (i<mbs){
1275         nexti = jl[i];  /* next block row to be added */
1276         ik    = il[i];  /* block index of A(i,k) in the array a */
1277         for (j=0; j<bs; j++){
1278           v = a->a + ik*bs2 + j*bs;
1279           for (k1=0; k1<bs; k1++) {
1280             sum[j] += PetscAbsScalar(*v); v++;
1281           }
1282         }
1283         /* update il, jl */
1284         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1285         jmax = a->i[i+1];
1286         if (jmin < jmax){
1287           il[i] = jmin;
1288           j   = a->j[jmin];
1289           jl[i] = jl[j]; jl[j]=i;
1290         }
1291         i = nexti;
1292       }
1293 
1294       /*-- row sum --*/
1295       jmin = a->i[k]; jmax = a->i[k+1];
1296       for (i=jmin; i<jmax; i++) {
1297         for (j=0; j<bs; j++){
1298           v = a->a + i*bs2 + j;
1299           for (k1=0; k1<bs; k1++){
1300             sum[j] += PetscAbsScalar(*v);
1301             v   += bs;
1302           }
1303         }
1304       }
1305       /* add k_th block row to il, jl */
1306       col = aj+jmin;
1307       if (*col == k) jmin++;
1308       if (jmin < jmax){
1309         il[k] = jmin;
1310         j   = a->j[jmin];
1311         jl[k] = jl[j]; jl[j] = k;
1312       }
1313       for (j=0; j<bs; j++){
1314         if (sum[j] > *norm) *norm = sum[j];
1315       }
1316     }
1317     ierr = PetscFree(il);CHKERRQ(ierr);
1318     ierr = PetscFree(jl);CHKERRQ(ierr);
1319     ierr = PetscFree(sum);CHKERRQ(ierr);
1320   } else {
1321     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1322   }
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 #undef __FUNCT__
1327 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1328 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1329 {
1330   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1331   PetscErrorCode ierr;
1332 
1333   PetscFunctionBegin;
1334 
1335   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1336   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1337     *flg = PETSC_FALSE;
1338     PetscFunctionReturn(0);
1339   }
1340 
1341   /* if the a->i are the same */
1342   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1343   if (*flg == PETSC_FALSE) {
1344     PetscFunctionReturn(0);
1345   }
1346 
1347   /* if a->j are the same */
1348   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1349   if (*flg == PETSC_FALSE) {
1350     PetscFunctionReturn(0);
1351   }
1352   /* if a->a are the same */
1353   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1354 
1355   PetscFunctionReturn(0);
1356 }
1357 
1358 #undef __FUNCT__
1359 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1360 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1361 {
1362   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1363   PetscErrorCode ierr;
1364   int i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1365   PetscScalar  *x,zero = 0.0;
1366   MatScalar    *aa,*aa_j;
1367 
1368   PetscFunctionBegin;
1369   bs   = a->bs;
1370   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1371 
1372   aa   = a->a;
1373   ai   = a->i;
1374   aj   = a->j;
1375   ambs = a->mbs;
1376   bs2  = a->bs2;
1377 
1378   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1379   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1380   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1381   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1382   for (i=0; i<ambs; i++) {
1383     j=ai[i];
1384     if (aj[j] == i) {             /* if this is a diagonal element */
1385       row  = i*bs;
1386       aa_j = aa + j*bs2;
1387       if (A->factor && bs==1){
1388         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1389       } else {
1390         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1391       }
1392     }
1393   }
1394 
1395   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1396   PetscFunctionReturn(0);
1397 }
1398 
1399 #undef __FUNCT__
1400 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1401 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1402 {
1403   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1404   PetscScalar  *l,*r,x,*li,*ri;
1405   MatScalar    *aa,*v;
1406   PetscErrorCode ierr;
1407   int i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1408 
1409   PetscFunctionBegin;
1410   ai  = a->i;
1411   aj  = a->j;
1412   aa  = a->a;
1413   m   = A->m;
1414   bs  = a->bs;
1415   mbs = a->mbs;
1416   bs2 = a->bs2;
1417 
1418   if (ll != rr) {
1419     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1420   }
1421   if (ll) {
1422     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1423     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1424     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1425     for (i=0; i<mbs; i++) { /* for each block row */
1426       M  = ai[i+1] - ai[i];
1427       li = l + i*bs;
1428       v  = aa + bs2*ai[i];
1429       for (j=0; j<M; j++) { /* for each block */
1430         for (k=0; k<bs2; k++) {
1431           (*v++) *= li[k%bs];
1432         }
1433 #ifdef CONT
1434         /* will be used to replace the above loop */
1435         ri = l + bs*aj[ai[i]+j];
1436         for (k=0; k<bs; k++) { /* column value */
1437           x = ri[k];
1438           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1439         }
1440 #endif
1441 
1442       }
1443     }
1444     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1445     PetscLogFlops(2*a->nz);
1446   }
1447   /* will be deleted */
1448   if (rr) {
1449     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1450     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1451     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1452     for (i=0; i<mbs; i++) { /* for each block row */
1453       M  = ai[i+1] - ai[i];
1454       v  = aa + bs2*ai[i];
1455       for (j=0; j<M; j++) { /* for each block */
1456         ri = r + bs*aj[ai[i]+j];
1457         for (k=0; k<bs; k++) {
1458           x = ri[k];
1459           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1460         }
1461       }
1462     }
1463     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1464     PetscLogFlops(a->nz);
1465   }
1466   PetscFunctionReturn(0);
1467 }
1468 
1469 #undef __FUNCT__
1470 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1471 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1472 {
1473   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1474 
1475   PetscFunctionBegin;
1476   info->rows_global    = (double)A->m;
1477   info->columns_global = (double)A->m;
1478   info->rows_local     = (double)A->m;
1479   info->columns_local  = (double)A->m;
1480   info->block_size     = a->bs2;
1481   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1482   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1483   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1484   info->assemblies   = A->num_ass;
1485   info->mallocs      = a->reallocs;
1486   info->memory       = A->mem;
1487   if (A->factor) {
1488     info->fill_ratio_given  = A->info.fill_ratio_given;
1489     info->fill_ratio_needed = A->info.fill_ratio_needed;
1490     info->factor_mallocs    = A->info.factor_mallocs;
1491   } else {
1492     info->fill_ratio_given  = 0;
1493     info->fill_ratio_needed = 0;
1494     info->factor_mallocs    = 0;
1495   }
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 
1500 #undef __FUNCT__
1501 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1502 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1503 {
1504   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1505   PetscErrorCode ierr;
1506 
1507   PetscFunctionBegin;
1508   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1509   PetscFunctionReturn(0);
1510 }
1511 
1512 #undef __FUNCT__
1513 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1514 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1515 {
1516   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1517   PetscErrorCode ierr;
1518   int i,j,n,row,col,bs,*ai,*aj,mbs;
1519   PetscReal    atmp;
1520   MatScalar    *aa;
1521   PetscScalar  zero = 0.0,*x;
1522   int          ncols,brow,bcol,krow,kcol;
1523 
1524   PetscFunctionBegin;
1525   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1526   bs   = a->bs;
1527   aa   = a->a;
1528   ai   = a->i;
1529   aj   = a->j;
1530   mbs = a->mbs;
1531 
1532   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1533   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1534   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1535   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1536   for (i=0; i<mbs; i++) {
1537     ncols = ai[1] - ai[0]; ai++;
1538     brow  = bs*i;
1539     for (j=0; j<ncols; j++){
1540       bcol = bs*(*aj);
1541       for (kcol=0; kcol<bs; kcol++){
1542         col = bcol + kcol;      /* col index */
1543         for (krow=0; krow<bs; krow++){
1544           atmp = PetscAbsScalar(*aa); aa++;
1545           row = brow + krow;    /* row index */
1546           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1547           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1548           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1549         }
1550       }
1551       aj++;
1552     }
1553   }
1554   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1555   PetscFunctionReturn(0);
1556 }
1557