xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision afebec48acb8b1040c5bfa54d0857ffcc0c220e6)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
100   PetscErrorCode  ierr;
101   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
103   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104   const PetscInt  *irow,*aj = a->j,*ai = a->i;
105   MatScalar       *mat_a;
106   Mat             C;
107   PetscBool       flag,sorted;
108 
109   PetscFunctionBegin;
110   ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126     for (k=kstart; k<kend; k++) {
127       if (ssmap[aj[k]]) lens[i]++;
128     }
129   }
130   /* Create and fill new matrix */
131   if (scall == MAT_REUSE_MATRIX) {
132     c = (Mat_SeqSBAIJ*)((*B)->data);
133 
134     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
135     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
137     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
138     C    = *B;
139   } else {
140     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
141     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
142     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
143     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
144   }
145   c = (Mat_SeqSBAIJ*)(C->data);
146   for (i=0; i<nrows; i++) {
147     row      = irow[i];
148     kstart   = ai[row];
149     kend     = kstart + a->ilen[row];
150     mat_i    = c->i[i];
151     mat_j    = c->j + mat_i;
152     mat_a    = c->a + mat_i*bs2;
153     mat_ilen = c->ilen + i;
154     for (k=kstart; k<kend; k++) {
155       if ((tcol=ssmap[a->j[k]])) {
156         *mat_j++ = tcol - 1;
157         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
158         mat_a   += bs2;
159         (*mat_ilen)++;
160       }
161     }
162   }
163 
164   /* Free work space */
165   ierr = PetscFree(smap);CHKERRQ(ierr);
166   ierr = PetscFree(lens);CHKERRQ(ierr);
167   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169 
170   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
171   *B   = C;
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
177 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
178 {
179   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
180   IS             is1;
181   PetscErrorCode ierr;
182   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
183   const PetscInt *irow;
184 
185   PetscFunctionBegin;
186   if (isrow != iscol) {
187     PetscBool isequal;
188     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
189     if (!isequal) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
190   }
191 
192   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
194 
195   /* Verify if the indices corespond to each element in a block
196    and form the IS with compressed IS */
197   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
198   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
208   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr);
211   ierr = ISDestroy(&is1);CHKERRQ(ierr);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221   PetscBool      flg;
222 
223   PetscFunctionBegin;
224   ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr);
225   for (i=0; i<n; i++) {
226     ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr);
227     if (!flg) {
228       Mat Bseq = *B[i];
229       /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
230          Zero some ops' to avoid invalid usse */
231       ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
232       ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
233       Bseq->ops->mult                   = 0;
234       Bseq->ops->multadd                = 0;
235       Bseq->ops->multtranspose          = 0;
236       Bseq->ops->multtransposeadd       = 0;
237       Bseq->ops->lufactor               = 0;
238       Bseq->ops->choleskyfactor         = 0;
239       Bseq->ops->lufactorsymbolic       = 0;
240       Bseq->ops->choleskyfactorsymbolic = 0;
241       Bseq->ops->getinertia             = 0;
242     }
243   }
244   PetscFunctionReturn(0);
245 }
246 
247 /* -------------------------------------------------------*/
248 /* Should check that shapes of vectors and matrices match */
249 /* -------------------------------------------------------*/
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
253 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
254 {
255   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
256   PetscScalar       *z,x1,x2,zero=0.0;
257   const PetscScalar *x,*xb;
258   const MatScalar   *v;
259   PetscErrorCode    ierr;
260   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
261   const PetscInt    *aj=a->j,*ai=a->i,*ib;
262   PetscInt          nonzerorow=0;
263 
264   PetscFunctionBegin;
265   ierr = VecSet(zz,zero);CHKERRQ(ierr);
266   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
267   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
268 
269   v  = a->a;
270   xb = x;
271 
272   for (i=0; i<mbs; i++) {
273     n           = ai[1] - ai[0]; /* length of i_th block row of A */
274     x1          = xb[0]; x2 = xb[1];
275     ib          = aj + *ai;
276     jmin        = 0;
277     nonzerorow += (n>0);
278     if (*ib == i) {     /* (diag of A)*x */
279       z[2*i]   += v[0]*x1 + v[2]*x2;
280       z[2*i+1] += v[2]*x1 + v[3]*x2;
281       v        += 4; jmin++;
282     }
283     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
284     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
285     for (j=jmin; j<n; j++) {
286       /* (strict lower triangular part of A)*x  */
287       cval       = ib[j]*2;
288       z[cval]   += v[0]*x1 + v[1]*x2;
289       z[cval+1] += v[2]*x1 + v[3]*x2;
290       /* (strict upper triangular part of A)*x  */
291       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
292       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
293       v        += 4;
294     }
295     xb +=2; ai++;
296   }
297 
298   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
299   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
300   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
301   PetscFunctionReturn(0);
302 }
303 
304 #undef __FUNCT__
305 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
306 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
307 {
308   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
309   PetscScalar       *z,x1,x2,x3,zero=0.0;
310   const PetscScalar *x,*xb;
311   const MatScalar   *v;
312   PetscErrorCode    ierr;
313   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
314   const PetscInt    *aj = a->j,*ai = a->i,*ib;
315   PetscInt          nonzerorow=0;
316 
317   PetscFunctionBegin;
318   ierr = VecSet(zz,zero);CHKERRQ(ierr);
319   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
320   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
321 
322   v  = a->a;
323   xb = x;
324 
325   for (i=0; i<mbs; i++) {
326     n           = ai[1] - ai[0]; /* length of i_th block row of A */
327     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
328     ib          = aj + *ai;
329     jmin        = 0;
330     nonzerorow += (n>0);
331     if (*ib == i) {     /* (diag of A)*x */
332       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
333       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
334       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
335       v        += 9; jmin++;
336     }
337     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
338     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
339     for (j=jmin; j<n; j++) {
340       /* (strict lower triangular part of A)*x  */
341       cval       = ib[j]*3;
342       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
343       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
344       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
345       /* (strict upper triangular part of A)*x  */
346       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
347       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
348       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
349       v        += 9;
350     }
351     xb +=3; ai++;
352   }
353 
354   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
355   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
356   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
357   PetscFunctionReturn(0);
358 }
359 
360 #undef __FUNCT__
361 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
362 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
363 {
364   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
365   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
366   const PetscScalar *x,*xb;
367   const MatScalar   *v;
368   PetscErrorCode    ierr;
369   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
370   const PetscInt    *aj = a->j,*ai = a->i,*ib;
371   PetscInt          nonzerorow = 0;
372 
373   PetscFunctionBegin;
374   ierr = VecSet(zz,zero);CHKERRQ(ierr);
375   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
376   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
377 
378   v  = a->a;
379   xb = x;
380 
381   for (i=0; i<mbs; i++) {
382     n           = ai[1] - ai[0]; /* length of i_th block row of A */
383     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
384     ib          = aj + *ai;
385     jmin        = 0;
386     nonzerorow += (n>0);
387     if (*ib == i) {     /* (diag of A)*x */
388       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
389       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
390       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
391       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
392       v        += 16; jmin++;
393     }
394     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
395     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
396     for (j=jmin; j<n; j++) {
397       /* (strict lower triangular part of A)*x  */
398       cval       = ib[j]*4;
399       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
400       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
401       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
402       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
403       /* (strict upper triangular part of A)*x  */
404       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
405       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
406       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
407       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
408       v        += 16;
409     }
410     xb +=4; ai++;
411   }
412 
413   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
415   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 #undef __FUNCT__
420 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
421 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
422 {
423   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
424   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
425   const PetscScalar *x,*xb;
426   const MatScalar   *v;
427   PetscErrorCode    ierr;
428   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
429   const PetscInt    *aj = a->j,*ai = a->i,*ib;
430   PetscInt          nonzerorow=0;
431 
432   PetscFunctionBegin;
433   ierr = VecSet(zz,zero);CHKERRQ(ierr);
434   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
435   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
436 
437   v  = a->a;
438   xb = x;
439 
440   for (i=0; i<mbs; i++) {
441     n           = ai[1] - ai[0]; /* length of i_th block row of A */
442     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
443     ib          = aj + *ai;
444     jmin        = 0;
445     nonzerorow += (n>0);
446     if (*ib == i) {      /* (diag of A)*x */
447       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
448       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
449       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
450       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
451       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
452       v        += 25; jmin++;
453     }
454     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
455     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
456     for (j=jmin; j<n; j++) {
457       /* (strict lower triangular part of A)*x  */
458       cval       = ib[j]*5;
459       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
460       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
461       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
462       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
463       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
464       /* (strict upper triangular part of A)*x  */
465       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
466       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
467       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
468       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
469       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
470       v        += 25;
471     }
472     xb +=5; ai++;
473   }
474 
475   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
477   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
484 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
485 {
486   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
487   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
488   const PetscScalar *x,*xb;
489   const MatScalar   *v;
490   PetscErrorCode    ierr;
491   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
492   const PetscInt    *aj=a->j,*ai=a->i,*ib;
493   PetscInt          nonzerorow=0;
494 
495   PetscFunctionBegin;
496   ierr = VecSet(zz,zero);CHKERRQ(ierr);
497   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
499 
500   v  = a->a;
501   xb = x;
502 
503   for (i=0; i<mbs; i++) {
504     n           = ai[1] - ai[0]; /* length of i_th block row of A */
505     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
506     ib          = aj + *ai;
507     jmin        = 0;
508     nonzerorow += (n>0);
509     if (*ib == i) {      /* (diag of A)*x */
510       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
511       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
512       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
513       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
514       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
515       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
516       v        += 36; jmin++;
517     }
518     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520     for (j=jmin; j<n; j++) {
521       /* (strict lower triangular part of A)*x  */
522       cval       = ib[j]*6;
523       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
524       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
525       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
526       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
527       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
528       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
529       /* (strict upper triangular part of A)*x  */
530       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
531       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
532       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
533       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
534       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
535       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
536       v        += 36;
537     }
538     xb +=6; ai++;
539   }
540 
541   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
542   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
543   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
544   PetscFunctionReturn(0);
545 }
546 #undef __FUNCT__
547 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
548 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
549 {
550   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
551   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
552   const PetscScalar *x,*xb;
553   const MatScalar   *v;
554   PetscErrorCode    ierr;
555   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
556   const PetscInt    *aj=a->j,*ai=a->i,*ib;
557   PetscInt          nonzerorow=0;
558 
559   PetscFunctionBegin;
560   ierr = VecSet(zz,zero);CHKERRQ(ierr);
561   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
562   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
563 
564   v  = a->a;
565   xb = x;
566 
567   for (i=0; i<mbs; i++) {
568     n           = ai[1] - ai[0]; /* length of i_th block row of A */
569     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
570     ib          = aj + *ai;
571     jmin        = 0;
572     nonzerorow += (n>0);
573     if (*ib == i) {      /* (diag of A)*x */
574       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
575       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
576       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
577       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
578       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
579       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
580       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
581       v        += 49; jmin++;
582     }
583     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
584     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585     for (j=jmin; j<n; j++) {
586       /* (strict lower triangular part of A)*x  */
587       cval       = ib[j]*7;
588       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
589       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
590       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
591       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
592       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
593       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
594       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
595       /* (strict upper triangular part of A)*x  */
596       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
597       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
598       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
599       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
600       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
601       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
602       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
603       v       += 49;
604     }
605     xb +=7; ai++;
606   }
607   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
608   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
609   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
610   PetscFunctionReturn(0);
611 }
612 
613 /*
614     This will not work with MatScalar == float because it calls the BLAS
615 */
616 #undef __FUNCT__
617 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
618 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
619 {
620   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
621   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
622   const PetscScalar *x,*x_ptr,*xb;
623   const MatScalar   *v;
624   PetscErrorCode    ierr;
625   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
626   const PetscInt    *idx,*aj,*ii;
627   PetscInt          nonzerorow=0;
628 
629   PetscFunctionBegin;
630   ierr = VecSet(zz,zero);CHKERRQ(ierr);
631   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
632   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
633 
634   aj = a->j;
635   v  = a->a;
636   ii = a->i;
637 
638   if (!a->mult_work) {
639     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
640   }
641   work = a->mult_work;
642 
643   for (i=0; i<mbs; i++) {
644     n           = ii[1] - ii[0]; ncols = n*bs;
645     workt       = work; idx=aj+ii[0];
646     nonzerorow += (n>0);
647 
648     /* upper triangular part */
649     for (j=0; j<n; j++) {
650       xb = x_ptr + bs*(*idx++);
651       for (k=0; k<bs; k++) workt[k] = xb[k];
652       workt += bs;
653     }
654     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
655     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
656 
657     /* strict lower triangular part */
658     idx = aj+ii[0];
659     if (*idx == i) {
660       ncols -= bs; v += bs2; idx++; n--;
661     }
662 
663     if (ncols > 0) {
664       workt = work;
665       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
666       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
667       for (j=0; j<n; j++) {
668         zb = z_ptr + bs*(*idx++);
669         for (k=0; k<bs; k++) zb[k] += workt[k];
670         workt += bs;
671       }
672     }
673     x += bs; v += n*bs2; z += bs; ii++;
674   }
675 
676   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
677   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
678   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
679   PetscFunctionReturn(0);
680 }
681 
682 #undef __FUNCT__
683 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
684 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
685 {
686   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
687   PetscScalar       *z,x1;
688   const PetscScalar *x,*xb;
689   const MatScalar   *v;
690   PetscErrorCode    ierr;
691   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
692   const PetscInt    *aj=a->j,*ai=a->i,*ib;
693   PetscInt          nonzerorow=0;
694 
695   PetscFunctionBegin;
696   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
697   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
698   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
699   v    = a->a;
700   xb   = x;
701 
702   for (i=0; i<mbs; i++) {
703     n           = ai[1] - ai[0]; /* length of i_th row of A */
704     x1          = xb[0];
705     ib          = aj + *ai;
706     jmin        = 0;
707     nonzerorow += (n>0);
708     if (*ib == i) {            /* (diag of A)*x */
709       z[i] += *v++ * x[*ib++]; jmin++;
710     }
711     for (j=jmin; j<n; j++) {
712       cval    = *ib;
713       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
714       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
715     }
716     xb++; ai++;
717   }
718 
719   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
720   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
721 
722   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
723   PetscFunctionReturn(0);
724 }
725 
726 #undef __FUNCT__
727 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
728 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
729 {
730   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
731   PetscScalar       *z,x1,x2;
732   const PetscScalar *x,*xb;
733   const MatScalar   *v;
734   PetscErrorCode    ierr;
735   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
736   const PetscInt    *aj=a->j,*ai=a->i,*ib;
737   PetscInt          nonzerorow=0;
738 
739   PetscFunctionBegin;
740   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
741   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
742   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
743 
744   v  = a->a;
745   xb = x;
746 
747   for (i=0; i<mbs; i++) {
748     n           = ai[1] - ai[0]; /* length of i_th block row of A */
749     x1          = xb[0]; x2 = xb[1];
750     ib          = aj + *ai;
751     jmin        = 0;
752     nonzerorow += (n>0);
753     if (*ib == i) {      /* (diag of A)*x */
754       z[2*i]   += v[0]*x1 + v[2]*x2;
755       z[2*i+1] += v[2]*x1 + v[3]*x2;
756       v        += 4; jmin++;
757     }
758     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
759     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
760     for (j=jmin; j<n; j++) {
761       /* (strict lower triangular part of A)*x  */
762       cval       = ib[j]*2;
763       z[cval]   += v[0]*x1 + v[1]*x2;
764       z[cval+1] += v[2]*x1 + v[3]*x2;
765       /* (strict upper triangular part of A)*x  */
766       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
767       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
768       v        += 4;
769     }
770     xb +=2; ai++;
771   }
772   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
773   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
774 
775   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
776   PetscFunctionReturn(0);
777 }
778 
779 #undef __FUNCT__
780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
782 {
783   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
784   PetscScalar       *z,x1,x2,x3;
785   const PetscScalar *x,*xb;
786   const MatScalar   *v;
787   PetscErrorCode    ierr;
788   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
789   const PetscInt    *aj=a->j,*ai=a->i,*ib;
790   PetscInt          nonzerorow=0;
791 
792   PetscFunctionBegin;
793   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
794   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
795   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
796 
797   v  = a->a;
798   xb = x;
799 
800   for (i=0; i<mbs; i++) {
801     n           = ai[1] - ai[0]; /* length of i_th block row of A */
802     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
803     ib          = aj + *ai;
804     jmin        = 0;
805     nonzerorow += (n>0);
806     if (*ib == i) {     /* (diag of A)*x */
807       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
808       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
809       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810       v        += 9; jmin++;
811     }
812     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
813     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
814     for (j=jmin; j<n; j++) {
815       /* (strict lower triangular part of A)*x  */
816       cval       = ib[j]*3;
817       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
818       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
819       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
820       /* (strict upper triangular part of A)*x  */
821       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
822       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
823       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
824       v        += 9;
825     }
826     xb +=3; ai++;
827   }
828 
829   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
830   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
831 
832   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
833   PetscFunctionReturn(0);
834 }
835 
836 #undef __FUNCT__
837 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
838 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
839 {
840   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
841   PetscScalar       *z,x1,x2,x3,x4;
842   const PetscScalar *x,*xb;
843   const MatScalar   *v;
844   PetscErrorCode    ierr;
845   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
846   const PetscInt    *aj=a->j,*ai=a->i,*ib;
847   PetscInt          nonzerorow=0;
848 
849   PetscFunctionBegin;
850   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
851   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
852   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
853 
854   v  = a->a;
855   xb = x;
856 
857   for (i=0; i<mbs; i++) {
858     n           = ai[1] - ai[0]; /* length of i_th block row of A */
859     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
860     ib          = aj + *ai;
861     jmin        = 0;
862     nonzerorow += (n>0);
863     if (*ib == i) {      /* (diag of A)*x */
864       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
865       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
866       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
867       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
868       v        += 16; jmin++;
869     }
870     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
871     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
872     for (j=jmin; j<n; j++) {
873       /* (strict lower triangular part of A)*x  */
874       cval       = ib[j]*4;
875       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
876       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
877       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
878       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
879       /* (strict upper triangular part of A)*x  */
880       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
881       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
882       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
883       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
884       v        += 16;
885     }
886     xb +=4; ai++;
887   }
888 
889   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
890   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
891 
892   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
893   PetscFunctionReturn(0);
894 }
895 
896 #undef __FUNCT__
897 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
898 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
899 {
900   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
901   PetscScalar       *z,x1,x2,x3,x4,x5;
902   const PetscScalar *x,*xb;
903   const MatScalar   *v;
904   PetscErrorCode    ierr;
905   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
906   const PetscInt    *aj=a->j,*ai=a->i,*ib;
907   PetscInt          nonzerorow=0;
908 
909   PetscFunctionBegin;
910   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
911   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
912   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
913 
914   v  = a->a;
915   xb = x;
916 
917   for (i=0; i<mbs; i++) {
918     n           = ai[1] - ai[0]; /* length of i_th block row of A */
919     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
920     ib          = aj + *ai;
921     jmin        = 0;
922     nonzerorow += (n>0);
923     if (*ib == i) {      /* (diag of A)*x */
924       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
925       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
926       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
927       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
928       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
929       v        += 25; jmin++;
930     }
931     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
932     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
933     for (j=jmin; j<n; j++) {
934       /* (strict lower triangular part of A)*x  */
935       cval       = ib[j]*5;
936       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
937       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
938       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
939       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
940       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
941       /* (strict upper triangular part of A)*x  */
942       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
943       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
944       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
945       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
946       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
947       v        += 25;
948     }
949     xb +=5; ai++;
950   }
951 
952   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
953   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
954 
955   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 #undef __FUNCT__
959 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
960 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
961 {
962   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
963   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
964   const PetscScalar *x,*xb;
965   const MatScalar   *v;
966   PetscErrorCode    ierr;
967   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
968   const PetscInt    *aj=a->j,*ai=a->i,*ib;
969   PetscInt          nonzerorow=0;
970 
971   PetscFunctionBegin;
972   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
973   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
974   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
975 
976   v  = a->a;
977   xb = x;
978 
979   for (i=0; i<mbs; i++) {
980     n           = ai[1] - ai[0]; /* length of i_th block row of A */
981     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
982     ib          = aj + *ai;
983     jmin        = 0;
984     nonzerorow += (n>0);
985     if (*ib == i) {     /* (diag of A)*x */
986       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
987       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
988       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
989       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
990       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
991       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
992       v        += 36; jmin++;
993     }
994     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
995     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
996     for (j=jmin; j<n; j++) {
997       /* (strict lower triangular part of A)*x  */
998       cval       = ib[j]*6;
999       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1000       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1001       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1002       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1003       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1004       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1005       /* (strict upper triangular part of A)*x  */
1006       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1007       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1008       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1009       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1010       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1011       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1012       v        += 36;
1013     }
1014     xb +=6; ai++;
1015   }
1016 
1017   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1018   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1019 
1020   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 #undef __FUNCT__
1025 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1026 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1027 {
1028   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1029   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1030   const PetscScalar *x,*xb;
1031   const MatScalar   *v;
1032   PetscErrorCode    ierr;
1033   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1034   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1035   PetscInt          nonzerorow=0;
1036 
1037   PetscFunctionBegin;
1038   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1039   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1040   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1041 
1042   v  = a->a;
1043   xb = x;
1044 
1045   for (i=0; i<mbs; i++) {
1046     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1047     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1048     ib          = aj + *ai;
1049     jmin        = 0;
1050     nonzerorow += (n>0);
1051     if (*ib == i) {     /* (diag of A)*x */
1052       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1053       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1054       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1055       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1056       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1057       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1058       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1059       v        += 49; jmin++;
1060     }
1061     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1062     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1063     for (j=jmin; j<n; j++) {
1064       /* (strict lower triangular part of A)*x  */
1065       cval       = ib[j]*7;
1066       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1067       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1068       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1069       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1070       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1071       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1072       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1073       /* (strict upper triangular part of A)*x  */
1074       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1075       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1076       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1077       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1078       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1079       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1080       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1081       v       += 49;
1082     }
1083     xb +=7; ai++;
1084   }
1085 
1086   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1087   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1088 
1089   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1090   PetscFunctionReturn(0);
1091 }
1092 
1093 #undef __FUNCT__
1094 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1095 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1096 {
1097   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1098   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1099   const PetscScalar *x,*x_ptr,*xb;
1100   const MatScalar   *v;
1101   PetscErrorCode    ierr;
1102   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1103   const PetscInt    *idx,*aj,*ii;
1104   PetscInt          nonzerorow=0;
1105 
1106   PetscFunctionBegin;
1107   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1108   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1109   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1110 
1111   aj = a->j;
1112   v  = a->a;
1113   ii = a->i;
1114 
1115   if (!a->mult_work) {
1116     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1117   }
1118   work = a->mult_work;
1119 
1120 
1121   for (i=0; i<mbs; i++) {
1122     n           = ii[1] - ii[0]; ncols = n*bs;
1123     workt       = work; idx=aj+ii[0];
1124     nonzerorow += (n>0);
1125 
1126     /* upper triangular part */
1127     for (j=0; j<n; j++) {
1128       xb = x_ptr + bs*(*idx++);
1129       for (k=0; k<bs; k++) workt[k] = xb[k];
1130       workt += bs;
1131     }
1132     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1133     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1134 
1135     /* strict lower triangular part */
1136     idx = aj+ii[0];
1137     if (*idx == i) {
1138       ncols -= bs; v += bs2; idx++; n--;
1139     }
1140     if (ncols > 0) {
1141       workt = work;
1142       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1143       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1144       for (j=0; j<n; j++) {
1145         zb = z_ptr + bs*(*idx++);
1146         for (k=0; k<bs; k++) zb[k] += workt[k];
1147         workt += bs;
1148       }
1149     }
1150 
1151     x += bs; v += n*bs2; z += bs; ii++;
1152   }
1153 
1154   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1155   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1156 
1157   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1158   PetscFunctionReturn(0);
1159 }
1160 
1161 #undef __FUNCT__
1162 #define __FUNCT__ "MatScale_SeqSBAIJ"
1163 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1164 {
1165   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1166   PetscScalar    oalpha = alpha;
1167   PetscErrorCode ierr;
1168   PetscBLASInt   one = 1,totalnz;
1169 
1170   PetscFunctionBegin;
1171   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1172   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1173   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 #undef __FUNCT__
1178 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1179 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1180 {
1181   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1182   const MatScalar *v       = a->a;
1183   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1184   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1185   PetscErrorCode  ierr;
1186   const PetscInt  *aj=a->j,*col;
1187 
1188   PetscFunctionBegin;
1189   if (type == NORM_FROBENIUS) {
1190     for (k=0; k<mbs; k++) {
1191       jmin = a->i[k]; jmax = a->i[k+1];
1192       col  = aj + jmin;
1193       if (*col == k) {         /* diagonal block */
1194         for (i=0; i<bs2; i++) {
1195           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1196         }
1197         jmin++;
1198       }
1199       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1200         for (i=0; i<bs2; i++) {
1201           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1202         }
1203       }
1204     }
1205     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1206   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1207     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1208     for (i=0; i<mbs; i++) jl[i] = mbs;
1209     il[0] = 0;
1210 
1211     *norm = 0.0;
1212     for (k=0; k<mbs; k++) { /* k_th block row */
1213       for (j=0; j<bs; j++) sum[j]=0.0;
1214       /*-- col sum --*/
1215       i = jl[k]; /* first |A(i,k)| to be added */
1216       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1217                   at step k */
1218       while (i<mbs) {
1219         nexti = jl[i];  /* next block row to be added */
1220         ik    = il[i];  /* block index of A(i,k) in the array a */
1221         for (j=0; j<bs; j++) {
1222           v = a->a + ik*bs2 + j*bs;
1223           for (k1=0; k1<bs; k1++) {
1224             sum[j] += PetscAbsScalar(*v); v++;
1225           }
1226         }
1227         /* update il, jl */
1228         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1229         jmax = a->i[i+1];
1230         if (jmin < jmax) {
1231           il[i] = jmin;
1232           j     = a->j[jmin];
1233           jl[i] = jl[j]; jl[j]=i;
1234         }
1235         i = nexti;
1236       }
1237       /*-- row sum --*/
1238       jmin = a->i[k]; jmax = a->i[k+1];
1239       for (i=jmin; i<jmax; i++) {
1240         for (j=0; j<bs; j++) {
1241           v = a->a + i*bs2 + j;
1242           for (k1=0; k1<bs; k1++) {
1243             sum[j] += PetscAbsScalar(*v); v += bs;
1244           }
1245         }
1246       }
1247       /* add k_th block row to il, jl */
1248       col = aj+jmin;
1249       if (*col == k) jmin++;
1250       if (jmin < jmax) {
1251         il[k] = jmin;
1252         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1253       }
1254       for (j=0; j<bs; j++) {
1255         if (sum[j] > *norm) *norm = sum[j];
1256       }
1257     }
1258     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1259   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 #undef __FUNCT__
1264 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1265 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1266 {
1267   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1268   PetscErrorCode ierr;
1269 
1270   PetscFunctionBegin;
1271   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1272   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1273     *flg = PETSC_FALSE;
1274     PetscFunctionReturn(0);
1275   }
1276 
1277   /* if the a->i are the same */
1278   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1279   if (!*flg) PetscFunctionReturn(0);
1280 
1281   /* if a->j are the same */
1282   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1283   if (!*flg) PetscFunctionReturn(0);
1284 
1285   /* if a->a are the same */
1286   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1287   PetscFunctionReturn(0);
1288 }
1289 
1290 #undef __FUNCT__
1291 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1292 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1293 {
1294   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1295   PetscErrorCode  ierr;
1296   PetscInt        i,j,k,row,bs,ambs,bs2;
1297   const PetscInt  *ai,*aj;
1298   PetscScalar     *x,zero = 0.0;
1299   const MatScalar *aa,*aa_j;
1300 
1301   PetscFunctionBegin;
1302   bs = A->rmap->bs;
1303   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1304 
1305   aa   = a->a;
1306   ambs = a->mbs;
1307 
1308   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1309     PetscInt *diag=a->diag;
1310     aa   = a->a;
1311     ambs = a->mbs;
1312     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1313     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1314     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1315     PetscFunctionReturn(0);
1316   }
1317 
1318   ai   = a->i;
1319   aj   = a->j;
1320   bs2  = a->bs2;
1321   ierr = VecSet(v,zero);CHKERRQ(ierr);
1322   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1323   for (i=0; i<ambs; i++) {
1324     j=ai[i];
1325     if (aj[j] == i) {    /* if this is a diagonal element */
1326       row  = i*bs;
1327       aa_j = aa + j*bs2;
1328       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1329     }
1330   }
1331   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1332   PetscFunctionReturn(0);
1333 }
1334 
1335 #undef __FUNCT__
1336 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1337 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1338 {
1339   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1340   PetscScalar       x;
1341   const PetscScalar *l,*li,*ri;
1342   MatScalar         *aa,*v;
1343   PetscErrorCode    ierr;
1344   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1345   PetscBool         flg;
1346 
1347   PetscFunctionBegin;
1348   if (ll != rr) {
1349     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1350     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1351   }
1352   if (!ll) PetscFunctionReturn(0);
1353   ai  = a->i;
1354   aj  = a->j;
1355   aa  = a->a;
1356   m   = A->rmap->N;
1357   bs  = A->rmap->bs;
1358   mbs = a->mbs;
1359   bs2 = a->bs2;
1360 
1361   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1362   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1363   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1364   for (i=0; i<mbs; i++) { /* for each block row */
1365     M  = ai[i+1] - ai[i];
1366     li = l + i*bs;
1367     v  = aa + bs2*ai[i];
1368     for (j=0; j<M; j++) { /* for each block */
1369       ri = l + bs*aj[ai[i]+j];
1370       for (k=0; k<bs; k++) {
1371         x = ri[k];
1372         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1373       }
1374     }
1375   }
1376   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1377   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 #undef __FUNCT__
1382 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1383 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1384 {
1385   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1386 
1387   PetscFunctionBegin;
1388   info->block_size   = a->bs2;
1389   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1390   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1391   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1392   info->assemblies   = A->num_ass;
1393   info->mallocs      = A->info.mallocs;
1394   info->memory       = ((PetscObject)A)->mem;
1395   if (A->factortype) {
1396     info->fill_ratio_given  = A->info.fill_ratio_given;
1397     info->fill_ratio_needed = A->info.fill_ratio_needed;
1398     info->factor_mallocs    = A->info.factor_mallocs;
1399   } else {
1400     info->fill_ratio_given  = 0;
1401     info->fill_ratio_needed = 0;
1402     info->factor_mallocs    = 0;
1403   }
1404   PetscFunctionReturn(0);
1405 }
1406 
1407 
1408 #undef __FUNCT__
1409 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1410 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1411 {
1412   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1413   PetscErrorCode ierr;
1414 
1415   PetscFunctionBegin;
1416   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1417   PetscFunctionReturn(0);
1418 }
1419 
1420 #undef __FUNCT__
1421 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1422 /*
1423    This code does not work since it only checks the upper triangular part of
1424   the matrix. Hence it is not listed in the function table.
1425 */
1426 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1427 {
1428   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1429   PetscErrorCode  ierr;
1430   PetscInt        i,j,n,row,col,bs,mbs;
1431   const PetscInt  *ai,*aj;
1432   PetscReal       atmp;
1433   const MatScalar *aa;
1434   PetscScalar     *x;
1435   PetscInt        ncols,brow,bcol,krow,kcol;
1436 
1437   PetscFunctionBegin;
1438   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1439   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1440   bs  = A->rmap->bs;
1441   aa  = a->a;
1442   ai  = a->i;
1443   aj  = a->j;
1444   mbs = a->mbs;
1445 
1446   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1447   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1448   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1449   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1450   for (i=0; i<mbs; i++) {
1451     ncols = ai[1] - ai[0]; ai++;
1452     brow  = bs*i;
1453     for (j=0; j<ncols; j++) {
1454       bcol = bs*(*aj);
1455       for (kcol=0; kcol<bs; kcol++) {
1456         col = bcol + kcol;      /* col index */
1457         for (krow=0; krow<bs; krow++) {
1458           atmp = PetscAbsScalar(*aa); aa++;
1459           row  = brow + krow;   /* row index */
1460           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1461           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1462         }
1463       }
1464       aj++;
1465     }
1466   }
1467   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1468   PetscFunctionReturn(0);
1469 }
1470