xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 92cf717c53549e6d856cdf7d4e6c2e4bd59ba535)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
96         Zero some ops' to avoid invalid usse */
97 #undef __FUNCT__
98 #define __FUNCT__ "MatSeqSBAIJZeroOps_Private"
99 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
100 {
101   PetscErrorCode ierr;
102 
103   PetscFunctionBegin;
104   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
105   Bseq->ops->mult                   = 0;
106   Bseq->ops->multadd                = 0;
107   Bseq->ops->multtranspose          = 0;
108   Bseq->ops->multtransposeadd       = 0;
109   Bseq->ops->lufactor               = 0;
110   Bseq->ops->choleskyfactor         = 0;
111   Bseq->ops->lufactorsymbolic       = 0;
112   Bseq->ops->choleskyfactorsymbolic = 0;
113   Bseq->ops->getinertia             = 0;
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
119 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
120 {
121   PetscErrorCode ierr;
122 
123   PetscFunctionBegin;
124   ierr = MatGetSubMatrix_SeqBAIJ(A,isrow,iscol,scall,B);CHKERRQ(ierr);
125 
126   if (isrow != iscol) {
127     PetscBool isequal;
128     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
129     if (!isequal) {
130       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
131     }
132   }
133   PetscFunctionReturn(0);
134 }
135 
136 #undef __FUNCT__
137 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
138 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
139 {
140   PetscErrorCode ierr;
141   PetscInt       i;
142   PetscBool      flg;
143 
144   PetscFunctionBegin;
145   ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr);
146   for (i=0; i<n; i++) {
147     ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr);
148     if (!flg) {
149       ierr = MatSeqSBAIJZeroOps_Private(*B[i]);CHKERRQ(ierr);
150     }
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 /* -------------------------------------------------------*/
156 /* Should check that shapes of vectors and matrices match */
157 /* -------------------------------------------------------*/
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
161 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
162 {
163   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
164   PetscScalar       *z,x1,x2,zero=0.0;
165   const PetscScalar *x,*xb;
166   const MatScalar   *v;
167   PetscErrorCode    ierr;
168   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
169   const PetscInt    *aj=a->j,*ai=a->i,*ib;
170   PetscInt          nonzerorow=0;
171 
172   PetscFunctionBegin;
173   ierr = VecSet(zz,zero);CHKERRQ(ierr);
174   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
175   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
176 
177   v  = a->a;
178   xb = x;
179 
180   for (i=0; i<mbs; i++) {
181     n           = ai[1] - ai[0]; /* length of i_th block row of A */
182     x1          = xb[0]; x2 = xb[1];
183     ib          = aj + *ai;
184     jmin        = 0;
185     nonzerorow += (n>0);
186     if (*ib == i) {     /* (diag of A)*x */
187       z[2*i]   += v[0]*x1 + v[2]*x2;
188       z[2*i+1] += v[2]*x1 + v[3]*x2;
189       v        += 4; jmin++;
190     }
191     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
192     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
193     for (j=jmin; j<n; j++) {
194       /* (strict lower triangular part of A)*x  */
195       cval       = ib[j]*2;
196       z[cval]   += v[0]*x1 + v[1]*x2;
197       z[cval+1] += v[2]*x1 + v[3]*x2;
198       /* (strict upper triangular part of A)*x  */
199       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
200       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
201       v        += 4;
202     }
203     xb +=2; ai++;
204   }
205 
206   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
207   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
208   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
214 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
215 {
216   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
217   PetscScalar       *z,x1,x2,x3,zero=0.0;
218   const PetscScalar *x,*xb;
219   const MatScalar   *v;
220   PetscErrorCode    ierr;
221   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
222   const PetscInt    *aj = a->j,*ai = a->i,*ib;
223   PetscInt          nonzerorow=0;
224 
225   PetscFunctionBegin;
226   ierr = VecSet(zz,zero);CHKERRQ(ierr);
227   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
228   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
229 
230   v  = a->a;
231   xb = x;
232 
233   for (i=0; i<mbs; i++) {
234     n           = ai[1] - ai[0]; /* length of i_th block row of A */
235     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
236     ib          = aj + *ai;
237     jmin        = 0;
238     nonzerorow += (n>0);
239     if (*ib == i) {     /* (diag of A)*x */
240       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
241       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
242       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
243       v        += 9; jmin++;
244     }
245     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
246     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
247     for (j=jmin; j<n; j++) {
248       /* (strict lower triangular part of A)*x  */
249       cval       = ib[j]*3;
250       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
251       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
252       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
253       /* (strict upper triangular part of A)*x  */
254       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
255       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
256       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
257       v        += 9;
258     }
259     xb +=3; ai++;
260   }
261 
262   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
263   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
264   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
265   PetscFunctionReturn(0);
266 }
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
270 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
271 {
272   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
273   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
274   const PetscScalar *x,*xb;
275   const MatScalar   *v;
276   PetscErrorCode    ierr;
277   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
278   const PetscInt    *aj = a->j,*ai = a->i,*ib;
279   PetscInt          nonzerorow = 0;
280 
281   PetscFunctionBegin;
282   ierr = VecSet(zz,zero);CHKERRQ(ierr);
283   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
284   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
285 
286   v  = a->a;
287   xb = x;
288 
289   for (i=0; i<mbs; i++) {
290     n           = ai[1] - ai[0]; /* length of i_th block row of A */
291     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
292     ib          = aj + *ai;
293     jmin        = 0;
294     nonzerorow += (n>0);
295     if (*ib == i) {     /* (diag of A)*x */
296       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
297       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
298       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
299       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
300       v        += 16; jmin++;
301     }
302     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
303     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
304     for (j=jmin; j<n; j++) {
305       /* (strict lower triangular part of A)*x  */
306       cval       = ib[j]*4;
307       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
308       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
309       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
310       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
311       /* (strict upper triangular part of A)*x  */
312       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
313       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
314       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
315       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
316       v        += 16;
317     }
318     xb +=4; ai++;
319   }
320 
321   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
322   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
323   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
329 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
330 {
331   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
332   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
333   const PetscScalar *x,*xb;
334   const MatScalar   *v;
335   PetscErrorCode    ierr;
336   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
337   const PetscInt    *aj = a->j,*ai = a->i,*ib;
338   PetscInt          nonzerorow=0;
339 
340   PetscFunctionBegin;
341   ierr = VecSet(zz,zero);CHKERRQ(ierr);
342   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
344 
345   v  = a->a;
346   xb = x;
347 
348   for (i=0; i<mbs; i++) {
349     n           = ai[1] - ai[0]; /* length of i_th block row of A */
350     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
351     ib          = aj + *ai;
352     jmin        = 0;
353     nonzerorow += (n>0);
354     if (*ib == i) {      /* (diag of A)*x */
355       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
356       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
357       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
358       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
359       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
360       v        += 25; jmin++;
361     }
362     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
363     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364     for (j=jmin; j<n; j++) {
365       /* (strict lower triangular part of A)*x  */
366       cval       = ib[j]*5;
367       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
368       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
369       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
370       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
371       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
372       /* (strict upper triangular part of A)*x  */
373       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
374       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
375       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
376       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
377       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
378       v        += 25;
379     }
380     xb +=5; ai++;
381   }
382 
383   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
384   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
385   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
386   PetscFunctionReturn(0);
387 }
388 
389 
390 #undef __FUNCT__
391 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
392 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
393 {
394   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
395   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
396   const PetscScalar *x,*xb;
397   const MatScalar   *v;
398   PetscErrorCode    ierr;
399   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
400   const PetscInt    *aj=a->j,*ai=a->i,*ib;
401   PetscInt          nonzerorow=0;
402 
403   PetscFunctionBegin;
404   ierr = VecSet(zz,zero);CHKERRQ(ierr);
405   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
406   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
407 
408   v  = a->a;
409   xb = x;
410 
411   for (i=0; i<mbs; i++) {
412     n           = ai[1] - ai[0]; /* length of i_th block row of A */
413     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
414     ib          = aj + *ai;
415     jmin        = 0;
416     nonzerorow += (n>0);
417     if (*ib == i) {      /* (diag of A)*x */
418       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
419       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
420       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
421       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
422       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
423       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
424       v        += 36; jmin++;
425     }
426     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
427     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
428     for (j=jmin; j<n; j++) {
429       /* (strict lower triangular part of A)*x  */
430       cval       = ib[j]*6;
431       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
432       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
433       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
434       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
435       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
436       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
437       /* (strict upper triangular part of A)*x  */
438       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
439       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
440       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
441       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
442       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
443       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
444       v        += 36;
445     }
446     xb +=6; ai++;
447   }
448 
449   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
450   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
451   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
452   PetscFunctionReturn(0);
453 }
454 #undef __FUNCT__
455 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
456 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
457 {
458   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
459   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
460   const PetscScalar *x,*xb;
461   const MatScalar   *v;
462   PetscErrorCode    ierr;
463   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
464   const PetscInt    *aj=a->j,*ai=a->i,*ib;
465   PetscInt          nonzerorow=0;
466 
467   PetscFunctionBegin;
468   ierr = VecSet(zz,zero);CHKERRQ(ierr);
469   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
470   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
471 
472   v  = a->a;
473   xb = x;
474 
475   for (i=0; i<mbs; i++) {
476     n           = ai[1] - ai[0]; /* length of i_th block row of A */
477     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
478     ib          = aj + *ai;
479     jmin        = 0;
480     nonzerorow += (n>0);
481     if (*ib == i) {      /* (diag of A)*x */
482       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
483       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
484       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
485       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
486       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
487       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
488       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
489       v        += 49; jmin++;
490     }
491     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493     for (j=jmin; j<n; j++) {
494       /* (strict lower triangular part of A)*x  */
495       cval       = ib[j]*7;
496       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503       /* (strict upper triangular part of A)*x  */
504       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511       v       += 49;
512     }
513     xb +=7; ai++;
514   }
515   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
516   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
517   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
518   PetscFunctionReturn(0);
519 }
520 
521 /*
522     This will not work with MatScalar == float because it calls the BLAS
523 */
524 #undef __FUNCT__
525 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
526 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527 {
528   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
529   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
530   const PetscScalar *x,*x_ptr,*xb;
531   const MatScalar   *v;
532   PetscErrorCode    ierr;
533   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
534   const PetscInt    *idx,*aj,*ii;
535   PetscInt          nonzerorow=0;
536 
537   PetscFunctionBegin;
538   ierr = VecSet(zz,zero);CHKERRQ(ierr);
539   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
540   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
541 
542   aj = a->j;
543   v  = a->a;
544   ii = a->i;
545 
546   if (!a->mult_work) {
547     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
548   }
549   work = a->mult_work;
550 
551   for (i=0; i<mbs; i++) {
552     n           = ii[1] - ii[0]; ncols = n*bs;
553     workt       = work; idx=aj+ii[0];
554     nonzerorow += (n>0);
555 
556     /* upper triangular part */
557     for (j=0; j<n; j++) {
558       xb = x_ptr + bs*(*idx++);
559       for (k=0; k<bs; k++) workt[k] = xb[k];
560       workt += bs;
561     }
562     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
563     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
564 
565     /* strict lower triangular part */
566     idx = aj+ii[0];
567     if (*idx == i) {
568       ncols -= bs; v += bs2; idx++; n--;
569     }
570 
571     if (ncols > 0) {
572       workt = work;
573       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
574       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
575       for (j=0; j<n; j++) {
576         zb = z_ptr + bs*(*idx++);
577         for (k=0; k<bs; k++) zb[k] += workt[k];
578         workt += bs;
579       }
580     }
581     x += bs; v += n*bs2; z += bs; ii++;
582   }
583 
584   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
585   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
586   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
587   PetscFunctionReturn(0);
588 }
589 
590 #undef __FUNCT__
591 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
592 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
593 {
594   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
595   PetscScalar       *z,x1;
596   const PetscScalar *x,*xb;
597   const MatScalar   *v;
598   PetscErrorCode    ierr;
599   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
600   const PetscInt    *aj=a->j,*ai=a->i,*ib;
601   PetscInt          nonzerorow=0;
602 
603   PetscFunctionBegin;
604   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
605   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
606   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
607   v    = a->a;
608   xb   = x;
609 
610   for (i=0; i<mbs; i++) {
611     n           = ai[1] - ai[0]; /* length of i_th row of A */
612     x1          = xb[0];
613     ib          = aj + *ai;
614     jmin        = 0;
615     nonzerorow += (n>0);
616     if (*ib == i) {            /* (diag of A)*x */
617       z[i] += *v++ * x[*ib++]; jmin++;
618     }
619     for (j=jmin; j<n; j++) {
620       cval    = *ib;
621       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
622       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
623     }
624     xb++; ai++;
625   }
626 
627   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
628   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
629 
630   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
631   PetscFunctionReturn(0);
632 }
633 
634 #undef __FUNCT__
635 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
636 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
637 {
638   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
639   PetscScalar       *z,x1,x2;
640   const PetscScalar *x,*xb;
641   const MatScalar   *v;
642   PetscErrorCode    ierr;
643   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
644   const PetscInt    *aj=a->j,*ai=a->i,*ib;
645   PetscInt          nonzerorow=0;
646 
647   PetscFunctionBegin;
648   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
649   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
650   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
651 
652   v  = a->a;
653   xb = x;
654 
655   for (i=0; i<mbs; i++) {
656     n           = ai[1] - ai[0]; /* length of i_th block row of A */
657     x1          = xb[0]; x2 = xb[1];
658     ib          = aj + *ai;
659     jmin        = 0;
660     nonzerorow += (n>0);
661     if (*ib == i) {      /* (diag of A)*x */
662       z[2*i]   += v[0]*x1 + v[2]*x2;
663       z[2*i+1] += v[2]*x1 + v[3]*x2;
664       v        += 4; jmin++;
665     }
666     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
667     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
668     for (j=jmin; j<n; j++) {
669       /* (strict lower triangular part of A)*x  */
670       cval       = ib[j]*2;
671       z[cval]   += v[0]*x1 + v[1]*x2;
672       z[cval+1] += v[2]*x1 + v[3]*x2;
673       /* (strict upper triangular part of A)*x  */
674       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
675       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
676       v        += 4;
677     }
678     xb +=2; ai++;
679   }
680   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
681   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
682 
683   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
684   PetscFunctionReturn(0);
685 }
686 
687 #undef __FUNCT__
688 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
689 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
690 {
691   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
692   PetscScalar       *z,x1,x2,x3;
693   const PetscScalar *x,*xb;
694   const MatScalar   *v;
695   PetscErrorCode    ierr;
696   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
697   const PetscInt    *aj=a->j,*ai=a->i,*ib;
698   PetscInt          nonzerorow=0;
699 
700   PetscFunctionBegin;
701   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
702   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
703   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
704 
705   v  = a->a;
706   xb = x;
707 
708   for (i=0; i<mbs; i++) {
709     n           = ai[1] - ai[0]; /* length of i_th block row of A */
710     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
711     ib          = aj + *ai;
712     jmin        = 0;
713     nonzerorow += (n>0);
714     if (*ib == i) {     /* (diag of A)*x */
715       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
716       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
717       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
718       v        += 9; jmin++;
719     }
720     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
721     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
722     for (j=jmin; j<n; j++) {
723       /* (strict lower triangular part of A)*x  */
724       cval       = ib[j]*3;
725       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
726       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
727       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
728       /* (strict upper triangular part of A)*x  */
729       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
730       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
731       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
732       v        += 9;
733     }
734     xb +=3; ai++;
735   }
736 
737   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
738   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
739 
740   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
741   PetscFunctionReturn(0);
742 }
743 
744 #undef __FUNCT__
745 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
746 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
747 {
748   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
749   PetscScalar       *z,x1,x2,x3,x4;
750   const PetscScalar *x,*xb;
751   const MatScalar   *v;
752   PetscErrorCode    ierr;
753   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
754   const PetscInt    *aj=a->j,*ai=a->i,*ib;
755   PetscInt          nonzerorow=0;
756 
757   PetscFunctionBegin;
758   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
759   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
760   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
761 
762   v  = a->a;
763   xb = x;
764 
765   for (i=0; i<mbs; i++) {
766     n           = ai[1] - ai[0]; /* length of i_th block row of A */
767     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
768     ib          = aj + *ai;
769     jmin        = 0;
770     nonzerorow += (n>0);
771     if (*ib == i) {      /* (diag of A)*x */
772       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
773       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
774       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
775       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
776       v        += 16; jmin++;
777     }
778     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
779     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
780     for (j=jmin; j<n; j++) {
781       /* (strict lower triangular part of A)*x  */
782       cval       = ib[j]*4;
783       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
784       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
785       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
786       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
787       /* (strict upper triangular part of A)*x  */
788       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
789       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
790       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
791       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
792       v        += 16;
793     }
794     xb +=4; ai++;
795   }
796 
797   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
798   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
799 
800   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
801   PetscFunctionReturn(0);
802 }
803 
804 #undef __FUNCT__
805 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
806 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
807 {
808   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
809   PetscScalar       *z,x1,x2,x3,x4,x5;
810   const PetscScalar *x,*xb;
811   const MatScalar   *v;
812   PetscErrorCode    ierr;
813   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
814   const PetscInt    *aj=a->j,*ai=a->i,*ib;
815   PetscInt          nonzerorow=0;
816 
817   PetscFunctionBegin;
818   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
819   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
820   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
821 
822   v  = a->a;
823   xb = x;
824 
825   for (i=0; i<mbs; i++) {
826     n           = ai[1] - ai[0]; /* length of i_th block row of A */
827     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
828     ib          = aj + *ai;
829     jmin        = 0;
830     nonzerorow += (n>0);
831     if (*ib == i) {      /* (diag of A)*x */
832       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
833       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
834       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
835       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
836       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
837       v        += 25; jmin++;
838     }
839     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
841     for (j=jmin; j<n; j++) {
842       /* (strict lower triangular part of A)*x  */
843       cval       = ib[j]*5;
844       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
845       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
846       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
847       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
848       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
849       /* (strict upper triangular part of A)*x  */
850       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
851       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
852       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
853       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
854       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
855       v        += 25;
856     }
857     xb +=5; ai++;
858   }
859 
860   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
861   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
862 
863   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 #undef __FUNCT__
867 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
868 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
869 {
870   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
871   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
872   const PetscScalar *x,*xb;
873   const MatScalar   *v;
874   PetscErrorCode    ierr;
875   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
876   const PetscInt    *aj=a->j,*ai=a->i,*ib;
877   PetscInt          nonzerorow=0;
878 
879   PetscFunctionBegin;
880   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
881   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
882   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
883 
884   v  = a->a;
885   xb = x;
886 
887   for (i=0; i<mbs; i++) {
888     n           = ai[1] - ai[0]; /* length of i_th block row of A */
889     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
890     ib          = aj + *ai;
891     jmin        = 0;
892     nonzerorow += (n>0);
893     if (*ib == i) {     /* (diag of A)*x */
894       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
895       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
896       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
897       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
898       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
899       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
900       v        += 36; jmin++;
901     }
902     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
903     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
904     for (j=jmin; j<n; j++) {
905       /* (strict lower triangular part of A)*x  */
906       cval       = ib[j]*6;
907       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
908       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
909       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
910       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
911       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
912       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
913       /* (strict upper triangular part of A)*x  */
914       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
915       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
916       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
917       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
918       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
919       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
920       v        += 36;
921     }
922     xb +=6; ai++;
923   }
924 
925   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
926   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
927 
928   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
934 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
935 {
936   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
937   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
938   const PetscScalar *x,*xb;
939   const MatScalar   *v;
940   PetscErrorCode    ierr;
941   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
942   const PetscInt    *aj=a->j,*ai=a->i,*ib;
943   PetscInt          nonzerorow=0;
944 
945   PetscFunctionBegin;
946   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
947   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
948   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
949 
950   v  = a->a;
951   xb = x;
952 
953   for (i=0; i<mbs; i++) {
954     n           = ai[1] - ai[0]; /* length of i_th block row of A */
955     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
956     ib          = aj + *ai;
957     jmin        = 0;
958     nonzerorow += (n>0);
959     if (*ib == i) {     /* (diag of A)*x */
960       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
961       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
962       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
963       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
964       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
965       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
966       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
967       v        += 49; jmin++;
968     }
969     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
970     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971     for (j=jmin; j<n; j++) {
972       /* (strict lower triangular part of A)*x  */
973       cval       = ib[j]*7;
974       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
975       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
976       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
977       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
978       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
979       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
980       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
981       /* (strict upper triangular part of A)*x  */
982       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
983       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
984       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
985       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
986       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
987       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
988       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
989       v       += 49;
990     }
991     xb +=7; ai++;
992   }
993 
994   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
995   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
996 
997   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
998   PetscFunctionReturn(0);
999 }
1000 
1001 #undef __FUNCT__
1002 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1003 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1004 {
1005   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1006   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1007   const PetscScalar *x,*x_ptr,*xb;
1008   const MatScalar   *v;
1009   PetscErrorCode    ierr;
1010   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1011   const PetscInt    *idx,*aj,*ii;
1012   PetscInt          nonzerorow=0;
1013 
1014   PetscFunctionBegin;
1015   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1016   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1017   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1018 
1019   aj = a->j;
1020   v  = a->a;
1021   ii = a->i;
1022 
1023   if (!a->mult_work) {
1024     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1025   }
1026   work = a->mult_work;
1027 
1028 
1029   for (i=0; i<mbs; i++) {
1030     n           = ii[1] - ii[0]; ncols = n*bs;
1031     workt       = work; idx=aj+ii[0];
1032     nonzerorow += (n>0);
1033 
1034     /* upper triangular part */
1035     for (j=0; j<n; j++) {
1036       xb = x_ptr + bs*(*idx++);
1037       for (k=0; k<bs; k++) workt[k] = xb[k];
1038       workt += bs;
1039     }
1040     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1041     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1042 
1043     /* strict lower triangular part */
1044     idx = aj+ii[0];
1045     if (*idx == i) {
1046       ncols -= bs; v += bs2; idx++; n--;
1047     }
1048     if (ncols > 0) {
1049       workt = work;
1050       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1051       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1052       for (j=0; j<n; j++) {
1053         zb = z_ptr + bs*(*idx++);
1054         for (k=0; k<bs; k++) zb[k] += workt[k];
1055         workt += bs;
1056       }
1057     }
1058 
1059     x += bs; v += n*bs2; z += bs; ii++;
1060   }
1061 
1062   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1063   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1064 
1065   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 #undef __FUNCT__
1070 #define __FUNCT__ "MatScale_SeqSBAIJ"
1071 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1072 {
1073   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1074   PetscScalar    oalpha = alpha;
1075   PetscErrorCode ierr;
1076   PetscBLASInt   one = 1,totalnz;
1077 
1078   PetscFunctionBegin;
1079   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1080   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1081   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 #undef __FUNCT__
1086 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1087 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1088 {
1089   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1090   const MatScalar *v       = a->a;
1091   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1092   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1093   PetscErrorCode  ierr;
1094   const PetscInt  *aj=a->j,*col;
1095 
1096   PetscFunctionBegin;
1097   if (type == NORM_FROBENIUS) {
1098     for (k=0; k<mbs; k++) {
1099       jmin = a->i[k]; jmax = a->i[k+1];
1100       col  = aj + jmin;
1101       if (*col == k) {         /* diagonal block */
1102         for (i=0; i<bs2; i++) {
1103           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1104         }
1105         jmin++;
1106       }
1107       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1108         for (i=0; i<bs2; i++) {
1109           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1110         }
1111       }
1112     }
1113     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1114     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1115   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1116     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1117     for (i=0; i<mbs; i++) jl[i] = mbs;
1118     il[0] = 0;
1119 
1120     *norm = 0.0;
1121     for (k=0; k<mbs; k++) { /* k_th block row */
1122       for (j=0; j<bs; j++) sum[j]=0.0;
1123       /*-- col sum --*/
1124       i = jl[k]; /* first |A(i,k)| to be added */
1125       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1126                   at step k */
1127       while (i<mbs) {
1128         nexti = jl[i];  /* next block row to be added */
1129         ik    = il[i];  /* block index of A(i,k) in the array a */
1130         for (j=0; j<bs; j++) {
1131           v = a->a + ik*bs2 + j*bs;
1132           for (k1=0; k1<bs; k1++) {
1133             sum[j] += PetscAbsScalar(*v); v++;
1134           }
1135         }
1136         /* update il, jl */
1137         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1138         jmax = a->i[i+1];
1139         if (jmin < jmax) {
1140           il[i] = jmin;
1141           j     = a->j[jmin];
1142           jl[i] = jl[j]; jl[j]=i;
1143         }
1144         i = nexti;
1145       }
1146       /*-- row sum --*/
1147       jmin = a->i[k]; jmax = a->i[k+1];
1148       for (i=jmin; i<jmax; i++) {
1149         for (j=0; j<bs; j++) {
1150           v = a->a + i*bs2 + j;
1151           for (k1=0; k1<bs; k1++) {
1152             sum[j] += PetscAbsScalar(*v); v += bs;
1153           }
1154         }
1155       }
1156       /* add k_th block row to il, jl */
1157       col = aj+jmin;
1158       if (*col == k) jmin++;
1159       if (jmin < jmax) {
1160         il[k] = jmin;
1161         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1162       }
1163       for (j=0; j<bs; j++) {
1164         if (sum[j] > *norm) *norm = sum[j];
1165       }
1166     }
1167     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1168     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1169   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 #undef __FUNCT__
1174 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1175 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1176 {
1177   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1178   PetscErrorCode ierr;
1179 
1180   PetscFunctionBegin;
1181   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1182   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1183     *flg = PETSC_FALSE;
1184     PetscFunctionReturn(0);
1185   }
1186 
1187   /* if the a->i are the same */
1188   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1189   if (!*flg) PetscFunctionReturn(0);
1190 
1191   /* if a->j are the same */
1192   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1193   if (!*flg) PetscFunctionReturn(0);
1194 
1195   /* if a->a are the same */
1196   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 #undef __FUNCT__
1201 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1202 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1203 {
1204   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1205   PetscErrorCode  ierr;
1206   PetscInt        i,j,k,row,bs,ambs,bs2;
1207   const PetscInt  *ai,*aj;
1208   PetscScalar     *x,zero = 0.0;
1209   const MatScalar *aa,*aa_j;
1210 
1211   PetscFunctionBegin;
1212   bs = A->rmap->bs;
1213   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1214 
1215   aa   = a->a;
1216   ambs = a->mbs;
1217 
1218   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1219     PetscInt *diag=a->diag;
1220     aa   = a->a;
1221     ambs = a->mbs;
1222     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1223     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1224     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1225     PetscFunctionReturn(0);
1226   }
1227 
1228   ai   = a->i;
1229   aj   = a->j;
1230   bs2  = a->bs2;
1231   ierr = VecSet(v,zero);CHKERRQ(ierr);
1232   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1233   for (i=0; i<ambs; i++) {
1234     j=ai[i];
1235     if (aj[j] == i) {    /* if this is a diagonal element */
1236       row  = i*bs;
1237       aa_j = aa + j*bs2;
1238       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1239     }
1240   }
1241   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNCT__
1246 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1247 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1248 {
1249   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1250   PetscScalar       x;
1251   const PetscScalar *l,*li,*ri;
1252   MatScalar         *aa,*v;
1253   PetscErrorCode    ierr;
1254   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1255   PetscBool         flg;
1256 
1257   PetscFunctionBegin;
1258   if (ll != rr) {
1259     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1260     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1261   }
1262   if (!ll) PetscFunctionReturn(0);
1263   ai  = a->i;
1264   aj  = a->j;
1265   aa  = a->a;
1266   m   = A->rmap->N;
1267   bs  = A->rmap->bs;
1268   mbs = a->mbs;
1269   bs2 = a->bs2;
1270 
1271   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1272   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1273   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1274   for (i=0; i<mbs; i++) { /* for each block row */
1275     M  = ai[i+1] - ai[i];
1276     li = l + i*bs;
1277     v  = aa + bs2*ai[i];
1278     for (j=0; j<M; j++) { /* for each block */
1279       ri = l + bs*aj[ai[i]+j];
1280       for (k=0; k<bs; k++) {
1281         x = ri[k];
1282         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1283       }
1284     }
1285   }
1286   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1287   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 #undef __FUNCT__
1292 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1293 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1294 {
1295   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1296 
1297   PetscFunctionBegin;
1298   info->block_size   = a->bs2;
1299   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1300   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1301   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1302   info->assemblies   = A->num_ass;
1303   info->mallocs      = A->info.mallocs;
1304   info->memory       = ((PetscObject)A)->mem;
1305   if (A->factortype) {
1306     info->fill_ratio_given  = A->info.fill_ratio_given;
1307     info->fill_ratio_needed = A->info.fill_ratio_needed;
1308     info->factor_mallocs    = A->info.factor_mallocs;
1309   } else {
1310     info->fill_ratio_given  = 0;
1311     info->fill_ratio_needed = 0;
1312     info->factor_mallocs    = 0;
1313   }
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1320 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1321 {
1322   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1323   PetscErrorCode ierr;
1324 
1325   PetscFunctionBegin;
1326   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1327   PetscFunctionReturn(0);
1328 }
1329 
1330 #undef __FUNCT__
1331 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1332 /*
1333    This code does not work since it only checks the upper triangular part of
1334   the matrix. Hence it is not listed in the function table.
1335 */
1336 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1337 {
1338   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1339   PetscErrorCode  ierr;
1340   PetscInt        i,j,n,row,col,bs,mbs;
1341   const PetscInt  *ai,*aj;
1342   PetscReal       atmp;
1343   const MatScalar *aa;
1344   PetscScalar     *x;
1345   PetscInt        ncols,brow,bcol,krow,kcol;
1346 
1347   PetscFunctionBegin;
1348   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1349   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1350   bs  = A->rmap->bs;
1351   aa  = a->a;
1352   ai  = a->i;
1353   aj  = a->j;
1354   mbs = a->mbs;
1355 
1356   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1357   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1358   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1359   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1360   for (i=0; i<mbs; i++) {
1361     ncols = ai[1] - ai[0]; ai++;
1362     brow  = bs*i;
1363     for (j=0; j<ncols; j++) {
1364       bcol = bs*(*aj);
1365       for (kcol=0; kcol<bs; kcol++) {
1366         col = bcol + kcol;      /* col index */
1367         for (krow=0; krow<bs; krow++) {
1368           atmp = PetscAbsScalar(*aa); aa++;
1369           row  = brow + krow;   /* row index */
1370           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1371           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1372         }
1373       }
1374       aj++;
1375     }
1376   }
1377   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380