xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d4bf62d159b4eec940d9872c577cf78ea329f539)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
102   const PetscInt *irow,*aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag,sorted;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
110   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
186   const PetscInt *irow;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
197   iary = vary + a->mbs;
198   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207 
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = PetscFree(vary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
212   ISDestroy(is1);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   PetscErrorCode ierr;
221   PetscInt       i;
222 
223   PetscFunctionBegin;
224   if (scall == MAT_INITIAL_MATRIX) {
225     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<n; i++) {
229     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
230   }
231   PetscFunctionReturn(0);
232 }
233 
234 /* -------------------------------------------------------*/
235 /* Should check that shapes of vectors and matrices match */
236 /* -------------------------------------------------------*/
237 #include "petscblaslapack.h"
238 
239 #undef __FUNCT__
240 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
241 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
242 {
243   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
244   const PetscScalar *x;
245   PetscScalar       *z,x1,sum;
246   const MatScalar   *v;
247   PetscErrorCode    ierr;
248   PetscInt          mbs=a->mbs,i,j,nz;
249   const PetscInt    *aj=a->j,*ai=a->i,*ib;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(zz,0.0);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v  = a->a;
257   ib = aj;
258 
259   for (i=0; i<mbs; i++) {
260     nz   = ai[i+1] - ai[i];        /* length of i_th row of A */
261     x1   = x[i];
262     sum  = v[0]*x1;                /* diagonal term */
263     for (j=1; j<nz; j++) {
264       z[ib[j]] += v[j] * x1;       /* (strict lower triangular part of A)*x  */
265       sum      += v[j] * x[ib[j]]; /* (strict upper triangular part of A)*x  */
266     }
267     z[i] += sum;
268     v    += nz;
269     ib   += nz;
270   }
271 
272   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
273   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
274   ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr);
275   PetscFunctionReturn(0);
276 }
277 
278 #undef __FUNCT__
279 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
280 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
283   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
284   MatScalar      *v;
285   PetscErrorCode ierr;
286   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
287   PetscInt       nonzerorow=0;
288 
289   PetscFunctionBegin;
290   ierr = VecSet(zz,zero);CHKERRQ(ierr);
291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
293 
294   v     = a->a;
295   xb = x;
296 
297   for (i=0; i<mbs; i++) {
298     n  = ai[1] - ai[0]; /* length of i_th block row of A */
299     x1 = xb[0]; x2 = xb[1];
300     ib = aj + *ai;
301     jmin = 0;
302     nonzerorow += (n>0);
303     if (*ib == i){     /* (diag of A)*x */
304       z[2*i]   += v[0]*x1 + v[2]*x2;
305       z[2*i+1] += v[2]*x1 + v[3]*x2;
306       v += 4; jmin++;
307     }
308     for (j=jmin; j<n; j++) {
309       /* (strict lower triangular part of A)*x  */
310       cval       = ib[j]*2;
311       z[cval]     += v[0]*x1 + v[1]*x2;
312       z[cval+1]   += v[2]*x1 + v[3]*x2;
313       /* (strict upper triangular part of A)*x  */
314       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
315       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
316       v  += 4;
317     }
318     xb +=2; ai++;
319   }
320 
321   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
322   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
323   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
329 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
330 {
331   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
332   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
333   MatScalar      *v;
334   PetscErrorCode ierr;
335   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
336   PetscInt       nonzerorow=0;
337 
338   PetscFunctionBegin;
339   ierr = VecSet(zz,zero);CHKERRQ(ierr);
340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
341   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
342 
343   v    = a->a;
344   xb   = x;
345 
346   for (i=0; i<mbs; i++) {
347     n  = ai[1] - ai[0]; /* length of i_th block row of A */
348     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
349     ib = aj + *ai;
350     jmin = 0;
351     nonzerorow += (n>0);
352     if (*ib == i){     /* (diag of A)*x */
353       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
354       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
355       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
356       v += 9; jmin++;
357     }
358     for (j=jmin; j<n; j++) {
359       /* (strict lower triangular part of A)*x  */
360       cval       = ib[j]*3;
361       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
362       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
363       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
364       /* (strict upper triangular part of A)*x  */
365       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
366       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
367       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
368       v  += 9;
369     }
370     xb +=3; ai++;
371   }
372 
373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
375   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
376   PetscFunctionReturn(0);
377 }
378 
379 #undef __FUNCT__
380 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
381 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
382 {
383   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
384   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
385   MatScalar      *v;
386   PetscErrorCode ierr;
387   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
388   PetscInt       nonzerorow=0;
389 
390   PetscFunctionBegin;
391   ierr = VecSet(zz,zero);CHKERRQ(ierr);
392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
394 
395   v     = a->a;
396   xb = x;
397 
398   for (i=0; i<mbs; i++) {
399     n  = ai[1] - ai[0]; /* length of i_th block row of A */
400     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
401     ib = aj + *ai;
402     jmin = 0;
403     nonzerorow += (n>0);
404     if (*ib == i){     /* (diag of A)*x */
405       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
406       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
407       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
408       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
409       v += 16; jmin++;
410     }
411     for (j=jmin; j<n; j++) {
412       /* (strict lower triangular part of A)*x  */
413       cval       = ib[j]*4;
414       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
415       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
416       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
417       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
418       /* (strict upper triangular part of A)*x  */
419       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
420       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
421       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
422       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
423       v  += 16;
424     }
425     xb +=4; ai++;
426   }
427 
428   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
429   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
430   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
431   PetscFunctionReturn(0);
432 }
433 
434 #undef __FUNCT__
435 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
436 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
437 {
438   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
439   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
440   MatScalar      *v;
441   PetscErrorCode ierr;
442   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
443   PetscInt       nonzerorow=0;
444 
445   PetscFunctionBegin;
446   ierr = VecSet(zz,zero);CHKERRQ(ierr);
447   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
448   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
449 
450   v     = a->a;
451   xb = x;
452 
453   for (i=0; i<mbs; i++) {
454     n  = ai[1] - ai[0]; /* length of i_th block row of A */
455     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
456     ib = aj + *ai;
457     jmin = 0;
458     nonzerorow += (n>0);
459     if (*ib == i){      /* (diag of A)*x */
460       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
461       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
462       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
463       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
464       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
465       v += 25; jmin++;
466     }
467     for (j=jmin; j<n; j++) {
468       /* (strict lower triangular part of A)*x  */
469       cval       = ib[j]*5;
470       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
471       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
472       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
473       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
474       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
475       /* (strict upper triangular part of A)*x  */
476       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
477       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
478       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
479       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
480       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
481       v  += 25;
482     }
483     xb +=5; ai++;
484   }
485 
486   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
487   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
488   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
489   PetscFunctionReturn(0);
490 }
491 
492 
493 #undef __FUNCT__
494 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
495 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
496 {
497   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
498   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
499   MatScalar      *v;
500   PetscErrorCode ierr;
501   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
502   PetscInt       nonzerorow=0;
503 
504   PetscFunctionBegin;
505   ierr = VecSet(zz,zero);CHKERRQ(ierr);
506   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
507   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
508 
509   v     = a->a;
510   xb = x;
511 
512   for (i=0; i<mbs; i++) {
513     n  = ai[1] - ai[0]; /* length of i_th block row of A */
514     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
515     ib = aj + *ai;
516     jmin = 0;
517     nonzerorow += (n>0);
518     if (*ib == i){      /* (diag of A)*x */
519       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
520       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
521       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
522       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
523       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
524       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
525       v += 36; jmin++;
526     }
527     for (j=jmin; j<n; j++) {
528       /* (strict lower triangular part of A)*x  */
529       cval       = ib[j]*6;
530       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
531       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
532       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
533       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
534       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
535       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
536       /* (strict upper triangular part of A)*x  */
537       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
538       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
539       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
540       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
541       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
542       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
543       v  += 36;
544     }
545     xb +=6; ai++;
546   }
547 
548   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
549   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
550   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 #undef __FUNCT__
554 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
555 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
556 {
557   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
558   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
559   MatScalar      *v;
560   PetscErrorCode ierr;
561   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
562   PetscInt       nonzerorow=0;
563 
564   PetscFunctionBegin;
565   ierr = VecSet(zz,zero);CHKERRQ(ierr);
566   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
567   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
568 
569   v     = a->a;
570   xb = x;
571 
572   for (i=0; i<mbs; i++) {
573     n  = ai[1] - ai[0]; /* length of i_th block row of A */
574     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
575     ib = aj + *ai;
576     jmin = 0;
577     nonzerorow += (n>0);
578     if (*ib == i){      /* (diag of A)*x */
579       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
580       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
581       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
582       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
583       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
584       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
585       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
586       v += 49; jmin++;
587     }
588     for (j=jmin; j<n; j++) {
589       /* (strict lower triangular part of A)*x  */
590       cval       = ib[j]*7;
591       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
592       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
593       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
594       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
595       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
596       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
597       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
598       /* (strict upper triangular part of A)*x  */
599       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
600       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
601       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
602       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
603       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
604       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
605       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
606       v  += 49;
607     }
608     xb +=7; ai++;
609   }
610   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
611   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
612   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
613   PetscFunctionReturn(0);
614 }
615 
616 /*
617     This will not work with MatScalar == float because it calls the BLAS
618 */
619 #undef __FUNCT__
620 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
621 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
622 {
623   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
624   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
625   MatScalar      *v;
626   PetscErrorCode ierr;
627   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
628   PetscInt       nonzerorow=0;
629 
630   PetscFunctionBegin;
631   ierr = VecSet(zz,zero);CHKERRQ(ierr);
632   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
633   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
634 
635   aj   = a->j;
636   v    = a->a;
637   ii   = a->i;
638 
639   if (!a->mult_work) {
640     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
641   }
642   work = a->mult_work;
643 
644   for (i=0; i<mbs; i++) {
645     n     = ii[1] - ii[0]; ncols = n*bs;
646     workt = work; idx=aj+ii[0];
647     nonzerorow += (n>0);
648 
649     /* upper triangular part */
650     for (j=0; j<n; j++) {
651       xb = x_ptr + bs*(*idx++);
652       for (k=0; k<bs; k++) workt[k] = xb[k];
653       workt += bs;
654     }
655     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
656     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
657 
658     /* strict lower triangular part */
659     idx = aj+ii[0];
660     if (*idx == i){
661       ncols -= bs; v += bs2; idx++; n--;
662     }
663 
664     if (ncols > 0){
665       workt = work;
666       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
667       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
668       for (j=0; j<n; j++) {
669         zb = z_ptr + bs*(*idx++);
670         for (k=0; k<bs; k++) zb[k] += workt[k] ;
671         workt += bs;
672       }
673     }
674     x += bs; v += n*bs2; z += bs; ii++;
675   }
676 
677   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
678   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
679   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
680   PetscFunctionReturn(0);
681 }
682 
683 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
684 #undef __FUNCT__
685 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
686 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
687 {
688   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
689   PetscScalar    *x,*z,*xb,x1;
690   MatScalar      *v;
691   PetscErrorCode ierr;
692   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
693   PetscInt       nonzerorow=0;
694 
695   PetscFunctionBegin;
696   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
697   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
698   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
699   v  = a->a;
700   xb = x;
701 
702   for (i=0; i<mbs; i++) {
703     n  = ai[1] - ai[0];  /* length of i_th row of A */
704     x1 = xb[0];
705     ib = aj + *ai;
706     jmin = 0;
707     nonzerorow += (n>0);
708     if (*ib == i) {            /* (diag of A)*x */
709       z[i] += *v++ * x[*ib++]; jmin++;
710     }
711     for (j=jmin; j<n; j++) {
712       cval    = *ib;
713       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
714       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
715     }
716     xb++; ai++;
717   }
718 
719   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
720   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
721 
722   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
723   PetscFunctionReturn(0);
724 }
725 
726 #undef __FUNCT__
727 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
728 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
729 {
730   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
731   PetscScalar    *x,*z,*xb,x1,x2;
732   MatScalar      *v;
733   PetscErrorCode ierr;
734   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
735   PetscInt       nonzerorow=0;
736 
737   PetscFunctionBegin;
738   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
739   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
740   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
741 
742   v  = a->a;
743   xb = x;
744 
745   for (i=0; i<mbs; i++) {
746     n  = ai[1] - ai[0]; /* length of i_th block row of A */
747     x1 = xb[0]; x2 = xb[1];
748     ib = aj + *ai;
749     jmin = 0;
750     nonzerorow += (n>0);
751     if (*ib == i){      /* (diag of A)*x */
752       z[2*i]   += v[0]*x1 + v[2]*x2;
753       z[2*i+1] += v[2]*x1 + v[3]*x2;
754       v += 4; jmin++;
755     }
756     for (j=jmin; j<n; j++) {
757       /* (strict lower triangular part of A)*x  */
758       cval       = ib[j]*2;
759       z[cval]     += v[0]*x1 + v[1]*x2;
760       z[cval+1]   += v[2]*x1 + v[3]*x2;
761       /* (strict upper triangular part of A)*x  */
762       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
763       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
764       v  += 4;
765     }
766     xb +=2; ai++;
767   }
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770 
771   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 #undef __FUNCT__
776 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
777 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
778 {
779   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
780   PetscScalar    *x,*z,*xb,x1,x2,x3;
781   MatScalar      *v;
782   PetscErrorCode ierr;
783   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
784   PetscInt       nonzerorow=0;
785 
786   PetscFunctionBegin;
787   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
788   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
789   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
790 
791   v     = a->a;
792   xb = x;
793 
794   for (i=0; i<mbs; i++) {
795     n  = ai[1] - ai[0]; /* length of i_th block row of A */
796     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
797     ib = aj + *ai;
798     jmin = 0;
799     nonzerorow += (n>0);
800     if (*ib == i){     /* (diag of A)*x */
801      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
802      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
803      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
804      v += 9; jmin++;
805     }
806     for (j=jmin; j<n; j++) {
807       /* (strict lower triangular part of A)*x  */
808       cval       = ib[j]*3;
809       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
810       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
811       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
812       /* (strict upper triangular part of A)*x  */
813       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
814       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
815       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
816       v  += 9;
817     }
818     xb +=3; ai++;
819   }
820 
821   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
822   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
823 
824   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
825   PetscFunctionReturn(0);
826 }
827 
828 #undef __FUNCT__
829 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
830 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
831 {
832   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
833   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
834   MatScalar      *v;
835   PetscErrorCode ierr;
836   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
837   PetscInt       nonzerorow=0;
838 
839   PetscFunctionBegin;
840   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
841   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
842   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
843 
844   v     = a->a;
845   xb = x;
846 
847   for (i=0; i<mbs; i++) {
848     n  = ai[1] - ai[0]; /* length of i_th block row of A */
849     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
850     ib = aj + *ai;
851     jmin = 0;
852     nonzerorow += (n>0);
853     if (*ib == i){      /* (diag of A)*x */
854       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
855       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
856       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
857       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
858       v += 16; jmin++;
859     }
860     for (j=jmin; j<n; j++) {
861       /* (strict lower triangular part of A)*x  */
862       cval       = ib[j]*4;
863       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
864       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
865       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
866       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
867       /* (strict upper triangular part of A)*x  */
868       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
869       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
870       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
871       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
872       v  += 16;
873     }
874     xb +=4; ai++;
875   }
876 
877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
878   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
879 
880   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
881   PetscFunctionReturn(0);
882 }
883 
884 #undef __FUNCT__
885 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
886 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
887 {
888   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
889   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
890   MatScalar      *v;
891   PetscErrorCode ierr;
892   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
893   PetscInt       nonzerorow=0;
894 
895   PetscFunctionBegin;
896   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
897   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
898   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
899 
900   v     = a->a;
901   xb = x;
902 
903   for (i=0; i<mbs; i++) {
904     n  = ai[1] - ai[0]; /* length of i_th block row of A */
905     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
906     ib = aj + *ai;
907     jmin = 0;
908     nonzerorow += (n>0);
909     if (*ib == i){      /* (diag of A)*x */
910       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
911       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
912       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
913       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
914       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
915       v += 25; jmin++;
916     }
917     for (j=jmin; j<n; j++) {
918       /* (strict lower triangular part of A)*x  */
919       cval       = ib[j]*5;
920       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
921       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
922       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
923       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
924       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
925       /* (strict upper triangular part of A)*x  */
926       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
927       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
928       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
929       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
930       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
931       v  += 25;
932     }
933     xb +=5; ai++;
934   }
935 
936   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
937   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
938 
939   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
940   PetscFunctionReturn(0);
941 }
942 #undef __FUNCT__
943 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
944 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
945 {
946   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
947   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
948   MatScalar      *v;
949   PetscErrorCode ierr;
950   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
951   PetscInt       nonzerorow=0;
952 
953   PetscFunctionBegin;
954   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
955   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
956   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
957 
958   v     = a->a;
959   xb = x;
960 
961   for (i=0; i<mbs; i++) {
962     n  = ai[1] - ai[0]; /* length of i_th block row of A */
963     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
964     ib = aj + *ai;
965     jmin = 0;
966     nonzerorow += (n>0);
967     if (*ib == i){     /* (diag of A)*x */
968       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
969       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
970       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
971       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
972       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
973       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
974       v += 36; jmin++;
975     }
976     for (j=jmin; j<n; j++) {
977       /* (strict lower triangular part of A)*x  */
978       cval       = ib[j]*6;
979       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
980       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
981       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
982       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
983       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
984       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
985       /* (strict upper triangular part of A)*x  */
986       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
987       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
988       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
989       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
990       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
991       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
992       v  += 36;
993     }
994     xb +=6; ai++;
995   }
996 
997   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
998   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
999 
1000   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1001   PetscFunctionReturn(0);
1002 }
1003 
1004 #undef __FUNCT__
1005 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1006 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1007 {
1008   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1009   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1010   MatScalar      *v;
1011   PetscErrorCode ierr;
1012   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1013   PetscInt       nonzerorow=0;
1014 
1015   PetscFunctionBegin;
1016   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1019 
1020   v     = a->a;
1021   xb = x;
1022 
1023   for (i=0; i<mbs; i++) {
1024     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1025     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1026     ib = aj + *ai;
1027     jmin = 0;
1028     nonzerorow += (n>0);
1029     if (*ib == i){     /* (diag of A)*x */
1030       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1031       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1032       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1033       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1034       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1035       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1036       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1037       v += 49; jmin++;
1038     }
1039     for (j=jmin; j<n; j++) {
1040       /* (strict lower triangular part of A)*x  */
1041       cval       = ib[j]*7;
1042       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1043       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1044       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1045       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1046       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1047       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1048       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1049       /* (strict upper triangular part of A)*x  */
1050       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1051       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1052       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1053       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1054       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1055       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1056       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1057       v  += 49;
1058     }
1059     xb +=7; ai++;
1060   }
1061 
1062   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1063   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1064 
1065   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 #undef __FUNCT__
1070 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1071 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1072 {
1073   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1074   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1075   MatScalar      *v;
1076   PetscErrorCode ierr;
1077   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1078   PetscInt       nonzerorow=0;
1079 
1080   PetscFunctionBegin;
1081   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1082   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1083   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1084 
1085   aj   = a->j;
1086   v    = a->a;
1087   ii   = a->i;
1088 
1089   if (!a->mult_work) {
1090     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1091   }
1092   work = a->mult_work;
1093 
1094 
1095   for (i=0; i<mbs; i++) {
1096     n     = ii[1] - ii[0]; ncols = n*bs;
1097     workt = work; idx=aj+ii[0];
1098     nonzerorow += (n>0);
1099 
1100     /* upper triangular part */
1101     for (j=0; j<n; j++) {
1102       xb = x_ptr + bs*(*idx++);
1103       for (k=0; k<bs; k++) workt[k] = xb[k];
1104       workt += bs;
1105     }
1106     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1107     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1108 
1109     /* strict lower triangular part */
1110     idx = aj+ii[0];
1111     if (*idx == i){
1112       ncols -= bs; v += bs2; idx++; n--;
1113     }
1114     if (ncols > 0){
1115       workt = work;
1116       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1117       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1118       for (j=0; j<n; j++) {
1119         zb = z_ptr + bs*(*idx++);
1120         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1121         workt += bs;
1122       }
1123     }
1124 
1125     x += bs; v += n*bs2; z += bs; ii++;
1126   }
1127 
1128   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1129   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1130 
1131   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1132   PetscFunctionReturn(0);
1133 }
1134 
1135 #undef __FUNCT__
1136 #define __FUNCT__ "MatScale_SeqSBAIJ"
1137 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1138 {
1139   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1140   PetscScalar    oalpha = alpha;
1141   PetscErrorCode ierr;
1142   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1143 
1144   PetscFunctionBegin;
1145   BLASscal_(&totalnz,&oalpha,a->a,&one);
1146   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 #undef __FUNCT__
1151 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1152 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1153 {
1154   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1155   MatScalar      *v = a->a;
1156   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1157   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1158   PetscErrorCode ierr;
1159   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1160 
1161   PetscFunctionBegin;
1162   if (type == NORM_FROBENIUS) {
1163     for (k=0; k<mbs; k++){
1164       jmin = a->i[k]; jmax = a->i[k+1];
1165       col  = aj + jmin;
1166       if (*col == k){         /* diagonal block */
1167         for (i=0; i<bs2; i++){
1168 #if defined(PETSC_USE_COMPLEX)
1169           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1170 #else
1171           sum_diag += (*v)*(*v); v++;
1172 #endif
1173         }
1174         jmin++;
1175       }
1176       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1177         for (i=0; i<bs2; i++){
1178 #if defined(PETSC_USE_COMPLEX)
1179           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1180 #else
1181           sum_off += (*v)*(*v); v++;
1182 #endif
1183         }
1184       }
1185     }
1186     *norm = sqrt(sum_diag + 2*sum_off);
1187   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1188     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1189     jl   = il + mbs;
1190     sum  = (PetscReal*)(jl + mbs);
1191     for (i=0; i<mbs; i++) jl[i] = mbs;
1192     il[0] = 0;
1193 
1194     *norm = 0.0;
1195     for (k=0; k<mbs; k++) { /* k_th block row */
1196       for (j=0; j<bs; j++) sum[j]=0.0;
1197       /*-- col sum --*/
1198       i = jl[k]; /* first |A(i,k)| to be added */
1199       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1200                   at step k */
1201       while (i<mbs){
1202         nexti = jl[i];  /* next block row to be added */
1203         ik    = il[i];  /* block index of A(i,k) in the array a */
1204         for (j=0; j<bs; j++){
1205           v = a->a + ik*bs2 + j*bs;
1206           for (k1=0; k1<bs; k1++) {
1207             sum[j] += PetscAbsScalar(*v); v++;
1208           }
1209         }
1210         /* update il, jl */
1211         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1212         jmax = a->i[i+1];
1213         if (jmin < jmax){
1214           il[i] = jmin;
1215           j   = a->j[jmin];
1216           jl[i] = jl[j]; jl[j]=i;
1217         }
1218         i = nexti;
1219       }
1220       /*-- row sum --*/
1221       jmin = a->i[k]; jmax = a->i[k+1];
1222       for (i=jmin; i<jmax; i++) {
1223         for (j=0; j<bs; j++){
1224           v = a->a + i*bs2 + j;
1225           for (k1=0; k1<bs; k1++){
1226             sum[j] += PetscAbsScalar(*v); v += bs;
1227           }
1228         }
1229       }
1230       /* add k_th block row to il, jl */
1231       col = aj+jmin;
1232       if (*col == k) jmin++;
1233       if (jmin < jmax){
1234         il[k] = jmin;
1235         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1236       }
1237       for (j=0; j<bs; j++){
1238         if (sum[j] > *norm) *norm = sum[j];
1239       }
1240     }
1241     ierr = PetscFree(il);CHKERRQ(ierr);
1242   } else {
1243     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1244   }
1245   PetscFunctionReturn(0);
1246 }
1247 
1248 #undef __FUNCT__
1249 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1250 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1251 {
1252   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1253   PetscErrorCode ierr;
1254 
1255   PetscFunctionBegin;
1256 
1257   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1258   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1259     *flg = PETSC_FALSE;
1260     PetscFunctionReturn(0);
1261   }
1262 
1263   /* if the a->i are the same */
1264   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1265   if (!*flg) {
1266     PetscFunctionReturn(0);
1267   }
1268 
1269   /* if a->j are the same */
1270   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1271   if (!*flg) {
1272     PetscFunctionReturn(0);
1273   }
1274   /* if a->a are the same */
1275   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNCT__
1280 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1281 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1282 {
1283   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1284   PetscErrorCode ierr;
1285   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1286   PetscScalar    *x,zero = 0.0;
1287   MatScalar      *aa,*aa_j;
1288 
1289   PetscFunctionBegin;
1290   bs   = A->rmap->bs;
1291   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1292 
1293   aa   = a->a;
1294   ai   = a->i;
1295   aj   = a->j;
1296   ambs = a->mbs;
1297   bs2  = a->bs2;
1298 
1299   ierr = VecSet(v,zero);CHKERRQ(ierr);
1300   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1301   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1302   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1303   for (i=0; i<ambs; i++) {
1304     j=ai[i];
1305     if (aj[j] == i) {             /* if this is a diagonal element */
1306       row  = i*bs;
1307       aa_j = aa + j*bs2;
1308       if (A->factor && bs==1){
1309         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1310       } else {
1311         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1312       }
1313     }
1314   }
1315 
1316   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 #undef __FUNCT__
1321 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1322 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1323 {
1324   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1325   PetscScalar    *l,x,*li,*ri;
1326   MatScalar      *aa,*v;
1327   PetscErrorCode ierr;
1328   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1329   PetscTruth     flg;
1330 
1331   PetscFunctionBegin;
1332   if (ll != rr){
1333     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1334     if (!flg)
1335       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1336   }
1337   if (!ll) PetscFunctionReturn(0);
1338   ai  = a->i;
1339   aj  = a->j;
1340   aa  = a->a;
1341   m   = A->rmap->N;
1342   bs  = A->rmap->bs;
1343   mbs = a->mbs;
1344   bs2 = a->bs2;
1345 
1346   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1347   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1348   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1349   for (i=0; i<mbs; i++) { /* for each block row */
1350     M  = ai[i+1] - ai[i];
1351     li = l + i*bs;
1352     v  = aa + bs2*ai[i];
1353     for (j=0; j<M; j++) { /* for each block */
1354       ri = l + bs*aj[ai[i]+j];
1355       for (k=0; k<bs; k++) {
1356         x = ri[k];
1357         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1358       }
1359     }
1360   }
1361   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1362   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 #undef __FUNCT__
1367 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1368 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1369 {
1370   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1371 
1372   PetscFunctionBegin;
1373   info->block_size     = a->bs2;
1374   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1375   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1376   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1377   info->assemblies   = A->num_ass;
1378   info->mallocs      = a->reallocs;
1379   info->memory       = ((PetscObject)A)->mem;
1380   if (A->factor) {
1381     info->fill_ratio_given  = A->info.fill_ratio_given;
1382     info->fill_ratio_needed = A->info.fill_ratio_needed;
1383     info->factor_mallocs    = A->info.factor_mallocs;
1384   } else {
1385     info->fill_ratio_given  = 0;
1386     info->fill_ratio_needed = 0;
1387     info->factor_mallocs    = 0;
1388   }
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 
1393 #undef __FUNCT__
1394 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1395 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1396 {
1397   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1398   PetscErrorCode ierr;
1399 
1400   PetscFunctionBegin;
1401   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1402   PetscFunctionReturn(0);
1403 }
1404 
1405 #undef __FUNCT__
1406 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1407 /*
1408    This code does not work since it only checks the upper triangular part of
1409   the matrix. Hence it is not listed in the function table.
1410 */
1411 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1412 {
1413   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1414   PetscErrorCode ierr;
1415   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1416   PetscReal      atmp;
1417   MatScalar      *aa;
1418   PetscScalar    *x;
1419   PetscInt       ncols,brow,bcol,krow,kcol;
1420 
1421   PetscFunctionBegin;
1422   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1423   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1424   bs   = A->rmap->bs;
1425   aa   = a->a;
1426   ai   = a->i;
1427   aj   = a->j;
1428   mbs = a->mbs;
1429 
1430   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1431   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1432   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1433   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1434   for (i=0; i<mbs; i++) {
1435     ncols = ai[1] - ai[0]; ai++;
1436     brow  = bs*i;
1437     for (j=0; j<ncols; j++){
1438       bcol = bs*(*aj);
1439       for (kcol=0; kcol<bs; kcol++){
1440         col = bcol + kcol;      /* col index */
1441         for (krow=0; krow<bs; krow++){
1442           atmp = PetscAbsScalar(*aa); aa++;
1443           row = brow + krow;    /* row index */
1444           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1445           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1446         }
1447       }
1448       aj++;
1449     }
1450   }
1451   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1452   PetscFunctionReturn(0);
1453 }
1454