xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 6abefa4f446b2ea91915da4e25ea0defa83db48e)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 #include "petscblaslapack.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   const PetscInt *idx;
17   PetscBT        table,table0;
18 
19   PetscFunctionBegin;
20   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21   mbs = a->mbs;
22   ai  = a->i;
23   aj  = a->j;
24   bs  = A->rmap->bs;
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103   const PetscInt *irow,*aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscTruth     flag,sorted;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126       for (k=kstart; k<kend; k++) {
127         if (ssmap[aj[k]]) {
128           lens[i]++;
129         }
130       }
131     }
132   /* Create and fill new matrix */
133   if (scall == MAT_REUSE_MATRIX) {
134     c = (Mat_SeqSBAIJ *)((*B)->data);
135 
136     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
138     if (!flag) {
139       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
140     }
141     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
142     C = *B;
143   } else {
144     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
145     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
146     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184   IS             is1;
185   PetscErrorCode ierr;
186   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187   const PetscInt *irow;
188 
189   PetscFunctionBegin;
190   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191 
192   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
194 
195   /* Verify if the indices corespond to each element in a block
196    and form the IS with compressed IS */
197   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
198   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
208   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
211   ierr = ISDestroy(is1);CHKERRQ(ierr);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
239 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
240 {
241   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
242   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
243   MatScalar      *v;
244   PetscErrorCode ierr;
245   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
246   PetscInt       nonzerorow=0;
247 
248   PetscFunctionBegin;
249   ierr = VecSet(zz,zero);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
252 
253   v     = a->a;
254   xb = x;
255 
256   for (i=0; i<mbs; i++) {
257     n  = ai[1] - ai[0]; /* length of i_th block row of A */
258     x1 = xb[0]; x2 = xb[1];
259     ib = aj + *ai;
260     jmin = 0;
261     nonzerorow += (n>0);
262     if (*ib == i){     /* (diag of A)*x */
263       z[2*i]   += v[0]*x1 + v[2]*x2;
264       z[2*i+1] += v[2]*x1 + v[3]*x2;
265       v += 4; jmin++;
266     }
267     for (j=jmin; j<n; j++) {
268       /* (strict lower triangular part of A)*x  */
269       cval       = ib[j]*2;
270       z[cval]     += v[0]*x1 + v[1]*x2;
271       z[cval+1]   += v[2]*x1 + v[3]*x2;
272       /* (strict upper triangular part of A)*x  */
273       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
274       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
275       v  += 4;
276     }
277     xb +=2; ai++;
278   }
279 
280   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
281   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
282   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
283   PetscFunctionReturn(0);
284 }
285 
286 #undef __FUNCT__
287 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
288 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
291   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
292   MatScalar      *v;
293   PetscErrorCode ierr;
294   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
295   PetscInt       nonzerorow=0;
296 
297   PetscFunctionBegin;
298   ierr = VecSet(zz,zero);CHKERRQ(ierr);
299   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
300   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
301 
302   v    = a->a;
303   xb   = x;
304 
305   for (i=0; i<mbs; i++) {
306     n  = ai[1] - ai[0]; /* length of i_th block row of A */
307     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
308     ib = aj + *ai;
309     jmin = 0;
310     nonzerorow += (n>0);
311     if (*ib == i){     /* (diag of A)*x */
312       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
313       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
314       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
315       v += 9; jmin++;
316     }
317     for (j=jmin; j<n; j++) {
318       /* (strict lower triangular part of A)*x  */
319       cval       = ib[j]*3;
320       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
321       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
322       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
323       /* (strict upper triangular part of A)*x  */
324       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
325       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
326       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
327       v  += 9;
328     }
329     xb +=3; ai++;
330   }
331 
332   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
333   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
334   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
335   PetscFunctionReturn(0);
336 }
337 
338 #undef __FUNCT__
339 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
340 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
341 {
342   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
343   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
344   MatScalar      *v;
345   PetscErrorCode ierr;
346   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
347   PetscInt       nonzerorow=0;
348 
349   PetscFunctionBegin;
350   ierr = VecSet(zz,zero);CHKERRQ(ierr);
351   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
352   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
353 
354   v     = a->a;
355   xb = x;
356 
357   for (i=0; i<mbs; i++) {
358     n  = ai[1] - ai[0]; /* length of i_th block row of A */
359     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
360     ib = aj + *ai;
361     jmin = 0;
362     nonzerorow += (n>0);
363     if (*ib == i){     /* (diag of A)*x */
364       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
365       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
366       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
367       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
368       v += 16; jmin++;
369     }
370     for (j=jmin; j<n; j++) {
371       /* (strict lower triangular part of A)*x  */
372       cval       = ib[j]*4;
373       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
374       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
375       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
376       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
377       /* (strict upper triangular part of A)*x  */
378       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
379       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
380       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
381       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
382       v  += 16;
383     }
384     xb +=4; ai++;
385   }
386 
387   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
388   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
389   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
395 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
396 {
397   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
398   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
399   MatScalar      *v;
400   PetscErrorCode ierr;
401   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
402   PetscInt       nonzerorow=0;
403 
404   PetscFunctionBegin;
405   ierr = VecSet(zz,zero);CHKERRQ(ierr);
406   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
407   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
408 
409   v     = a->a;
410   xb = x;
411 
412   for (i=0; i<mbs; i++) {
413     n  = ai[1] - ai[0]; /* length of i_th block row of A */
414     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
415     ib = aj + *ai;
416     jmin = 0;
417     nonzerorow += (n>0);
418     if (*ib == i){      /* (diag of A)*x */
419       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
420       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
421       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
422       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
423       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
424       v += 25; jmin++;
425     }
426     for (j=jmin; j<n; j++) {
427       /* (strict lower triangular part of A)*x  */
428       cval       = ib[j]*5;
429       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
430       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
431       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
432       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
433       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
434       /* (strict upper triangular part of A)*x  */
435       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
436       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
437       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
438       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
439       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
440       v  += 25;
441     }
442     xb +=5; ai++;
443   }
444 
445   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
446   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
447   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
448   PetscFunctionReturn(0);
449 }
450 
451 
452 #undef __FUNCT__
453 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
454 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
455 {
456   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
457   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
458   MatScalar      *v;
459   PetscErrorCode ierr;
460   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
461   PetscInt       nonzerorow=0;
462 
463   PetscFunctionBegin;
464   ierr = VecSet(zz,zero);CHKERRQ(ierr);
465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
466   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
467 
468   v     = a->a;
469   xb = x;
470 
471   for (i=0; i<mbs; i++) {
472     n  = ai[1] - ai[0]; /* length of i_th block row of A */
473     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
474     ib = aj + *ai;
475     jmin = 0;
476     nonzerorow += (n>0);
477     if (*ib == i){      /* (diag of A)*x */
478       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
479       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
480       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
481       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
482       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
483       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
484       v += 36; jmin++;
485     }
486     for (j=jmin; j<n; j++) {
487       /* (strict lower triangular part of A)*x  */
488       cval       = ib[j]*6;
489       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
490       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
491       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
492       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
493       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
494       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
495       /* (strict upper triangular part of A)*x  */
496       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
497       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
498       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
499       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
500       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
501       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
502       v  += 36;
503     }
504     xb +=6; ai++;
505   }
506 
507   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
508   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
510   PetscFunctionReturn(0);
511 }
512 #undef __FUNCT__
513 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
514 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
517   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
518   MatScalar      *v;
519   PetscErrorCode ierr;
520   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
521   PetscInt       nonzerorow=0;
522 
523   PetscFunctionBegin;
524   ierr = VecSet(zz,zero);CHKERRQ(ierr);
525   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
526   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
527 
528   v     = a->a;
529   xb = x;
530 
531   for (i=0; i<mbs; i++) {
532     n  = ai[1] - ai[0]; /* length of i_th block row of A */
533     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
534     ib = aj + *ai;
535     jmin = 0;
536     nonzerorow += (n>0);
537     if (*ib == i){      /* (diag of A)*x */
538       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
539       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
540       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
541       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
542       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
543       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
544       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
545       v += 49; jmin++;
546     }
547     for (j=jmin; j<n; j++) {
548       /* (strict lower triangular part of A)*x  */
549       cval       = ib[j]*7;
550       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
551       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
552       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
553       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
554       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
555       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
556       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
557       /* (strict upper triangular part of A)*x  */
558       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
559       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
560       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
561       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
562       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
563       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
564       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
565       v  += 49;
566     }
567     xb +=7; ai++;
568   }
569   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
570   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
571   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
572   PetscFunctionReturn(0);
573 }
574 
575 /*
576     This will not work with MatScalar == float because it calls the BLAS
577 */
578 #undef __FUNCT__
579 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
580 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
581 {
582   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
583   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
584   MatScalar      *v;
585   PetscErrorCode ierr;
586   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
587   PetscInt       nonzerorow=0;
588 
589   PetscFunctionBegin;
590   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
592   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
593 
594   aj   = a->j;
595   v    = a->a;
596   ii   = a->i;
597 
598   if (!a->mult_work) {
599     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
600   }
601   work = a->mult_work;
602 
603   for (i=0; i<mbs; i++) {
604     n     = ii[1] - ii[0]; ncols = n*bs;
605     workt = work; idx=aj+ii[0];
606     nonzerorow += (n>0);
607 
608     /* upper triangular part */
609     for (j=0; j<n; j++) {
610       xb = x_ptr + bs*(*idx++);
611       for (k=0; k<bs; k++) workt[k] = xb[k];
612       workt += bs;
613     }
614     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
615     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
616 
617     /* strict lower triangular part */
618     idx = aj+ii[0];
619     if (*idx == i){
620       ncols -= bs; v += bs2; idx++; n--;
621     }
622 
623     if (ncols > 0){
624       workt = work;
625       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
626       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
627       for (j=0; j<n; j++) {
628         zb = z_ptr + bs*(*idx++);
629         for (k=0; k<bs; k++) zb[k] += workt[k] ;
630         workt += bs;
631       }
632     }
633     x += bs; v += n*bs2; z += bs; ii++;
634   }
635 
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
638   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
639   PetscFunctionReturn(0);
640 }
641 
642 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
643 #undef __FUNCT__
644 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
645 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
646 {
647   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
648   PetscScalar    *x,*z,*xb,x1;
649   MatScalar      *v;
650   PetscErrorCode ierr;
651   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
652   PetscInt       nonzerorow=0;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
656   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
657   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
658   v  = a->a;
659   xb = x;
660 
661   for (i=0; i<mbs; i++) {
662     n  = ai[1] - ai[0];  /* length of i_th row of A */
663     x1 = xb[0];
664     ib = aj + *ai;
665     jmin = 0;
666     nonzerorow += (n>0);
667     if (*ib == i) {            /* (diag of A)*x */
668       z[i] += *v++ * x[*ib++]; jmin++;
669     }
670     for (j=jmin; j<n; j++) {
671       cval    = *ib;
672       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
673       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
674     }
675     xb++; ai++;
676   }
677 
678   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
679   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
680 
681   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
685 #undef __FUNCT__
686 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
687 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
688 {
689   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
690   PetscScalar    *x,*z,*xb,x1,x2;
691   MatScalar      *v;
692   PetscErrorCode ierr;
693   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
694   PetscInt       nonzerorow=0;
695 
696   PetscFunctionBegin;
697   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
698   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
699   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
700 
701   v  = a->a;
702   xb = x;
703 
704   for (i=0; i<mbs; i++) {
705     n  = ai[1] - ai[0]; /* length of i_th block row of A */
706     x1 = xb[0]; x2 = xb[1];
707     ib = aj + *ai;
708     jmin = 0;
709     nonzerorow += (n>0);
710     if (*ib == i){      /* (diag of A)*x */
711       z[2*i]   += v[0]*x1 + v[2]*x2;
712       z[2*i+1] += v[2]*x1 + v[3]*x2;
713       v += 4; jmin++;
714     }
715     for (j=jmin; j<n; j++) {
716       /* (strict lower triangular part of A)*x  */
717       cval       = ib[j]*2;
718       z[cval]     += v[0]*x1 + v[1]*x2;
719       z[cval+1]   += v[2]*x1 + v[3]*x2;
720       /* (strict upper triangular part of A)*x  */
721       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
722       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
723       v  += 4;
724     }
725     xb +=2; ai++;
726   }
727   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
728   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
729 
730   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
731   PetscFunctionReturn(0);
732 }
733 
734 #undef __FUNCT__
735 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
736 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
737 {
738   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
739   PetscScalar    *x,*z,*xb,x1,x2,x3;
740   MatScalar      *v;
741   PetscErrorCode ierr;
742   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
743   PetscInt       nonzerorow=0;
744 
745   PetscFunctionBegin;
746   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
747   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
748   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
749 
750   v     = a->a;
751   xb = x;
752 
753   for (i=0; i<mbs; i++) {
754     n  = ai[1] - ai[0]; /* length of i_th block row of A */
755     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
756     ib = aj + *ai;
757     jmin = 0;
758     nonzerorow += (n>0);
759     if (*ib == i){     /* (diag of A)*x */
760      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
761      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
762      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
763      v += 9; jmin++;
764     }
765     for (j=jmin; j<n; j++) {
766       /* (strict lower triangular part of A)*x  */
767       cval       = ib[j]*3;
768       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
769       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
770       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
771       /* (strict upper triangular part of A)*x  */
772       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
773       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
774       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
775       v  += 9;
776     }
777     xb +=3; ai++;
778   }
779 
780   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
781   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
782 
783   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
784   PetscFunctionReturn(0);
785 }
786 
787 #undef __FUNCT__
788 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
789 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
790 {
791   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
792   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
793   MatScalar      *v;
794   PetscErrorCode ierr;
795   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
796   PetscInt       nonzerorow=0;
797 
798   PetscFunctionBegin;
799   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
802 
803   v     = a->a;
804   xb = x;
805 
806   for (i=0; i<mbs; i++) {
807     n  = ai[1] - ai[0]; /* length of i_th block row of A */
808     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
809     ib = aj + *ai;
810     jmin = 0;
811     nonzerorow += (n>0);
812     if (*ib == i){      /* (diag of A)*x */
813       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
814       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
815       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
816       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
817       v += 16; jmin++;
818     }
819     for (j=jmin; j<n; j++) {
820       /* (strict lower triangular part of A)*x  */
821       cval       = ib[j]*4;
822       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
823       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
824       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
825       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
826       /* (strict upper triangular part of A)*x  */
827       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
828       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
829       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
830       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
831       v  += 16;
832     }
833     xb +=4; ai++;
834   }
835 
836   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
837   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
838 
839   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 
843 #undef __FUNCT__
844 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
845 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
846 {
847   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
848   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
849   MatScalar      *v;
850   PetscErrorCode ierr;
851   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
852   PetscInt       nonzerorow=0;
853 
854   PetscFunctionBegin;
855   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
856   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
857   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
858 
859   v     = a->a;
860   xb = x;
861 
862   for (i=0; i<mbs; i++) {
863     n  = ai[1] - ai[0]; /* length of i_th block row of A */
864     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
865     ib = aj + *ai;
866     jmin = 0;
867     nonzerorow += (n>0);
868     if (*ib == i){      /* (diag of A)*x */
869       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
870       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
871       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
872       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
873       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
874       v += 25; jmin++;
875     }
876     for (j=jmin; j<n; j++) {
877       /* (strict lower triangular part of A)*x  */
878       cval       = ib[j]*5;
879       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
880       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
881       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
882       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
883       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
884       /* (strict upper triangular part of A)*x  */
885       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
886       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
887       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
888       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
889       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
890       v  += 25;
891     }
892     xb +=5; ai++;
893   }
894 
895   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
896   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
897 
898   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
899   PetscFunctionReturn(0);
900 }
901 #undef __FUNCT__
902 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
903 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
904 {
905   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
906   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
907   MatScalar      *v;
908   PetscErrorCode ierr;
909   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
910   PetscInt       nonzerorow=0;
911 
912   PetscFunctionBegin;
913   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
914   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
915   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
916 
917   v     = a->a;
918   xb = x;
919 
920   for (i=0; i<mbs; i++) {
921     n  = ai[1] - ai[0]; /* length of i_th block row of A */
922     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
923     ib = aj + *ai;
924     jmin = 0;
925     nonzerorow += (n>0);
926     if (*ib == i){     /* (diag of A)*x */
927       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
928       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
929       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
930       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
931       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
932       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
933       v += 36; jmin++;
934     }
935     for (j=jmin; j<n; j++) {
936       /* (strict lower triangular part of A)*x  */
937       cval       = ib[j]*6;
938       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
939       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
940       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
941       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
942       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
943       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
944       /* (strict upper triangular part of A)*x  */
945       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
946       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
947       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
948       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
949       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
950       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
951       v  += 36;
952     }
953     xb +=6; ai++;
954   }
955 
956   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
957   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
958 
959   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
960   PetscFunctionReturn(0);
961 }
962 
963 #undef __FUNCT__
964 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
965 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
966 {
967   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
968   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
969   MatScalar      *v;
970   PetscErrorCode ierr;
971   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
972   PetscInt       nonzerorow=0;
973 
974   PetscFunctionBegin;
975   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
977   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
978 
979   v     = a->a;
980   xb = x;
981 
982   for (i=0; i<mbs; i++) {
983     n  = ai[1] - ai[0]; /* length of i_th block row of A */
984     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
985     ib = aj + *ai;
986     jmin = 0;
987     nonzerorow += (n>0);
988     if (*ib == i){     /* (diag of A)*x */
989       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
990       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
991       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
992       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
993       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
994       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
995       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
996       v += 49; jmin++;
997     }
998     for (j=jmin; j<n; j++) {
999       /* (strict lower triangular part of A)*x  */
1000       cval       = ib[j]*7;
1001       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1002       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1003       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1004       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1005       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1006       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1007       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1008       /* (strict upper triangular part of A)*x  */
1009       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1010       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1011       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1012       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1013       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1014       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1015       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1016       v  += 49;
1017     }
1018     xb +=7; ai++;
1019   }
1020 
1021   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1022   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1023 
1024   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 #undef __FUNCT__
1029 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1030 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1031 {
1032   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1033   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1034   MatScalar      *v;
1035   PetscErrorCode ierr;
1036   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1037   PetscInt       nonzerorow=0;
1038 
1039   PetscFunctionBegin;
1040   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1041   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1042   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1043 
1044   aj   = a->j;
1045   v    = a->a;
1046   ii   = a->i;
1047 
1048   if (!a->mult_work) {
1049     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1050   }
1051   work = a->mult_work;
1052 
1053 
1054   for (i=0; i<mbs; i++) {
1055     n     = ii[1] - ii[0]; ncols = n*bs;
1056     workt = work; idx=aj+ii[0];
1057     nonzerorow += (n>0);
1058 
1059     /* upper triangular part */
1060     for (j=0; j<n; j++) {
1061       xb = x_ptr + bs*(*idx++);
1062       for (k=0; k<bs; k++) workt[k] = xb[k];
1063       workt += bs;
1064     }
1065     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1066     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1067 
1068     /* strict lower triangular part */
1069     idx = aj+ii[0];
1070     if (*idx == i){
1071       ncols -= bs; v += bs2; idx++; n--;
1072     }
1073     if (ncols > 0){
1074       workt = work;
1075       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1076       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1077       for (j=0; j<n; j++) {
1078         zb = z_ptr + bs*(*idx++);
1079         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1080         workt += bs;
1081       }
1082     }
1083 
1084     x += bs; v += n*bs2; z += bs; ii++;
1085   }
1086 
1087   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1088   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1089 
1090   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 #undef __FUNCT__
1095 #define __FUNCT__ "MatScale_SeqSBAIJ"
1096 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1097 {
1098   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1099   PetscScalar    oalpha = alpha;
1100   PetscErrorCode ierr;
1101   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1102 
1103   PetscFunctionBegin;
1104   BLASscal_(&totalnz,&oalpha,a->a,&one);
1105   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 #undef __FUNCT__
1110 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1111 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1112 {
1113   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1114   MatScalar      *v = a->a;
1115   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1116   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1117   PetscErrorCode ierr;
1118   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1119 
1120   PetscFunctionBegin;
1121   if (type == NORM_FROBENIUS) {
1122     for (k=0; k<mbs; k++){
1123       jmin = a->i[k]; jmax = a->i[k+1];
1124       col  = aj + jmin;
1125       if (*col == k){         /* diagonal block */
1126         for (i=0; i<bs2; i++){
1127 #if defined(PETSC_USE_COMPLEX)
1128           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1129 #else
1130           sum_diag += (*v)*(*v); v++;
1131 #endif
1132         }
1133         jmin++;
1134       }
1135       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1136         for (i=0; i<bs2; i++){
1137 #if defined(PETSC_USE_COMPLEX)
1138           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1139 #else
1140           sum_off += (*v)*(*v); v++;
1141 #endif
1142         }
1143       }
1144     }
1145     *norm = sqrt(sum_diag + 2*sum_off);
1146   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1147     ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1148     for (i=0; i<mbs; i++) jl[i] = mbs;
1149     il[0] = 0;
1150 
1151     *norm = 0.0;
1152     for (k=0; k<mbs; k++) { /* k_th block row */
1153       for (j=0; j<bs; j++) sum[j]=0.0;
1154       /*-- col sum --*/
1155       i = jl[k]; /* first |A(i,k)| to be added */
1156       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1157                   at step k */
1158       while (i<mbs){
1159         nexti = jl[i];  /* next block row to be added */
1160         ik    = il[i];  /* block index of A(i,k) in the array a */
1161         for (j=0; j<bs; j++){
1162           v = a->a + ik*bs2 + j*bs;
1163           for (k1=0; k1<bs; k1++) {
1164             sum[j] += PetscAbsScalar(*v); v++;
1165           }
1166         }
1167         /* update il, jl */
1168         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1169         jmax = a->i[i+1];
1170         if (jmin < jmax){
1171           il[i] = jmin;
1172           j   = a->j[jmin];
1173           jl[i] = jl[j]; jl[j]=i;
1174         }
1175         i = nexti;
1176       }
1177       /*-- row sum --*/
1178       jmin = a->i[k]; jmax = a->i[k+1];
1179       for (i=jmin; i<jmax; i++) {
1180         for (j=0; j<bs; j++){
1181           v = a->a + i*bs2 + j;
1182           for (k1=0; k1<bs; k1++){
1183             sum[j] += PetscAbsScalar(*v); v += bs;
1184           }
1185         }
1186       }
1187       /* add k_th block row to il, jl */
1188       col = aj+jmin;
1189       if (*col == k) jmin++;
1190       if (jmin < jmax){
1191         il[k] = jmin;
1192         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1193       }
1194       for (j=0; j<bs; j++){
1195         if (sum[j] > *norm) *norm = sum[j];
1196       }
1197     }
1198     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1199   } else {
1200     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1201   }
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 #undef __FUNCT__
1206 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1207 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1208 {
1209   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1210   PetscErrorCode ierr;
1211 
1212   PetscFunctionBegin;
1213 
1214   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1215   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1216     *flg = PETSC_FALSE;
1217     PetscFunctionReturn(0);
1218   }
1219 
1220   /* if the a->i are the same */
1221   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1222   if (!*flg) {
1223     PetscFunctionReturn(0);
1224   }
1225 
1226   /* if a->j are the same */
1227   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1228   if (!*flg) {
1229     PetscFunctionReturn(0);
1230   }
1231   /* if a->a are the same */
1232   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1233   PetscFunctionReturn(0);
1234 }
1235 
1236 #undef __FUNCT__
1237 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1238 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1239 {
1240   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1241   PetscErrorCode ierr;
1242   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1243   PetscScalar    *x,zero = 0.0;
1244   MatScalar      *aa,*aa_j;
1245 
1246   PetscFunctionBegin;
1247   bs   = A->rmap->bs;
1248   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1249 
1250   aa   = a->a;
1251   ai   = a->i;
1252   aj   = a->j;
1253   ambs = a->mbs;
1254   bs2  = a->bs2;
1255 
1256   ierr = VecSet(v,zero);CHKERRQ(ierr);
1257   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1258   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1259   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1260   for (i=0; i<ambs; i++) {
1261     j=ai[i];
1262     if (aj[j] == i) {             /* if this is a diagonal element */
1263       row  = i*bs;
1264       aa_j = aa + j*bs2;
1265       if (A->factor && bs==1){
1266         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1267       } else {
1268         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1269       }
1270     }
1271   }
1272 
1273   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1279 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1280 {
1281   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1282   PetscScalar    *l,x,*li,*ri;
1283   MatScalar      *aa,*v;
1284   PetscErrorCode ierr;
1285   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1286   PetscTruth     flg;
1287 
1288   PetscFunctionBegin;
1289   if (ll != rr){
1290     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1291     if (!flg)
1292       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1293   }
1294   if (!ll) PetscFunctionReturn(0);
1295   ai  = a->i;
1296   aj  = a->j;
1297   aa  = a->a;
1298   m   = A->rmap->N;
1299   bs  = A->rmap->bs;
1300   mbs = a->mbs;
1301   bs2 = a->bs2;
1302 
1303   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1304   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1305   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1306   for (i=0; i<mbs; i++) { /* for each block row */
1307     M  = ai[i+1] - ai[i];
1308     li = l + i*bs;
1309     v  = aa + bs2*ai[i];
1310     for (j=0; j<M; j++) { /* for each block */
1311       ri = l + bs*aj[ai[i]+j];
1312       for (k=0; k<bs; k++) {
1313         x = ri[k];
1314         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1315       }
1316     }
1317   }
1318   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1319   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1320   PetscFunctionReturn(0);
1321 }
1322 
1323 #undef __FUNCT__
1324 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1325 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1326 {
1327   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1328 
1329   PetscFunctionBegin;
1330   info->block_size     = a->bs2;
1331   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1332   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1333   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1334   info->assemblies   = A->num_ass;
1335   info->mallocs      = a->reallocs;
1336   info->memory       = ((PetscObject)A)->mem;
1337   if (A->factor) {
1338     info->fill_ratio_given  = A->info.fill_ratio_given;
1339     info->fill_ratio_needed = A->info.fill_ratio_needed;
1340     info->factor_mallocs    = A->info.factor_mallocs;
1341   } else {
1342     info->fill_ratio_given  = 0;
1343     info->fill_ratio_needed = 0;
1344     info->factor_mallocs    = 0;
1345   }
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 
1350 #undef __FUNCT__
1351 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1352 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1353 {
1354   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1355   PetscErrorCode ierr;
1356 
1357   PetscFunctionBegin;
1358   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1364 /*
1365    This code does not work since it only checks the upper triangular part of
1366   the matrix. Hence it is not listed in the function table.
1367 */
1368 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1369 {
1370   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1371   PetscErrorCode ierr;
1372   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1373   PetscReal      atmp;
1374   MatScalar      *aa;
1375   PetscScalar    *x;
1376   PetscInt       ncols,brow,bcol,krow,kcol;
1377 
1378   PetscFunctionBegin;
1379   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1380   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1381   bs   = A->rmap->bs;
1382   aa   = a->a;
1383   ai   = a->i;
1384   aj   = a->j;
1385   mbs = a->mbs;
1386 
1387   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1388   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1389   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1390   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1391   for (i=0; i<mbs; i++) {
1392     ncols = ai[1] - ai[0]; ai++;
1393     brow  = bs*i;
1394     for (j=0; j<ncols; j++){
1395       bcol = bs*(*aj);
1396       for (kcol=0; kcol<bs; kcol++){
1397         col = bcol + kcol;      /* col index */
1398         for (krow=0; krow<bs; krow++){
1399           atmp = PetscAbsScalar(*aa); aa++;
1400           row = brow + krow;    /* row index */
1401           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1402           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1403         }
1404       }
1405       aj++;
1406     }
1407   }
1408   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1409   PetscFunctionReturn(0);
1410 }
1411