xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision c05d87d6c2fa1d0952fe676de20339d09daa79bc)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
102   const PetscInt *irow,*aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag,sorted;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
110   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
117   ssmap = smap;
118   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
186   const PetscInt *irow;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
197   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
208 
209   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
210   ierr = ISDestroy(is1);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 #undef __FUNCT__
215 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
216 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
217 {
218   PetscErrorCode ierr;
219   PetscInt       i;
220 
221   PetscFunctionBegin;
222   if (scall == MAT_INITIAL_MATRIX) {
223     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
224   }
225 
226   for (i=0; i<n; i++) {
227     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
228   }
229   PetscFunctionReturn(0);
230 }
231 
232 /* -------------------------------------------------------*/
233 /* Should check that shapes of vectors and matrices match */
234 /* -------------------------------------------------------*/
235 #include "petscblaslapack.h"
236 
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
240 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
244   MatScalar      *v;
245   PetscErrorCode ierr;
246   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
247   PetscInt       nonzerorow=0;
248 
249   PetscFunctionBegin;
250   ierr = VecSet(zz,zero);CHKERRQ(ierr);
251   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
252   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
253 
254   v     = a->a;
255   xb = x;
256 
257   for (i=0; i<mbs; i++) {
258     n  = ai[1] - ai[0]; /* length of i_th block row of A */
259     x1 = xb[0]; x2 = xb[1];
260     ib = aj + *ai;
261     jmin = 0;
262     nonzerorow += (n>0);
263     if (*ib == i){     /* (diag of A)*x */
264       z[2*i]   += v[0]*x1 + v[2]*x2;
265       z[2*i+1] += v[2]*x1 + v[3]*x2;
266       v += 4; jmin++;
267     }
268     for (j=jmin; j<n; j++) {
269       /* (strict lower triangular part of A)*x  */
270       cval       = ib[j]*2;
271       z[cval]     += v[0]*x1 + v[1]*x2;
272       z[cval+1]   += v[2]*x1 + v[3]*x2;
273       /* (strict upper triangular part of A)*x  */
274       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
275       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
276       v  += 4;
277     }
278     xb +=2; ai++;
279   }
280 
281   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
282   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
283   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
284   PetscFunctionReturn(0);
285 }
286 
287 #undef __FUNCT__
288 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
289 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
290 {
291   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
292   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
293   MatScalar      *v;
294   PetscErrorCode ierr;
295   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
296   PetscInt       nonzerorow=0;
297 
298   PetscFunctionBegin;
299   ierr = VecSet(zz,zero);CHKERRQ(ierr);
300   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
301   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
302 
303   v    = a->a;
304   xb   = x;
305 
306   for (i=0; i<mbs; i++) {
307     n  = ai[1] - ai[0]; /* length of i_th block row of A */
308     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
309     ib = aj + *ai;
310     jmin = 0;
311     nonzerorow += (n>0);
312     if (*ib == i){     /* (diag of A)*x */
313       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
314       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
315       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
316       v += 9; jmin++;
317     }
318     for (j=jmin; j<n; j++) {
319       /* (strict lower triangular part of A)*x  */
320       cval       = ib[j]*3;
321       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
322       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
323       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
324       /* (strict upper triangular part of A)*x  */
325       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
326       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
327       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
328       v  += 9;
329     }
330     xb +=3; ai++;
331   }
332 
333   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
334   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
335   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 #undef __FUNCT__
340 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
341 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
342 {
343   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
344   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
345   MatScalar      *v;
346   PetscErrorCode ierr;
347   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
348   PetscInt       nonzerorow=0;
349 
350   PetscFunctionBegin;
351   ierr = VecSet(zz,zero);CHKERRQ(ierr);
352   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
353   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
354 
355   v     = a->a;
356   xb = x;
357 
358   for (i=0; i<mbs; i++) {
359     n  = ai[1] - ai[0]; /* length of i_th block row of A */
360     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
361     ib = aj + *ai;
362     jmin = 0;
363     nonzerorow += (n>0);
364     if (*ib == i){     /* (diag of A)*x */
365       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
366       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
367       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
368       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
369       v += 16; jmin++;
370     }
371     for (j=jmin; j<n; j++) {
372       /* (strict lower triangular part of A)*x  */
373       cval       = ib[j]*4;
374       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
375       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
376       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
377       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
378       /* (strict upper triangular part of A)*x  */
379       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
380       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
381       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
382       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
383       v  += 16;
384     }
385     xb +=4; ai++;
386   }
387 
388   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
389   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
390   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
391   PetscFunctionReturn(0);
392 }
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
396 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
397 {
398   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
399   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
400   MatScalar      *v;
401   PetscErrorCode ierr;
402   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
403   PetscInt       nonzerorow=0;
404 
405   PetscFunctionBegin;
406   ierr = VecSet(zz,zero);CHKERRQ(ierr);
407   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
408   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
409 
410   v     = a->a;
411   xb = x;
412 
413   for (i=0; i<mbs; i++) {
414     n  = ai[1] - ai[0]; /* length of i_th block row of A */
415     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
416     ib = aj + *ai;
417     jmin = 0;
418     nonzerorow += (n>0);
419     if (*ib == i){      /* (diag of A)*x */
420       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
421       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
422       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
423       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
424       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
425       v += 25; jmin++;
426     }
427     for (j=jmin; j<n; j++) {
428       /* (strict lower triangular part of A)*x  */
429       cval       = ib[j]*5;
430       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
431       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
432       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
433       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
434       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
435       /* (strict upper triangular part of A)*x  */
436       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
437       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
438       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
439       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
440       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
441       v  += 25;
442     }
443     xb +=5; ai++;
444   }
445 
446   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
447   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
448   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
449   PetscFunctionReturn(0);
450 }
451 
452 
453 #undef __FUNCT__
454 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
455 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
456 {
457   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
458   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
459   MatScalar      *v;
460   PetscErrorCode ierr;
461   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
462   PetscInt       nonzerorow=0;
463 
464   PetscFunctionBegin;
465   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
467   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
468 
469   v     = a->a;
470   xb = x;
471 
472   for (i=0; i<mbs; i++) {
473     n  = ai[1] - ai[0]; /* length of i_th block row of A */
474     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
475     ib = aj + *ai;
476     jmin = 0;
477     nonzerorow += (n>0);
478     if (*ib == i){      /* (diag of A)*x */
479       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
480       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
481       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
482       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
483       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
484       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
485       v += 36; jmin++;
486     }
487     for (j=jmin; j<n; j++) {
488       /* (strict lower triangular part of A)*x  */
489       cval       = ib[j]*6;
490       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
491       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
492       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
493       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
494       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
495       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
496       /* (strict upper triangular part of A)*x  */
497       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
498       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
499       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
500       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
501       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
502       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
503       v  += 36;
504     }
505     xb +=6; ai++;
506   }
507 
508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
511   PetscFunctionReturn(0);
512 }
513 #undef __FUNCT__
514 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
515 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
516 {
517   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
518   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
519   MatScalar      *v;
520   PetscErrorCode ierr;
521   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
522   PetscInt       nonzerorow=0;
523 
524   PetscFunctionBegin;
525   ierr = VecSet(zz,zero);CHKERRQ(ierr);
526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
527   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
528 
529   v     = a->a;
530   xb = x;
531 
532   for (i=0; i<mbs; i++) {
533     n  = ai[1] - ai[0]; /* length of i_th block row of A */
534     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
535     ib = aj + *ai;
536     jmin = 0;
537     nonzerorow += (n>0);
538     if (*ib == i){      /* (diag of A)*x */
539       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
540       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
541       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
542       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
543       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
544       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
545       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
546       v += 49; jmin++;
547     }
548     for (j=jmin; j<n; j++) {
549       /* (strict lower triangular part of A)*x  */
550       cval       = ib[j]*7;
551       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
552       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
553       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
554       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
555       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
556       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
557       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
558       /* (strict upper triangular part of A)*x  */
559       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
560       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
561       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
562       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
563       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
564       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
565       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
566       v  += 49;
567     }
568     xb +=7; ai++;
569   }
570   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
572   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 
576 /*
577     This will not work with MatScalar == float because it calls the BLAS
578 */
579 #undef __FUNCT__
580 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
581 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
582 {
583   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
584   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
585   MatScalar      *v;
586   PetscErrorCode ierr;
587   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
588   PetscInt       nonzerorow=0;
589 
590   PetscFunctionBegin;
591   ierr = VecSet(zz,zero);CHKERRQ(ierr);
592   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
593   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
594 
595   aj   = a->j;
596   v    = a->a;
597   ii   = a->i;
598 
599   if (!a->mult_work) {
600     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
601   }
602   work = a->mult_work;
603 
604   for (i=0; i<mbs; i++) {
605     n     = ii[1] - ii[0]; ncols = n*bs;
606     workt = work; idx=aj+ii[0];
607     nonzerorow += (n>0);
608 
609     /* upper triangular part */
610     for (j=0; j<n; j++) {
611       xb = x_ptr + bs*(*idx++);
612       for (k=0; k<bs; k++) workt[k] = xb[k];
613       workt += bs;
614     }
615     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
616     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
617 
618     /* strict lower triangular part */
619     idx = aj+ii[0];
620     if (*idx == i){
621       ncols -= bs; v += bs2; idx++; n--;
622     }
623 
624     if (ncols > 0){
625       workt = work;
626       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
627       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
628       for (j=0; j<n; j++) {
629         zb = z_ptr + bs*(*idx++);
630         for (k=0; k<bs; k++) zb[k] += workt[k] ;
631         workt += bs;
632       }
633     }
634     x += bs; v += n*bs2; z += bs; ii++;
635   }
636 
637   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
638   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
639   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
640   PetscFunctionReturn(0);
641 }
642 
643 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
644 #undef __FUNCT__
645 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
646 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
647 {
648   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
649   PetscScalar    *x,*z,*xb,x1;
650   MatScalar      *v;
651   PetscErrorCode ierr;
652   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
653   PetscInt       nonzerorow=0;
654 
655   PetscFunctionBegin;
656   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
659   v  = a->a;
660   xb = x;
661 
662   for (i=0; i<mbs; i++) {
663     n  = ai[1] - ai[0];  /* length of i_th row of A */
664     x1 = xb[0];
665     ib = aj + *ai;
666     jmin = 0;
667     nonzerorow += (n>0);
668     if (*ib == i) {            /* (diag of A)*x */
669       z[i] += *v++ * x[*ib++]; jmin++;
670     }
671     for (j=jmin; j<n; j++) {
672       cval    = *ib;
673       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
674       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
675     }
676     xb++; ai++;
677   }
678 
679   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
680   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
681 
682   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 #undef __FUNCT__
687 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
688 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
689 {
690   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
691   PetscScalar    *x,*z,*xb,x1,x2;
692   MatScalar      *v;
693   PetscErrorCode ierr;
694   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
695   PetscInt       nonzerorow=0;
696 
697   PetscFunctionBegin;
698   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
699   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
700   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
701 
702   v  = a->a;
703   xb = x;
704 
705   for (i=0; i<mbs; i++) {
706     n  = ai[1] - ai[0]; /* length of i_th block row of A */
707     x1 = xb[0]; x2 = xb[1];
708     ib = aj + *ai;
709     jmin = 0;
710     nonzerorow += (n>0);
711     if (*ib == i){      /* (diag of A)*x */
712       z[2*i]   += v[0]*x1 + v[2]*x2;
713       z[2*i+1] += v[2]*x1 + v[3]*x2;
714       v += 4; jmin++;
715     }
716     for (j=jmin; j<n; j++) {
717       /* (strict lower triangular part of A)*x  */
718       cval       = ib[j]*2;
719       z[cval]     += v[0]*x1 + v[1]*x2;
720       z[cval+1]   += v[2]*x1 + v[3]*x2;
721       /* (strict upper triangular part of A)*x  */
722       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
723       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
724       v  += 4;
725     }
726     xb +=2; ai++;
727   }
728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
729   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
730 
731   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
732   PetscFunctionReturn(0);
733 }
734 
735 #undef __FUNCT__
736 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
737 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
738 {
739   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
740   PetscScalar    *x,*z,*xb,x1,x2,x3;
741   MatScalar      *v;
742   PetscErrorCode ierr;
743   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
744   PetscInt       nonzerorow=0;
745 
746   PetscFunctionBegin;
747   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
748   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
749   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
750 
751   v     = a->a;
752   xb = x;
753 
754   for (i=0; i<mbs; i++) {
755     n  = ai[1] - ai[0]; /* length of i_th block row of A */
756     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
757     ib = aj + *ai;
758     jmin = 0;
759     nonzerorow += (n>0);
760     if (*ib == i){     /* (diag of A)*x */
761      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
762      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
763      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
764      v += 9; jmin++;
765     }
766     for (j=jmin; j<n; j++) {
767       /* (strict lower triangular part of A)*x  */
768       cval       = ib[j]*3;
769       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
770       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
771       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
772       /* (strict upper triangular part of A)*x  */
773       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
774       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
775       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
776       v  += 9;
777     }
778     xb +=3; ai++;
779   }
780 
781   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
782   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
783 
784   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
785   PetscFunctionReturn(0);
786 }
787 
788 #undef __FUNCT__
789 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
790 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
791 {
792   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
793   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
794   MatScalar      *v;
795   PetscErrorCode ierr;
796   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
797   PetscInt       nonzerorow=0;
798 
799   PetscFunctionBegin;
800   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
801   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
802   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
803 
804   v     = a->a;
805   xb = x;
806 
807   for (i=0; i<mbs; i++) {
808     n  = ai[1] - ai[0]; /* length of i_th block row of A */
809     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
810     ib = aj + *ai;
811     jmin = 0;
812     nonzerorow += (n>0);
813     if (*ib == i){      /* (diag of A)*x */
814       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
815       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
816       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
817       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
818       v += 16; jmin++;
819     }
820     for (j=jmin; j<n; j++) {
821       /* (strict lower triangular part of A)*x  */
822       cval       = ib[j]*4;
823       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
824       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
825       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
826       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
827       /* (strict upper triangular part of A)*x  */
828       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
829       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
830       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
831       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
832       v  += 16;
833     }
834     xb +=4; ai++;
835   }
836 
837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
838   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
839 
840   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
841   PetscFunctionReturn(0);
842 }
843 
844 #undef __FUNCT__
845 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
846 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
847 {
848   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
849   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
850   MatScalar      *v;
851   PetscErrorCode ierr;
852   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
853   PetscInt       nonzerorow=0;
854 
855   PetscFunctionBegin;
856   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
858   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
859 
860   v     = a->a;
861   xb = x;
862 
863   for (i=0; i<mbs; i++) {
864     n  = ai[1] - ai[0]; /* length of i_th block row of A */
865     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
866     ib = aj + *ai;
867     jmin = 0;
868     nonzerorow += (n>0);
869     if (*ib == i){      /* (diag of A)*x */
870       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
871       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
872       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
873       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
874       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
875       v += 25; jmin++;
876     }
877     for (j=jmin; j<n; j++) {
878       /* (strict lower triangular part of A)*x  */
879       cval       = ib[j]*5;
880       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
881       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
882       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
883       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
884       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
885       /* (strict upper triangular part of A)*x  */
886       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
887       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
888       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
889       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
890       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
891       v  += 25;
892     }
893     xb +=5; ai++;
894   }
895 
896   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
897   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
898 
899   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
900   PetscFunctionReturn(0);
901 }
902 #undef __FUNCT__
903 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
904 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
905 {
906   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
907   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
908   MatScalar      *v;
909   PetscErrorCode ierr;
910   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
911   PetscInt       nonzerorow=0;
912 
913   PetscFunctionBegin;
914   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
915   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
916   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
917 
918   v     = a->a;
919   xb = x;
920 
921   for (i=0; i<mbs; i++) {
922     n  = ai[1] - ai[0]; /* length of i_th block row of A */
923     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
924     ib = aj + *ai;
925     jmin = 0;
926     nonzerorow += (n>0);
927     if (*ib == i){     /* (diag of A)*x */
928       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
929       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
930       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
931       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
932       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
933       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
934       v += 36; jmin++;
935     }
936     for (j=jmin; j<n; j++) {
937       /* (strict lower triangular part of A)*x  */
938       cval       = ib[j]*6;
939       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
940       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
941       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
942       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
943       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
944       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
945       /* (strict upper triangular part of A)*x  */
946       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
947       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
948       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
949       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
950       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
951       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
952       v  += 36;
953     }
954     xb +=6; ai++;
955   }
956 
957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
958   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
959 
960   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
961   PetscFunctionReturn(0);
962 }
963 
964 #undef __FUNCT__
965 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
966 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
967 {
968   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
969   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
970   MatScalar      *v;
971   PetscErrorCode ierr;
972   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
973   PetscInt       nonzerorow=0;
974 
975   PetscFunctionBegin;
976   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
977   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
978   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
979 
980   v     = a->a;
981   xb = x;
982 
983   for (i=0; i<mbs; i++) {
984     n  = ai[1] - ai[0]; /* length of i_th block row of A */
985     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
986     ib = aj + *ai;
987     jmin = 0;
988     nonzerorow += (n>0);
989     if (*ib == i){     /* (diag of A)*x */
990       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
991       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
992       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
993       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
994       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
995       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
996       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
997       v += 49; jmin++;
998     }
999     for (j=jmin; j<n; j++) {
1000       /* (strict lower triangular part of A)*x  */
1001       cval       = ib[j]*7;
1002       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1003       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1004       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1005       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1006       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1007       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1008       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1009       /* (strict upper triangular part of A)*x  */
1010       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1011       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1012       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1013       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1014       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1015       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1016       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1017       v  += 49;
1018     }
1019     xb +=7; ai++;
1020   }
1021 
1022   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1023   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1024 
1025   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 #undef __FUNCT__
1030 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1031 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1032 {
1033   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1034   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1035   MatScalar      *v;
1036   PetscErrorCode ierr;
1037   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1038   PetscInt       nonzerorow=0;
1039 
1040   PetscFunctionBegin;
1041   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1042   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1043   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1044 
1045   aj   = a->j;
1046   v    = a->a;
1047   ii   = a->i;
1048 
1049   if (!a->mult_work) {
1050     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1051   }
1052   work = a->mult_work;
1053 
1054 
1055   for (i=0; i<mbs; i++) {
1056     n     = ii[1] - ii[0]; ncols = n*bs;
1057     workt = work; idx=aj+ii[0];
1058     nonzerorow += (n>0);
1059 
1060     /* upper triangular part */
1061     for (j=0; j<n; j++) {
1062       xb = x_ptr + bs*(*idx++);
1063       for (k=0; k<bs; k++) workt[k] = xb[k];
1064       workt += bs;
1065     }
1066     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1067     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1068 
1069     /* strict lower triangular part */
1070     idx = aj+ii[0];
1071     if (*idx == i){
1072       ncols -= bs; v += bs2; idx++; n--;
1073     }
1074     if (ncols > 0){
1075       workt = work;
1076       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1077       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1078       for (j=0; j<n; j++) {
1079         zb = z_ptr + bs*(*idx++);
1080         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1081         workt += bs;
1082       }
1083     }
1084 
1085     x += bs; v += n*bs2; z += bs; ii++;
1086   }
1087 
1088   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1090 
1091   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatScale_SeqSBAIJ"
1097 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1098 {
1099   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1100   PetscScalar    oalpha = alpha;
1101   PetscErrorCode ierr;
1102   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1103 
1104   PetscFunctionBegin;
1105   BLASscal_(&totalnz,&oalpha,a->a,&one);
1106   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 #undef __FUNCT__
1111 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1112 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1113 {
1114   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1115   MatScalar      *v = a->a;
1116   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1117   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1118   PetscErrorCode ierr;
1119   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1120 
1121   PetscFunctionBegin;
1122   if (type == NORM_FROBENIUS) {
1123     for (k=0; k<mbs; k++){
1124       jmin = a->i[k]; jmax = a->i[k+1];
1125       col  = aj + jmin;
1126       if (*col == k){         /* diagonal block */
1127         for (i=0; i<bs2; i++){
1128 #if defined(PETSC_USE_COMPLEX)
1129           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1130 #else
1131           sum_diag += (*v)*(*v); v++;
1132 #endif
1133         }
1134         jmin++;
1135       }
1136       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1137         for (i=0; i<bs2; i++){
1138 #if defined(PETSC_USE_COMPLEX)
1139           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1140 #else
1141           sum_off += (*v)*(*v); v++;
1142 #endif
1143         }
1144       }
1145     }
1146     *norm = sqrt(sum_diag + 2*sum_off);
1147   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1148     ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1149     for (i=0; i<mbs; i++) jl[i] = mbs;
1150     il[0] = 0;
1151 
1152     *norm = 0.0;
1153     for (k=0; k<mbs; k++) { /* k_th block row */
1154       for (j=0; j<bs; j++) sum[j]=0.0;
1155       /*-- col sum --*/
1156       i = jl[k]; /* first |A(i,k)| to be added */
1157       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1158                   at step k */
1159       while (i<mbs){
1160         nexti = jl[i];  /* next block row to be added */
1161         ik    = il[i];  /* block index of A(i,k) in the array a */
1162         for (j=0; j<bs; j++){
1163           v = a->a + ik*bs2 + j*bs;
1164           for (k1=0; k1<bs; k1++) {
1165             sum[j] += PetscAbsScalar(*v); v++;
1166           }
1167         }
1168         /* update il, jl */
1169         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1170         jmax = a->i[i+1];
1171         if (jmin < jmax){
1172           il[i] = jmin;
1173           j   = a->j[jmin];
1174           jl[i] = jl[j]; jl[j]=i;
1175         }
1176         i = nexti;
1177       }
1178       /*-- row sum --*/
1179       jmin = a->i[k]; jmax = a->i[k+1];
1180       for (i=jmin; i<jmax; i++) {
1181         for (j=0; j<bs; j++){
1182           v = a->a + i*bs2 + j;
1183           for (k1=0; k1<bs; k1++){
1184             sum[j] += PetscAbsScalar(*v); v += bs;
1185           }
1186         }
1187       }
1188       /* add k_th block row to il, jl */
1189       col = aj+jmin;
1190       if (*col == k) jmin++;
1191       if (jmin < jmax){
1192         il[k] = jmin;
1193         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1194       }
1195       for (j=0; j<bs; j++){
1196         if (sum[j] > *norm) *norm = sum[j];
1197       }
1198     }
1199     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1200   } else {
1201     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1202   }
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 #undef __FUNCT__
1207 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1208 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1209 {
1210   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1211   PetscErrorCode ierr;
1212 
1213   PetscFunctionBegin;
1214 
1215   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1216   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1217     *flg = PETSC_FALSE;
1218     PetscFunctionReturn(0);
1219   }
1220 
1221   /* if the a->i are the same */
1222   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1223   if (!*flg) {
1224     PetscFunctionReturn(0);
1225   }
1226 
1227   /* if a->j are the same */
1228   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1229   if (!*flg) {
1230     PetscFunctionReturn(0);
1231   }
1232   /* if a->a are the same */
1233   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 #undef __FUNCT__
1238 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1239 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1240 {
1241   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1242   PetscErrorCode ierr;
1243   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1244   PetscScalar    *x,zero = 0.0;
1245   MatScalar      *aa,*aa_j;
1246 
1247   PetscFunctionBegin;
1248   bs   = A->rmap->bs;
1249   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1250 
1251   aa   = a->a;
1252   ai   = a->i;
1253   aj   = a->j;
1254   ambs = a->mbs;
1255   bs2  = a->bs2;
1256 
1257   ierr = VecSet(v,zero);CHKERRQ(ierr);
1258   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1259   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1260   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1261   for (i=0; i<ambs; i++) {
1262     j=ai[i];
1263     if (aj[j] == i) {             /* if this is a diagonal element */
1264       row  = i*bs;
1265       aa_j = aa + j*bs2;
1266       if (A->factor && bs==1){
1267         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1268       } else {
1269         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1270       }
1271     }
1272   }
1273 
1274   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1280 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1281 {
1282   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1283   PetscScalar    *l,x,*li,*ri;
1284   MatScalar      *aa,*v;
1285   PetscErrorCode ierr;
1286   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1287   PetscTruth     flg;
1288 
1289   PetscFunctionBegin;
1290   if (ll != rr){
1291     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1292     if (!flg)
1293       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1294   }
1295   if (!ll) PetscFunctionReturn(0);
1296   ai  = a->i;
1297   aj  = a->j;
1298   aa  = a->a;
1299   m   = A->rmap->N;
1300   bs  = A->rmap->bs;
1301   mbs = a->mbs;
1302   bs2 = a->bs2;
1303 
1304   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1305   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1306   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1307   for (i=0; i<mbs; i++) { /* for each block row */
1308     M  = ai[i+1] - ai[i];
1309     li = l + i*bs;
1310     v  = aa + bs2*ai[i];
1311     for (j=0; j<M; j++) { /* for each block */
1312       ri = l + bs*aj[ai[i]+j];
1313       for (k=0; k<bs; k++) {
1314         x = ri[k];
1315         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1316       }
1317     }
1318   }
1319   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1320   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 #undef __FUNCT__
1325 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1326 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1327 {
1328   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1329 
1330   PetscFunctionBegin;
1331   info->block_size     = a->bs2;
1332   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1333   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1334   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1335   info->assemblies   = A->num_ass;
1336   info->mallocs      = a->reallocs;
1337   info->memory       = ((PetscObject)A)->mem;
1338   if (A->factor) {
1339     info->fill_ratio_given  = A->info.fill_ratio_given;
1340     info->fill_ratio_needed = A->info.fill_ratio_needed;
1341     info->factor_mallocs    = A->info.factor_mallocs;
1342   } else {
1343     info->fill_ratio_given  = 0;
1344     info->fill_ratio_needed = 0;
1345     info->factor_mallocs    = 0;
1346   }
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 
1351 #undef __FUNCT__
1352 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1353 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1354 {
1355   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1356   PetscErrorCode ierr;
1357 
1358   PetscFunctionBegin;
1359   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 #undef __FUNCT__
1364 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1365 /*
1366    This code does not work since it only checks the upper triangular part of
1367   the matrix. Hence it is not listed in the function table.
1368 */
1369 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1370 {
1371   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1372   PetscErrorCode ierr;
1373   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1374   PetscReal      atmp;
1375   MatScalar      *aa;
1376   PetscScalar    *x;
1377   PetscInt       ncols,brow,bcol,krow,kcol;
1378 
1379   PetscFunctionBegin;
1380   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1381   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1382   bs   = A->rmap->bs;
1383   aa   = a->a;
1384   ai   = a->i;
1385   aj   = a->j;
1386   mbs = a->mbs;
1387 
1388   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1389   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1390   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1391   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1392   for (i=0; i<mbs; i++) {
1393     ncols = ai[1] - ai[0]; ai++;
1394     brow  = bs*i;
1395     for (j=0; j<ncols; j++){
1396       bcol = bs*(*aj);
1397       for (kcol=0; kcol<bs; kcol++){
1398         col = bcol + kcol;      /* col index */
1399         for (krow=0; krow<bs; krow++){
1400           atmp = PetscAbsScalar(*aa); aa++;
1401           row = brow + krow;    /* row index */
1402           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1403           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1404         }
1405       }
1406       aj++;
1407     }
1408   }
1409   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1410   PetscFunctionReturn(0);
1411 }
1412