xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 753ebd1d069d7a4e86ed47ccb83df941bd1fb5db)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
102   const PetscInt *irow,*aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag,sorted;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
110   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
186   const PetscInt *irow;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
197   iary = vary + a->mbs;
198   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207 
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = PetscFree(vary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
212   ISDestroy(is1);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   PetscErrorCode ierr;
221   PetscInt       i;
222 
223   PetscFunctionBegin;
224   if (scall == MAT_INITIAL_MATRIX) {
225     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<n; i++) {
229     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
230   }
231   PetscFunctionReturn(0);
232 }
233 
234 /* -------------------------------------------------------*/
235 /* Should check that shapes of vectors and matrices match */
236 /* -------------------------------------------------------*/
237 #include "petscblaslapack.h"
238 
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
242 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
243 {
244   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
245   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
246   MatScalar      *v;
247   PetscErrorCode ierr;
248   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
249   PetscInt       nonzerorow=0;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(zz,zero);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v     = a->a;
257   xb = x;
258 
259   for (i=0; i<mbs; i++) {
260     n  = ai[1] - ai[0]; /* length of i_th block row of A */
261     x1 = xb[0]; x2 = xb[1];
262     ib = aj + *ai;
263     jmin = 0;
264     nonzerorow += (n>0);
265     if (*ib == i){     /* (diag of A)*x */
266       z[2*i]   += v[0]*x1 + v[2]*x2;
267       z[2*i+1] += v[2]*x1 + v[3]*x2;
268       v += 4; jmin++;
269     }
270     for (j=jmin; j<n; j++) {
271       /* (strict lower triangular part of A)*x  */
272       cval       = ib[j]*2;
273       z[cval]     += v[0]*x1 + v[1]*x2;
274       z[cval+1]   += v[2]*x1 + v[3]*x2;
275       /* (strict upper triangular part of A)*x  */
276       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
277       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
278       v  += 4;
279     }
280     xb +=2; ai++;
281   }
282 
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNCT__
290 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
291 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
294   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
295   MatScalar      *v;
296   PetscErrorCode ierr;
297   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
298   PetscInt       nonzerorow=0;
299 
300   PetscFunctionBegin;
301   ierr = VecSet(zz,zero);CHKERRQ(ierr);
302   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v    = a->a;
306   xb   = x;
307 
308   for (i=0; i<mbs; i++) {
309     n  = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
311     ib = aj + *ai;
312     jmin = 0;
313     nonzerorow += (n>0);
314     if (*ib == i){     /* (diag of A)*x */
315       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
316       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
317       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
318       v += 9; jmin++;
319     }
320     for (j=jmin; j<n; j++) {
321       /* (strict lower triangular part of A)*x  */
322       cval       = ib[j]*3;
323       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
324       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
325       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
326       /* (strict upper triangular part of A)*x  */
327       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
328       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
329       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
330       v  += 9;
331     }
332     xb +=3; ai++;
333   }
334 
335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
336   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
337   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
338   PetscFunctionReturn(0);
339 }
340 
341 #undef __FUNCT__
342 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
343 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
344 {
345   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
346   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
347   MatScalar      *v;
348   PetscErrorCode ierr;
349   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
350   PetscInt       nonzerorow=0;
351 
352   PetscFunctionBegin;
353   ierr = VecSet(zz,zero);CHKERRQ(ierr);
354   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
355   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
356 
357   v     = a->a;
358   xb = x;
359 
360   for (i=0; i<mbs; i++) {
361     n  = ai[1] - ai[0]; /* length of i_th block row of A */
362     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
363     ib = aj + *ai;
364     jmin = 0;
365     nonzerorow += (n>0);
366     if (*ib == i){     /* (diag of A)*x */
367       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
368       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
369       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
370       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
371       v += 16; jmin++;
372     }
373     for (j=jmin; j<n; j++) {
374       /* (strict lower triangular part of A)*x  */
375       cval       = ib[j]*4;
376       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
377       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
378       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
379       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
380       /* (strict upper triangular part of A)*x  */
381       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
382       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
383       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
384       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
385       v  += 16;
386     }
387     xb +=4; ai++;
388   }
389 
390   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
391   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
392   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
393   PetscFunctionReturn(0);
394 }
395 
396 #undef __FUNCT__
397 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
398 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
399 {
400   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
401   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
402   MatScalar      *v;
403   PetscErrorCode ierr;
404   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
405   PetscInt       nonzerorow=0;
406 
407   PetscFunctionBegin;
408   ierr = VecSet(zz,zero);CHKERRQ(ierr);
409   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v     = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n  = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
418     ib = aj + *ai;
419     jmin = 0;
420     nonzerorow += (n>0);
421     if (*ib == i){      /* (diag of A)*x */
422       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
423       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
424       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
425       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
426       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
427       v += 25; jmin++;
428     }
429     for (j=jmin; j<n; j++) {
430       /* (strict lower triangular part of A)*x  */
431       cval       = ib[j]*5;
432       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
433       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
434       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
435       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
436       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
437       /* (strict upper triangular part of A)*x  */
438       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
439       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
440       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
441       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
442       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
443       v  += 25;
444     }
445     xb +=5; ai++;
446   }
447 
448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
449   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
450   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
451   PetscFunctionReturn(0);
452 }
453 
454 
455 #undef __FUNCT__
456 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
457 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
458 {
459   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
460   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
461   MatScalar      *v;
462   PetscErrorCode ierr;
463   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
464   PetscInt       nonzerorow=0;
465 
466   PetscFunctionBegin;
467   ierr = VecSet(zz,zero);CHKERRQ(ierr);
468   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
470 
471   v     = a->a;
472   xb = x;
473 
474   for (i=0; i<mbs; i++) {
475     n  = ai[1] - ai[0]; /* length of i_th block row of A */
476     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
477     ib = aj + *ai;
478     jmin = 0;
479     nonzerorow += (n>0);
480     if (*ib == i){      /* (diag of A)*x */
481       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
482       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
483       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
484       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
485       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
486       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
487       v += 36; jmin++;
488     }
489     for (j=jmin; j<n; j++) {
490       /* (strict lower triangular part of A)*x  */
491       cval       = ib[j]*6;
492       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
493       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
494       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
495       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
496       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
497       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
498       /* (strict upper triangular part of A)*x  */
499       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
500       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
501       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
502       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
503       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
504       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
505       v  += 36;
506     }
507     xb +=6; ai++;
508   }
509 
510   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
511   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
512   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
513   PetscFunctionReturn(0);
514 }
515 #undef __FUNCT__
516 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
517 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
518 {
519   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
520   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
521   MatScalar      *v;
522   PetscErrorCode ierr;
523   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
524   PetscInt       nonzerorow=0;
525 
526   PetscFunctionBegin;
527   ierr = VecSet(zz,zero);CHKERRQ(ierr);
528   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530 
531   v     = a->a;
532   xb = x;
533 
534   for (i=0; i<mbs; i++) {
535     n  = ai[1] - ai[0]; /* length of i_th block row of A */
536     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
537     ib = aj + *ai;
538     jmin = 0;
539     nonzerorow += (n>0);
540     if (*ib == i){      /* (diag of A)*x */
541       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
542       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
543       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
544       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
545       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
546       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
547       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
548       v += 49; jmin++;
549     }
550     for (j=jmin; j<n; j++) {
551       /* (strict lower triangular part of A)*x  */
552       cval       = ib[j]*7;
553       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
554       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
555       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
556       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
557       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
558       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
559       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
560       /* (strict upper triangular part of A)*x  */
561       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
562       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
563       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
564       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
565       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
566       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
567       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
568       v  += 49;
569     }
570     xb +=7; ai++;
571   }
572   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 
578 /*
579     This will not work with MatScalar == float because it calls the BLAS
580 */
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
583 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
584 {
585   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
586   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
587   MatScalar      *v;
588   PetscErrorCode ierr;
589   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
590   PetscInt       nonzerorow=0;
591 
592   PetscFunctionBegin;
593   ierr = VecSet(zz,zero);CHKERRQ(ierr);
594   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
595   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
596 
597   aj   = a->j;
598   v    = a->a;
599   ii   = a->i;
600 
601   if (!a->mult_work) {
602     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
603   }
604   work = a->mult_work;
605 
606   for (i=0; i<mbs; i++) {
607     n     = ii[1] - ii[0]; ncols = n*bs;
608     workt = work; idx=aj+ii[0];
609     nonzerorow += (n>0);
610 
611     /* upper triangular part */
612     for (j=0; j<n; j++) {
613       xb = x_ptr + bs*(*idx++);
614       for (k=0; k<bs; k++) workt[k] = xb[k];
615       workt += bs;
616     }
617     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
618     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
619 
620     /* strict lower triangular part */
621     idx = aj+ii[0];
622     if (*idx == i){
623       ncols -= bs; v += bs2; idx++; n--;
624     }
625 
626     if (ncols > 0){
627       workt = work;
628       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
629       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
630       for (j=0; j<n; j++) {
631         zb = z_ptr + bs*(*idx++);
632         for (k=0; k<bs; k++) zb[k] += workt[k] ;
633         workt += bs;
634       }
635     }
636     x += bs; v += n*bs2; z += bs; ii++;
637   }
638 
639   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
640   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
641   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
642   PetscFunctionReturn(0);
643 }
644 
645 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
646 #undef __FUNCT__
647 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
648 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
649 {
650   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
651   PetscScalar    *x,*z,*xb,x1;
652   MatScalar      *v;
653   PetscErrorCode ierr;
654   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
655   PetscInt       nonzerorow=0;
656 
657   PetscFunctionBegin;
658   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
659   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
660   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
661   v  = a->a;
662   xb = x;
663 
664   for (i=0; i<mbs; i++) {
665     n  = ai[1] - ai[0];  /* length of i_th row of A */
666     x1 = xb[0];
667     ib = aj + *ai;
668     jmin = 0;
669     nonzerorow += (n>0);
670     if (*ib == i) {            /* (diag of A)*x */
671       z[i] += *v++ * x[*ib++]; jmin++;
672     }
673     for (j=jmin; j<n; j++) {
674       cval    = *ib;
675       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
676       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
677     }
678     xb++; ai++;
679   }
680 
681   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
682   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
683 
684   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
685   PetscFunctionReturn(0);
686 }
687 
688 #undef __FUNCT__
689 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
690 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
691 {
692   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
693   PetscScalar    *x,*z,*xb,x1,x2;
694   MatScalar      *v;
695   PetscErrorCode ierr;
696   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
697   PetscInt       nonzerorow=0;
698 
699   PetscFunctionBegin;
700   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
701   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
703 
704   v  = a->a;
705   xb = x;
706 
707   for (i=0; i<mbs; i++) {
708     n  = ai[1] - ai[0]; /* length of i_th block row of A */
709     x1 = xb[0]; x2 = xb[1];
710     ib = aj + *ai;
711     jmin = 0;
712     nonzerorow += (n>0);
713     if (*ib == i){      /* (diag of A)*x */
714       z[2*i]   += v[0]*x1 + v[2]*x2;
715       z[2*i+1] += v[2]*x1 + v[3]*x2;
716       v += 4; jmin++;
717     }
718     for (j=jmin; j<n; j++) {
719       /* (strict lower triangular part of A)*x  */
720       cval       = ib[j]*2;
721       z[cval]     += v[0]*x1 + v[1]*x2;
722       z[cval+1]   += v[2]*x1 + v[3]*x2;
723       /* (strict upper triangular part of A)*x  */
724       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
725       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
726       v  += 4;
727     }
728     xb +=2; ai++;
729   }
730   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
731   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
732 
733   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
734   PetscFunctionReturn(0);
735 }
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
739 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
740 {
741   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
742   PetscScalar    *x,*z,*xb,x1,x2,x3;
743   MatScalar      *v;
744   PetscErrorCode ierr;
745   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
746   PetscInt       nonzerorow=0;
747 
748   PetscFunctionBegin;
749   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
750   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
751   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
752 
753   v     = a->a;
754   xb = x;
755 
756   for (i=0; i<mbs; i++) {
757     n  = ai[1] - ai[0]; /* length of i_th block row of A */
758     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
759     ib = aj + *ai;
760     jmin = 0;
761     nonzerorow += (n>0);
762     if (*ib == i){     /* (diag of A)*x */
763      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
764      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
765      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
766      v += 9; jmin++;
767     }
768     for (j=jmin; j<n; j++) {
769       /* (strict lower triangular part of A)*x  */
770       cval       = ib[j]*3;
771       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
772       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
773       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
774       /* (strict upper triangular part of A)*x  */
775       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
776       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
777       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
778       v  += 9;
779     }
780     xb +=3; ai++;
781   }
782 
783   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
784   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
785 
786   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
787   PetscFunctionReturn(0);
788 }
789 
790 #undef __FUNCT__
791 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
792 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
793 {
794   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
795   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
796   MatScalar      *v;
797   PetscErrorCode ierr;
798   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
799   PetscInt       nonzerorow=0;
800 
801   PetscFunctionBegin;
802   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
803   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
804   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
805 
806   v     = a->a;
807   xb = x;
808 
809   for (i=0; i<mbs; i++) {
810     n  = ai[1] - ai[0]; /* length of i_th block row of A */
811     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
812     ib = aj + *ai;
813     jmin = 0;
814     nonzerorow += (n>0);
815     if (*ib == i){      /* (diag of A)*x */
816       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
817       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
818       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
819       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
820       v += 16; jmin++;
821     }
822     for (j=jmin; j<n; j++) {
823       /* (strict lower triangular part of A)*x  */
824       cval       = ib[j]*4;
825       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
826       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
827       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
828       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
829       /* (strict upper triangular part of A)*x  */
830       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
831       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
832       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
833       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
834       v  += 16;
835     }
836     xb +=4; ai++;
837   }
838 
839   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
840   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
841 
842   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
843   PetscFunctionReturn(0);
844 }
845 
846 #undef __FUNCT__
847 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
848 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
849 {
850   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
851   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
852   MatScalar      *v;
853   PetscErrorCode ierr;
854   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
855   PetscInt       nonzerorow=0;
856 
857   PetscFunctionBegin;
858   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
859   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
860   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
861 
862   v     = a->a;
863   xb = x;
864 
865   for (i=0; i<mbs; i++) {
866     n  = ai[1] - ai[0]; /* length of i_th block row of A */
867     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
868     ib = aj + *ai;
869     jmin = 0;
870     nonzerorow += (n>0);
871     if (*ib == i){      /* (diag of A)*x */
872       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
873       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
874       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
875       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
876       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
877       v += 25; jmin++;
878     }
879     for (j=jmin; j<n; j++) {
880       /* (strict lower triangular part of A)*x  */
881       cval       = ib[j]*5;
882       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
883       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
884       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
885       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
886       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
887       /* (strict upper triangular part of A)*x  */
888       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
889       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
890       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
891       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
892       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
893       v  += 25;
894     }
895     xb +=5; ai++;
896   }
897 
898   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
899   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
900 
901   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
902   PetscFunctionReturn(0);
903 }
904 #undef __FUNCT__
905 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
906 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
907 {
908   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
909   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
910   MatScalar      *v;
911   PetscErrorCode ierr;
912   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
913   PetscInt       nonzerorow=0;
914 
915   PetscFunctionBegin;
916   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
917   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
918   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
919 
920   v     = a->a;
921   xb = x;
922 
923   for (i=0; i<mbs; i++) {
924     n  = ai[1] - ai[0]; /* length of i_th block row of A */
925     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
926     ib = aj + *ai;
927     jmin = 0;
928     nonzerorow += (n>0);
929     if (*ib == i){     /* (diag of A)*x */
930       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
931       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
932       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
933       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
934       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
935       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
936       v += 36; jmin++;
937     }
938     for (j=jmin; j<n; j++) {
939       /* (strict lower triangular part of A)*x  */
940       cval       = ib[j]*6;
941       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
942       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
943       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
944       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
945       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
946       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
947       /* (strict upper triangular part of A)*x  */
948       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
949       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
950       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
951       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
952       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
953       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
954       v  += 36;
955     }
956     xb +=6; ai++;
957   }
958 
959   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
960   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
961 
962   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
963   PetscFunctionReturn(0);
964 }
965 
966 #undef __FUNCT__
967 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
968 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
969 {
970   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
971   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
972   MatScalar      *v;
973   PetscErrorCode ierr;
974   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
975   PetscInt       nonzerorow=0;
976 
977   PetscFunctionBegin;
978   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
980   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
981 
982   v     = a->a;
983   xb = x;
984 
985   for (i=0; i<mbs; i++) {
986     n  = ai[1] - ai[0]; /* length of i_th block row of A */
987     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
988     ib = aj + *ai;
989     jmin = 0;
990     nonzerorow += (n>0);
991     if (*ib == i){     /* (diag of A)*x */
992       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
993       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
994       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
995       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
996       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
997       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
998       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
999       v += 49; jmin++;
1000     }
1001     for (j=jmin; j<n; j++) {
1002       /* (strict lower triangular part of A)*x  */
1003       cval       = ib[j]*7;
1004       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1005       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1006       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1007       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1008       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1009       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1010       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1011       /* (strict upper triangular part of A)*x  */
1012       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1013       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1014       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1015       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1016       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1017       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1018       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1019       v  += 49;
1020     }
1021     xb +=7; ai++;
1022   }
1023 
1024   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1025   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1026 
1027   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 #undef __FUNCT__
1032 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1033 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1034 {
1035   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1036   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1037   MatScalar      *v;
1038   PetscErrorCode ierr;
1039   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1040   PetscInt       nonzerorow=0;
1041 
1042   PetscFunctionBegin;
1043   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1044   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1045   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1046 
1047   aj   = a->j;
1048   v    = a->a;
1049   ii   = a->i;
1050 
1051   if (!a->mult_work) {
1052     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1053   }
1054   work = a->mult_work;
1055 
1056 
1057   for (i=0; i<mbs; i++) {
1058     n     = ii[1] - ii[0]; ncols = n*bs;
1059     workt = work; idx=aj+ii[0];
1060     nonzerorow += (n>0);
1061 
1062     /* upper triangular part */
1063     for (j=0; j<n; j++) {
1064       xb = x_ptr + bs*(*idx++);
1065       for (k=0; k<bs; k++) workt[k] = xb[k];
1066       workt += bs;
1067     }
1068     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1069     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1070 
1071     /* strict lower triangular part */
1072     idx = aj+ii[0];
1073     if (*idx == i){
1074       ncols -= bs; v += bs2; idx++; n--;
1075     }
1076     if (ncols > 0){
1077       workt = work;
1078       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1079       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1080       for (j=0; j<n; j++) {
1081         zb = z_ptr + bs*(*idx++);
1082         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1083         workt += bs;
1084       }
1085     }
1086 
1087     x += bs; v += n*bs2; z += bs; ii++;
1088   }
1089 
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1092 
1093   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1094   PetscFunctionReturn(0);
1095 }
1096 
1097 #undef __FUNCT__
1098 #define __FUNCT__ "MatScale_SeqSBAIJ"
1099 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1100 {
1101   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1102   PetscScalar    oalpha = alpha;
1103   PetscErrorCode ierr;
1104   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1105 
1106   PetscFunctionBegin;
1107   BLASscal_(&totalnz,&oalpha,a->a,&one);
1108   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 #undef __FUNCT__
1113 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1114 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1115 {
1116   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1117   MatScalar      *v = a->a;
1118   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1119   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1120   PetscErrorCode ierr;
1121   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1122 
1123   PetscFunctionBegin;
1124   if (type == NORM_FROBENIUS) {
1125     for (k=0; k<mbs; k++){
1126       jmin = a->i[k]; jmax = a->i[k+1];
1127       col  = aj + jmin;
1128       if (*col == k){         /* diagonal block */
1129         for (i=0; i<bs2; i++){
1130 #if defined(PETSC_USE_COMPLEX)
1131           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1132 #else
1133           sum_diag += (*v)*(*v); v++;
1134 #endif
1135         }
1136         jmin++;
1137       }
1138       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1139         for (i=0; i<bs2; i++){
1140 #if defined(PETSC_USE_COMPLEX)
1141           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1142 #else
1143           sum_off += (*v)*(*v); v++;
1144 #endif
1145         }
1146       }
1147     }
1148     *norm = sqrt(sum_diag + 2*sum_off);
1149   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1150     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1151     jl   = il + mbs;
1152     sum  = (PetscReal*)(jl + mbs);
1153     for (i=0; i<mbs; i++) jl[i] = mbs;
1154     il[0] = 0;
1155 
1156     *norm = 0.0;
1157     for (k=0; k<mbs; k++) { /* k_th block row */
1158       for (j=0; j<bs; j++) sum[j]=0.0;
1159       /*-- col sum --*/
1160       i = jl[k]; /* first |A(i,k)| to be added */
1161       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1162                   at step k */
1163       while (i<mbs){
1164         nexti = jl[i];  /* next block row to be added */
1165         ik    = il[i];  /* block index of A(i,k) in the array a */
1166         for (j=0; j<bs; j++){
1167           v = a->a + ik*bs2 + j*bs;
1168           for (k1=0; k1<bs; k1++) {
1169             sum[j] += PetscAbsScalar(*v); v++;
1170           }
1171         }
1172         /* update il, jl */
1173         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1174         jmax = a->i[i+1];
1175         if (jmin < jmax){
1176           il[i] = jmin;
1177           j   = a->j[jmin];
1178           jl[i] = jl[j]; jl[j]=i;
1179         }
1180         i = nexti;
1181       }
1182       /*-- row sum --*/
1183       jmin = a->i[k]; jmax = a->i[k+1];
1184       for (i=jmin; i<jmax; i++) {
1185         for (j=0; j<bs; j++){
1186           v = a->a + i*bs2 + j;
1187           for (k1=0; k1<bs; k1++){
1188             sum[j] += PetscAbsScalar(*v); v += bs;
1189           }
1190         }
1191       }
1192       /* add k_th block row to il, jl */
1193       col = aj+jmin;
1194       if (*col == k) jmin++;
1195       if (jmin < jmax){
1196         il[k] = jmin;
1197         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1198       }
1199       for (j=0; j<bs; j++){
1200         if (sum[j] > *norm) *norm = sum[j];
1201       }
1202     }
1203     ierr = PetscFree(il);CHKERRQ(ierr);
1204   } else {
1205     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1206   }
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 #undef __FUNCT__
1211 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1212 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1213 {
1214   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1215   PetscErrorCode ierr;
1216 
1217   PetscFunctionBegin;
1218 
1219   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1220   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1221     *flg = PETSC_FALSE;
1222     PetscFunctionReturn(0);
1223   }
1224 
1225   /* if the a->i are the same */
1226   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1227   if (!*flg) {
1228     PetscFunctionReturn(0);
1229   }
1230 
1231   /* if a->j are the same */
1232   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1233   if (!*flg) {
1234     PetscFunctionReturn(0);
1235   }
1236   /* if a->a are the same */
1237   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 #undef __FUNCT__
1242 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1243 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1244 {
1245   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1246   PetscErrorCode ierr;
1247   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1248   PetscScalar    *x,zero = 0.0;
1249   MatScalar      *aa,*aa_j;
1250 
1251   PetscFunctionBegin;
1252   bs   = A->rmap->bs;
1253   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1254 
1255   aa   = a->a;
1256   ai   = a->i;
1257   aj   = a->j;
1258   ambs = a->mbs;
1259   bs2  = a->bs2;
1260 
1261   ierr = VecSet(v,zero);CHKERRQ(ierr);
1262   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1263   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1264   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1265   for (i=0; i<ambs; i++) {
1266     j=ai[i];
1267     if (aj[j] == i) {             /* if this is a diagonal element */
1268       row  = i*bs;
1269       aa_j = aa + j*bs2;
1270       if (A->factor && bs==1){
1271         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1272       } else {
1273         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1274       }
1275     }
1276   }
1277 
1278   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1279   PetscFunctionReturn(0);
1280 }
1281 
1282 #undef __FUNCT__
1283 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1284 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1285 {
1286   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1287   PetscScalar    *l,x,*li,*ri;
1288   MatScalar      *aa,*v;
1289   PetscErrorCode ierr;
1290   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1291   PetscTruth     flg;
1292 
1293   PetscFunctionBegin;
1294   if (ll != rr){
1295     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1296     if (!flg)
1297       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1298   }
1299   if (!ll) PetscFunctionReturn(0);
1300   ai  = a->i;
1301   aj  = a->j;
1302   aa  = a->a;
1303   m   = A->rmap->N;
1304   bs  = A->rmap->bs;
1305   mbs = a->mbs;
1306   bs2 = a->bs2;
1307 
1308   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1309   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1310   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1311   for (i=0; i<mbs; i++) { /* for each block row */
1312     M  = ai[i+1] - ai[i];
1313     li = l + i*bs;
1314     v  = aa + bs2*ai[i];
1315     for (j=0; j<M; j++) { /* for each block */
1316       ri = l + bs*aj[ai[i]+j];
1317       for (k=0; k<bs; k++) {
1318         x = ri[k];
1319         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1320       }
1321     }
1322   }
1323   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1324   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1325   PetscFunctionReturn(0);
1326 }
1327 
1328 #undef __FUNCT__
1329 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1330 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1331 {
1332   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1333 
1334   PetscFunctionBegin;
1335   info->block_size     = a->bs2;
1336   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1337   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1338   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1339   info->assemblies   = A->num_ass;
1340   info->mallocs      = a->reallocs;
1341   info->memory       = ((PetscObject)A)->mem;
1342   if (A->factor) {
1343     info->fill_ratio_given  = A->info.fill_ratio_given;
1344     info->fill_ratio_needed = A->info.fill_ratio_needed;
1345     info->factor_mallocs    = A->info.factor_mallocs;
1346   } else {
1347     info->fill_ratio_given  = 0;
1348     info->fill_ratio_needed = 0;
1349     info->factor_mallocs    = 0;
1350   }
1351   PetscFunctionReturn(0);
1352 }
1353 
1354 
1355 #undef __FUNCT__
1356 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1357 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1358 {
1359   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1360   PetscErrorCode ierr;
1361 
1362   PetscFunctionBegin;
1363   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1364   PetscFunctionReturn(0);
1365 }
1366 
1367 #undef __FUNCT__
1368 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1369 /*
1370    This code does not work since it only checks the upper triangular part of
1371   the matrix. Hence it is not listed in the function table.
1372 */
1373 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1374 {
1375   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1376   PetscErrorCode ierr;
1377   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1378   PetscReal      atmp;
1379   MatScalar      *aa;
1380   PetscScalar    *x;
1381   PetscInt       ncols,brow,bcol,krow,kcol;
1382 
1383   PetscFunctionBegin;
1384   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1385   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1386   bs   = A->rmap->bs;
1387   aa   = a->a;
1388   ai   = a->i;
1389   aj   = a->j;
1390   mbs = a->mbs;
1391 
1392   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1393   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1394   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1395   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1396   for (i=0; i<mbs; i++) {
1397     ncols = ai[1] - ai[0]; ai++;
1398     brow  = bs*i;
1399     for (j=0; j<ncols; j++){
1400       bcol = bs*(*aj);
1401       for (kcol=0; kcol<bs; kcol++){
1402         col = bcol + kcol;      /* col index */
1403         for (krow=0; krow<bs; krow++){
1404           atmp = PetscAbsScalar(*aa); aa++;
1405           row = brow + krow;    /* row index */
1406           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1407           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1408         }
1409       }
1410       aj++;
1411     }
1412   }
1413   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1414   PetscFunctionReturn(0);
1415 }
1416