xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d543d7c6650afe38e02d4e42c97e77acf4b0e00a)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
102   const PetscInt *irow,*aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag,sorted;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
110   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
186   const PetscInt *irow;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
197   iary = vary + a->mbs;
198   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207 
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = PetscFree(vary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
212   ISDestroy(is1);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   PetscErrorCode ierr;
221   PetscInt       i;
222 
223   PetscFunctionBegin;
224   if (scall == MAT_INITIAL_MATRIX) {
225     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<n; i++) {
229     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
230   }
231   PetscFunctionReturn(0);
232 }
233 
234 /* -------------------------------------------------------*/
235 /* Should check that shapes of vectors and matrices match */
236 /* -------------------------------------------------------*/
237 #include "petscblaslapack.h"
238 
239 #undef __FUNCT__
240 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
241 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
242 {
243   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
244   const PetscScalar *x;
245   PetscScalar       *z,x1,sum;
246   const MatScalar   *v;
247   PetscErrorCode    ierr;
248   PetscInt          mbs=a->mbs,i,j,nz;
249   const PetscInt    *ib=a->j,*ai=a->i;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(zz,0.0);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v  = a->a;
257   for (i=0; i<mbs; i++) {
258     nz   = ai[i+1] - ai[i];        /* length of i_th row of A */
259     x1   = x[i];
260     sum  = v[0]*x1;                /* diagonal term */
261     for (j=1; j<nz; j++) {
262       z[ib[j]] += v[j] * x1;       /* (strict lower triangular part of A)*x  */
263       sum      += v[j] * x[ib[j]]; /* (strict upper triangular part of A)*x  */
264     }
265     z[i] += sum;
266     v    += nz;
267     ib   += nz;
268   }
269 
270   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
271   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
272   ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr);
273   PetscFunctionReturn(0);
274 }
275 
276 #undef __FUNCT__
277 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
278 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
279 {
280   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
281   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
282   MatScalar      *v;
283   PetscErrorCode ierr;
284   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
285   PetscInt       nonzerorow=0;
286 
287   PetscFunctionBegin;
288   ierr = VecSet(zz,zero);CHKERRQ(ierr);
289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
291 
292   v     = a->a;
293   xb = x;
294 
295   for (i=0; i<mbs; i++) {
296     n  = ai[1] - ai[0]; /* length of i_th block row of A */
297     x1 = xb[0]; x2 = xb[1];
298     ib = aj + *ai;
299     jmin = 0;
300     nonzerorow += (n>0);
301     if (*ib == i){     /* (diag of A)*x */
302       z[2*i]   += v[0]*x1 + v[2]*x2;
303       z[2*i+1] += v[2]*x1 + v[3]*x2;
304       v += 4; jmin++;
305     }
306     for (j=jmin; j<n; j++) {
307       /* (strict lower triangular part of A)*x  */
308       cval       = ib[j]*2;
309       z[cval]     += v[0]*x1 + v[1]*x2;
310       z[cval+1]   += v[2]*x1 + v[3]*x2;
311       /* (strict upper triangular part of A)*x  */
312       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
313       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
314       v  += 4;
315     }
316     xb +=2; ai++;
317   }
318 
319   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
322   PetscFunctionReturn(0);
323 }
324 
325 #undef __FUNCT__
326 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
327 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
328 {
329   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
330   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
331   MatScalar      *v;
332   PetscErrorCode ierr;
333   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
334   PetscInt       nonzerorow=0;
335 
336   PetscFunctionBegin;
337   ierr = VecSet(zz,zero);CHKERRQ(ierr);
338   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
339   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
340 
341   v    = a->a;
342   xb   = x;
343 
344   for (i=0; i<mbs; i++) {
345     n  = ai[1] - ai[0]; /* length of i_th block row of A */
346     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
347     ib = aj + *ai;
348     jmin = 0;
349     nonzerorow += (n>0);
350     if (*ib == i){     /* (diag of A)*x */
351       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
352       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
353       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
354       v += 9; jmin++;
355     }
356     for (j=jmin; j<n; j++) {
357       /* (strict lower triangular part of A)*x  */
358       cval       = ib[j]*3;
359       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
360       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
361       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
362       /* (strict upper triangular part of A)*x  */
363       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
364       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
365       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
366       v  += 9;
367     }
368     xb +=3; ai++;
369   }
370 
371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
372   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
373   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
374   PetscFunctionReturn(0);
375 }
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
379 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
380 {
381   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
382   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
383   MatScalar      *v;
384   PetscErrorCode ierr;
385   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
386   PetscInt       nonzerorow=0;
387 
388   PetscFunctionBegin;
389   ierr = VecSet(zz,zero);CHKERRQ(ierr);
390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
391   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
392 
393   v     = a->a;
394   xb = x;
395 
396   for (i=0; i<mbs; i++) {
397     n  = ai[1] - ai[0]; /* length of i_th block row of A */
398     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
399     ib = aj + *ai;
400     jmin = 0;
401     nonzerorow += (n>0);
402     if (*ib == i){     /* (diag of A)*x */
403       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
404       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
405       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
406       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
407       v += 16; jmin++;
408     }
409     for (j=jmin; j<n; j++) {
410       /* (strict lower triangular part of A)*x  */
411       cval       = ib[j]*4;
412       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
413       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
414       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
415       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
416       /* (strict upper triangular part of A)*x  */
417       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
418       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
419       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
420       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
421       v  += 16;
422     }
423     xb +=4; ai++;
424   }
425 
426   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
427   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
428   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
429   PetscFunctionReturn(0);
430 }
431 
432 #undef __FUNCT__
433 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
434 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
435 {
436   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
437   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
438   MatScalar      *v;
439   PetscErrorCode ierr;
440   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
441   PetscInt       nonzerorow=0;
442 
443   PetscFunctionBegin;
444   ierr = VecSet(zz,zero);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
447 
448   v     = a->a;
449   xb = x;
450 
451   for (i=0; i<mbs; i++) {
452     n  = ai[1] - ai[0]; /* length of i_th block row of A */
453     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
454     ib = aj + *ai;
455     jmin = 0;
456     nonzerorow += (n>0);
457     if (*ib == i){      /* (diag of A)*x */
458       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
459       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
460       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
461       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
462       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
463       v += 25; jmin++;
464     }
465     for (j=jmin; j<n; j++) {
466       /* (strict lower triangular part of A)*x  */
467       cval       = ib[j]*5;
468       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
469       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
470       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
471       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
472       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
473       /* (strict upper triangular part of A)*x  */
474       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
475       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
476       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
477       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
478       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
479       v  += 25;
480     }
481     xb +=5; ai++;
482   }
483 
484   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
485   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
486   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
487   PetscFunctionReturn(0);
488 }
489 
490 
491 #undef __FUNCT__
492 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
493 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
494 {
495   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
496   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
497   MatScalar      *v;
498   PetscErrorCode ierr;
499   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
500   PetscInt       nonzerorow=0;
501 
502   PetscFunctionBegin;
503   ierr = VecSet(zz,zero);CHKERRQ(ierr);
504   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
505   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
506 
507   v     = a->a;
508   xb = x;
509 
510   for (i=0; i<mbs; i++) {
511     n  = ai[1] - ai[0]; /* length of i_th block row of A */
512     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
513     ib = aj + *ai;
514     jmin = 0;
515     nonzerorow += (n>0);
516     if (*ib == i){      /* (diag of A)*x */
517       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
518       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
519       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
520       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
521       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
522       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
523       v += 36; jmin++;
524     }
525     for (j=jmin; j<n; j++) {
526       /* (strict lower triangular part of A)*x  */
527       cval       = ib[j]*6;
528       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
529       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
530       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
531       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
532       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
533       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
534       /* (strict upper triangular part of A)*x  */
535       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
536       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
537       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
538       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
539       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
540       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
541       v  += 36;
542     }
543     xb +=6; ai++;
544   }
545 
546   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
547   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
548   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
549   PetscFunctionReturn(0);
550 }
551 #undef __FUNCT__
552 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
553 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
554 {
555   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
556   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
557   MatScalar      *v;
558   PetscErrorCode ierr;
559   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
560   PetscInt       nonzerorow=0;
561 
562   PetscFunctionBegin;
563   ierr = VecSet(zz,zero);CHKERRQ(ierr);
564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
565   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
566 
567   v     = a->a;
568   xb = x;
569 
570   for (i=0; i<mbs; i++) {
571     n  = ai[1] - ai[0]; /* length of i_th block row of A */
572     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
573     ib = aj + *ai;
574     jmin = 0;
575     nonzerorow += (n>0);
576     if (*ib == i){      /* (diag of A)*x */
577       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
578       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
579       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
580       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
581       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
582       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
583       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
584       v += 49; jmin++;
585     }
586     for (j=jmin; j<n; j++) {
587       /* (strict lower triangular part of A)*x  */
588       cval       = ib[j]*7;
589       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
590       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
591       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
592       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
593       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
594       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
595       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
596       /* (strict upper triangular part of A)*x  */
597       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
598       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
599       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
600       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
601       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
602       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
603       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
604       v  += 49;
605     }
606     xb +=7; ai++;
607   }
608   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
609   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
610   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
611   PetscFunctionReturn(0);
612 }
613 
614 /*
615     This will not work with MatScalar == float because it calls the BLAS
616 */
617 #undef __FUNCT__
618 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
619 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
620 {
621   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
622   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
623   MatScalar      *v;
624   PetscErrorCode ierr;
625   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
626   PetscInt       nonzerorow=0;
627 
628   PetscFunctionBegin;
629   ierr = VecSet(zz,zero);CHKERRQ(ierr);
630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
631   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
632 
633   aj   = a->j;
634   v    = a->a;
635   ii   = a->i;
636 
637   if (!a->mult_work) {
638     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
639   }
640   work = a->mult_work;
641 
642   for (i=0; i<mbs; i++) {
643     n     = ii[1] - ii[0]; ncols = n*bs;
644     workt = work; idx=aj+ii[0];
645     nonzerorow += (n>0);
646 
647     /* upper triangular part */
648     for (j=0; j<n; j++) {
649       xb = x_ptr + bs*(*idx++);
650       for (k=0; k<bs; k++) workt[k] = xb[k];
651       workt += bs;
652     }
653     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
654     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
655 
656     /* strict lower triangular part */
657     idx = aj+ii[0];
658     if (*idx == i){
659       ncols -= bs; v += bs2; idx++; n--;
660     }
661 
662     if (ncols > 0){
663       workt = work;
664       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
665       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
666       for (j=0; j<n; j++) {
667         zb = z_ptr + bs*(*idx++);
668         for (k=0; k<bs; k++) zb[k] += workt[k] ;
669         workt += bs;
670       }
671     }
672     x += bs; v += n*bs2; z += bs; ii++;
673   }
674 
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
677   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
678   PetscFunctionReturn(0);
679 }
680 
681 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
682 #undef __FUNCT__
683 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
684 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
685 {
686   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
687   PetscScalar    *x,*z,*xb,x1;
688   MatScalar      *v;
689   PetscErrorCode ierr;
690   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
691   PetscInt       nonzerorow=0;
692 
693   PetscFunctionBegin;
694   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
695   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
696   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
697   v  = a->a;
698   xb = x;
699 
700   for (i=0; i<mbs; i++) {
701     n  = ai[1] - ai[0];  /* length of i_th row of A */
702     x1 = xb[0];
703     ib = aj + *ai;
704     jmin = 0;
705     nonzerorow += (n>0);
706     if (*ib == i) {            /* (diag of A)*x */
707       z[i] += *v++ * x[*ib++]; jmin++;
708     }
709     for (j=jmin; j<n; j++) {
710       cval    = *ib;
711       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
712       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
713     }
714     xb++; ai++;
715   }
716 
717   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
718   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
719 
720   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
721   PetscFunctionReturn(0);
722 }
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
726 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
727 {
728   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
729   PetscScalar    *x,*z,*xb,x1,x2;
730   MatScalar      *v;
731   PetscErrorCode ierr;
732   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
733   PetscInt       nonzerorow=0;
734 
735   PetscFunctionBegin;
736   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
737   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
738   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
739 
740   v  = a->a;
741   xb = x;
742 
743   for (i=0; i<mbs; i++) {
744     n  = ai[1] - ai[0]; /* length of i_th block row of A */
745     x1 = xb[0]; x2 = xb[1];
746     ib = aj + *ai;
747     jmin = 0;
748     nonzerorow += (n>0);
749     if (*ib == i){      /* (diag of A)*x */
750       z[2*i]   += v[0]*x1 + v[2]*x2;
751       z[2*i+1] += v[2]*x1 + v[3]*x2;
752       v += 4; jmin++;
753     }
754     for (j=jmin; j<n; j++) {
755       /* (strict lower triangular part of A)*x  */
756       cval       = ib[j]*2;
757       z[cval]     += v[0]*x1 + v[1]*x2;
758       z[cval+1]   += v[2]*x1 + v[3]*x2;
759       /* (strict upper triangular part of A)*x  */
760       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
761       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
762       v  += 4;
763     }
764     xb +=2; ai++;
765   }
766   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
768 
769   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNCT__
774 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
775 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
776 {
777   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
778   PetscScalar    *x,*z,*xb,x1,x2,x3;
779   MatScalar      *v;
780   PetscErrorCode ierr;
781   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
782   PetscInt       nonzerorow=0;
783 
784   PetscFunctionBegin;
785   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
786   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
787   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
788 
789   v     = a->a;
790   xb = x;
791 
792   for (i=0; i<mbs; i++) {
793     n  = ai[1] - ai[0]; /* length of i_th block row of A */
794     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
795     ib = aj + *ai;
796     jmin = 0;
797     nonzerorow += (n>0);
798     if (*ib == i){     /* (diag of A)*x */
799      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
800      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
801      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
802      v += 9; jmin++;
803     }
804     for (j=jmin; j<n; j++) {
805       /* (strict lower triangular part of A)*x  */
806       cval       = ib[j]*3;
807       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
808       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
809       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
810       /* (strict upper triangular part of A)*x  */
811       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
812       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
813       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
814       v  += 9;
815     }
816     xb +=3; ai++;
817   }
818 
819   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
820   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
821 
822   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
823   PetscFunctionReturn(0);
824 }
825 
826 #undef __FUNCT__
827 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
828 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
829 {
830   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
831   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
832   MatScalar      *v;
833   PetscErrorCode ierr;
834   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
835   PetscInt       nonzerorow=0;
836 
837   PetscFunctionBegin;
838   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
839   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
840   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
841 
842   v     = a->a;
843   xb = x;
844 
845   for (i=0; i<mbs; i++) {
846     n  = ai[1] - ai[0]; /* length of i_th block row of A */
847     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
848     ib = aj + *ai;
849     jmin = 0;
850     nonzerorow += (n>0);
851     if (*ib == i){      /* (diag of A)*x */
852       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
853       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
854       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
855       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
856       v += 16; jmin++;
857     }
858     for (j=jmin; j<n; j++) {
859       /* (strict lower triangular part of A)*x  */
860       cval       = ib[j]*4;
861       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
862       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
863       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
864       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
865       /* (strict upper triangular part of A)*x  */
866       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
867       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
868       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
869       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
870       v  += 16;
871     }
872     xb +=4; ai++;
873   }
874 
875   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
876   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
877 
878   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
879   PetscFunctionReturn(0);
880 }
881 
882 #undef __FUNCT__
883 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
884 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
885 {
886   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
887   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
888   MatScalar      *v;
889   PetscErrorCode ierr;
890   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
891   PetscInt       nonzerorow=0;
892 
893   PetscFunctionBegin;
894   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
895   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
896   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
897 
898   v     = a->a;
899   xb = x;
900 
901   for (i=0; i<mbs; i++) {
902     n  = ai[1] - ai[0]; /* length of i_th block row of A */
903     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
904     ib = aj + *ai;
905     jmin = 0;
906     nonzerorow += (n>0);
907     if (*ib == i){      /* (diag of A)*x */
908       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
909       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
910       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
911       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
912       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
913       v += 25; jmin++;
914     }
915     for (j=jmin; j<n; j++) {
916       /* (strict lower triangular part of A)*x  */
917       cval       = ib[j]*5;
918       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
919       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
920       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
921       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
922       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
923       /* (strict upper triangular part of A)*x  */
924       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
925       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
926       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
927       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
928       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
929       v  += 25;
930     }
931     xb +=5; ai++;
932   }
933 
934   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
935   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
936 
937   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
938   PetscFunctionReturn(0);
939 }
940 #undef __FUNCT__
941 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
942 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
943 {
944   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
945   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
946   MatScalar      *v;
947   PetscErrorCode ierr;
948   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
949   PetscInt       nonzerorow=0;
950 
951   PetscFunctionBegin;
952   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
953   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
955 
956   v     = a->a;
957   xb = x;
958 
959   for (i=0; i<mbs; i++) {
960     n  = ai[1] - ai[0]; /* length of i_th block row of A */
961     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
962     ib = aj + *ai;
963     jmin = 0;
964     nonzerorow += (n>0);
965     if (*ib == i){     /* (diag of A)*x */
966       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
967       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
968       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
969       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
970       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
971       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
972       v += 36; jmin++;
973     }
974     for (j=jmin; j<n; j++) {
975       /* (strict lower triangular part of A)*x  */
976       cval       = ib[j]*6;
977       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
978       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
979       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
980       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
981       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
982       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
983       /* (strict upper triangular part of A)*x  */
984       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
985       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
986       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
987       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
988       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
989       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
990       v  += 36;
991     }
992     xb +=6; ai++;
993   }
994 
995   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
996   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
997 
998   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
999   PetscFunctionReturn(0);
1000 }
1001 
1002 #undef __FUNCT__
1003 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1004 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1005 {
1006   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1007   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1008   MatScalar      *v;
1009   PetscErrorCode ierr;
1010   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1011   PetscInt       nonzerorow=0;
1012 
1013   PetscFunctionBegin;
1014   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1015   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1016   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1017 
1018   v     = a->a;
1019   xb = x;
1020 
1021   for (i=0; i<mbs; i++) {
1022     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1023     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1024     ib = aj + *ai;
1025     jmin = 0;
1026     nonzerorow += (n>0);
1027     if (*ib == i){     /* (diag of A)*x */
1028       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1029       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1030       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1031       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1032       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1033       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1034       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1035       v += 49; jmin++;
1036     }
1037     for (j=jmin; j<n; j++) {
1038       /* (strict lower triangular part of A)*x  */
1039       cval       = ib[j]*7;
1040       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1041       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1042       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1043       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1044       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1045       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1046       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1047       /* (strict upper triangular part of A)*x  */
1048       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1049       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1050       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1051       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1052       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1053       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1054       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1055       v  += 49;
1056     }
1057     xb +=7; ai++;
1058   }
1059 
1060   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1061   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1062 
1063   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1064   PetscFunctionReturn(0);
1065 }
1066 
1067 #undef __FUNCT__
1068 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1069 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1070 {
1071   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1072   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1073   MatScalar      *v;
1074   PetscErrorCode ierr;
1075   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1076   PetscInt       nonzerorow=0;
1077 
1078   PetscFunctionBegin;
1079   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1080   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1081   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1082 
1083   aj   = a->j;
1084   v    = a->a;
1085   ii   = a->i;
1086 
1087   if (!a->mult_work) {
1088     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1089   }
1090   work = a->mult_work;
1091 
1092 
1093   for (i=0; i<mbs; i++) {
1094     n     = ii[1] - ii[0]; ncols = n*bs;
1095     workt = work; idx=aj+ii[0];
1096     nonzerorow += (n>0);
1097 
1098     /* upper triangular part */
1099     for (j=0; j<n; j++) {
1100       xb = x_ptr + bs*(*idx++);
1101       for (k=0; k<bs; k++) workt[k] = xb[k];
1102       workt += bs;
1103     }
1104     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1105     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1106 
1107     /* strict lower triangular part */
1108     idx = aj+ii[0];
1109     if (*idx == i){
1110       ncols -= bs; v += bs2; idx++; n--;
1111     }
1112     if (ncols > 0){
1113       workt = work;
1114       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1115       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1116       for (j=0; j<n; j++) {
1117         zb = z_ptr + bs*(*idx++);
1118         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1119         workt += bs;
1120       }
1121     }
1122 
1123     x += bs; v += n*bs2; z += bs; ii++;
1124   }
1125 
1126   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1127   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1128 
1129   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 #undef __FUNCT__
1134 #define __FUNCT__ "MatScale_SeqSBAIJ"
1135 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1136 {
1137   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1138   PetscScalar    oalpha = alpha;
1139   PetscErrorCode ierr;
1140   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1141 
1142   PetscFunctionBegin;
1143   BLASscal_(&totalnz,&oalpha,a->a,&one);
1144   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 #undef __FUNCT__
1149 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1150 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1151 {
1152   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1153   MatScalar      *v = a->a;
1154   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1155   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1156   PetscErrorCode ierr;
1157   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1158 
1159   PetscFunctionBegin;
1160   if (type == NORM_FROBENIUS) {
1161     for (k=0; k<mbs; k++){
1162       jmin = a->i[k]; jmax = a->i[k+1];
1163       col  = aj + jmin;
1164       if (*col == k){         /* diagonal block */
1165         for (i=0; i<bs2; i++){
1166 #if defined(PETSC_USE_COMPLEX)
1167           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1168 #else
1169           sum_diag += (*v)*(*v); v++;
1170 #endif
1171         }
1172         jmin++;
1173       }
1174       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1175         for (i=0; i<bs2; i++){
1176 #if defined(PETSC_USE_COMPLEX)
1177           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1178 #else
1179           sum_off += (*v)*(*v); v++;
1180 #endif
1181         }
1182       }
1183     }
1184     *norm = sqrt(sum_diag + 2*sum_off);
1185   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1186     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1187     jl   = il + mbs;
1188     sum  = (PetscReal*)(jl + mbs);
1189     for (i=0; i<mbs; i++) jl[i] = mbs;
1190     il[0] = 0;
1191 
1192     *norm = 0.0;
1193     for (k=0; k<mbs; k++) { /* k_th block row */
1194       for (j=0; j<bs; j++) sum[j]=0.0;
1195       /*-- col sum --*/
1196       i = jl[k]; /* first |A(i,k)| to be added */
1197       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1198                   at step k */
1199       while (i<mbs){
1200         nexti = jl[i];  /* next block row to be added */
1201         ik    = il[i];  /* block index of A(i,k) in the array a */
1202         for (j=0; j<bs; j++){
1203           v = a->a + ik*bs2 + j*bs;
1204           for (k1=0; k1<bs; k1++) {
1205             sum[j] += PetscAbsScalar(*v); v++;
1206           }
1207         }
1208         /* update il, jl */
1209         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1210         jmax = a->i[i+1];
1211         if (jmin < jmax){
1212           il[i] = jmin;
1213           j   = a->j[jmin];
1214           jl[i] = jl[j]; jl[j]=i;
1215         }
1216         i = nexti;
1217       }
1218       /*-- row sum --*/
1219       jmin = a->i[k]; jmax = a->i[k+1];
1220       for (i=jmin; i<jmax; i++) {
1221         for (j=0; j<bs; j++){
1222           v = a->a + i*bs2 + j;
1223           for (k1=0; k1<bs; k1++){
1224             sum[j] += PetscAbsScalar(*v); v += bs;
1225           }
1226         }
1227       }
1228       /* add k_th block row to il, jl */
1229       col = aj+jmin;
1230       if (*col == k) jmin++;
1231       if (jmin < jmax){
1232         il[k] = jmin;
1233         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1234       }
1235       for (j=0; j<bs; j++){
1236         if (sum[j] > *norm) *norm = sum[j];
1237       }
1238     }
1239     ierr = PetscFree(il);CHKERRQ(ierr);
1240   } else {
1241     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1242   }
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNCT__
1247 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1248 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1249 {
1250   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1251   PetscErrorCode ierr;
1252 
1253   PetscFunctionBegin;
1254 
1255   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1256   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1257     *flg = PETSC_FALSE;
1258     PetscFunctionReturn(0);
1259   }
1260 
1261   /* if the a->i are the same */
1262   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1263   if (!*flg) {
1264     PetscFunctionReturn(0);
1265   }
1266 
1267   /* if a->j are the same */
1268   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1269   if (!*flg) {
1270     PetscFunctionReturn(0);
1271   }
1272   /* if a->a are the same */
1273   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1279 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1280 {
1281   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1282   PetscErrorCode ierr;
1283   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1284   PetscScalar    *x,zero = 0.0;
1285   MatScalar      *aa,*aa_j;
1286 
1287   PetscFunctionBegin;
1288   bs   = A->rmap->bs;
1289   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1290 
1291   aa   = a->a;
1292   ai   = a->i;
1293   aj   = a->j;
1294   ambs = a->mbs;
1295   bs2  = a->bs2;
1296 
1297   ierr = VecSet(v,zero);CHKERRQ(ierr);
1298   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1299   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1300   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1301   for (i=0; i<ambs; i++) {
1302     j=ai[i];
1303     if (aj[j] == i) {             /* if this is a diagonal element */
1304       row  = i*bs;
1305       aa_j = aa + j*bs2;
1306       if (A->factor && bs==1){
1307         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1308       } else {
1309         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1310       }
1311     }
1312   }
1313 
1314   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1320 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1321 {
1322   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1323   PetscScalar    *l,x,*li,*ri;
1324   MatScalar      *aa,*v;
1325   PetscErrorCode ierr;
1326   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1327   PetscTruth     flg;
1328 
1329   PetscFunctionBegin;
1330   if (ll != rr){
1331     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1332     if (!flg)
1333       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1334   }
1335   if (!ll) PetscFunctionReturn(0);
1336   ai  = a->i;
1337   aj  = a->j;
1338   aa  = a->a;
1339   m   = A->rmap->N;
1340   bs  = A->rmap->bs;
1341   mbs = a->mbs;
1342   bs2 = a->bs2;
1343 
1344   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1345   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1346   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1347   for (i=0; i<mbs; i++) { /* for each block row */
1348     M  = ai[i+1] - ai[i];
1349     li = l + i*bs;
1350     v  = aa + bs2*ai[i];
1351     for (j=0; j<M; j++) { /* for each block */
1352       ri = l + bs*aj[ai[i]+j];
1353       for (k=0; k<bs; k++) {
1354         x = ri[k];
1355         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1356       }
1357     }
1358   }
1359   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1360   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1366 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1367 {
1368   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1369 
1370   PetscFunctionBegin;
1371   info->block_size     = a->bs2;
1372   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1373   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1374   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1375   info->assemblies   = A->num_ass;
1376   info->mallocs      = a->reallocs;
1377   info->memory       = ((PetscObject)A)->mem;
1378   if (A->factor) {
1379     info->fill_ratio_given  = A->info.fill_ratio_given;
1380     info->fill_ratio_needed = A->info.fill_ratio_needed;
1381     info->factor_mallocs    = A->info.factor_mallocs;
1382   } else {
1383     info->fill_ratio_given  = 0;
1384     info->fill_ratio_needed = 0;
1385     info->factor_mallocs    = 0;
1386   }
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 
1391 #undef __FUNCT__
1392 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1393 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1394 {
1395   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1396   PetscErrorCode ierr;
1397 
1398   PetscFunctionBegin;
1399   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 #undef __FUNCT__
1404 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1405 /*
1406    This code does not work since it only checks the upper triangular part of
1407   the matrix. Hence it is not listed in the function table.
1408 */
1409 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1410 {
1411   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1412   PetscErrorCode ierr;
1413   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1414   PetscReal      atmp;
1415   MatScalar      *aa;
1416   PetscScalar    *x;
1417   PetscInt       ncols,brow,bcol,krow,kcol;
1418 
1419   PetscFunctionBegin;
1420   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1421   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1422   bs   = A->rmap->bs;
1423   aa   = a->a;
1424   ai   = a->i;
1425   aj   = a->j;
1426   mbs = a->mbs;
1427 
1428   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1429   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1430   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1431   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1432   for (i=0; i<mbs; i++) {
1433     ncols = ai[1] - ai[0]; ai++;
1434     brow  = bs*i;
1435     for (j=0; j<ncols; j++){
1436       bcol = bs*(*aj);
1437       for (kcol=0; kcol<bs; kcol++){
1438         col = bcol + kcol;      /* col index */
1439         for (krow=0; krow<bs; krow++){
1440           atmp = PetscAbsScalar(*aa); aa++;
1441           row = brow + krow;    /* row index */
1442           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1443           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1444         }
1445       }
1446       aj++;
1447     }
1448   }
1449   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1450   PetscFunctionReturn(0);
1451 }
1452