xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision ee1cef2c4f92b4b45b8aa3ceac59cb944062fab9)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/inline/spops.h"
5 #include "../src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "../src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   const PetscInt *idx;
17   PetscBT        table,table0;
18 
19   PetscFunctionBegin;
20   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21   mbs = a->mbs;
22   ai  = a->i;
23   aj  = a->j;
24   bs  = A->rmap->bs;
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103   const PetscInt *irow,*aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscTruth     flag,sorted;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
117   ssmap = smap;
118   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
119   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126       for (k=kstart; k<kend; k++) {
127         if (ssmap[aj[k]]) {
128           lens[i]++;
129         }
130       }
131     }
132   /* Create and fill new matrix */
133   if (scall == MAT_REUSE_MATRIX) {
134     c = (Mat_SeqSBAIJ *)((*B)->data);
135 
136     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
138     if (!flag) {
139       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
140     }
141     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
142     C = *B;
143   } else {
144     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
145     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
146     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184   IS             is1;
185   PetscErrorCode ierr;
186   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187   const PetscInt *irow;
188 
189   PetscFunctionBegin;
190   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191 
192   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
194 
195   /* Verify if the indices corespond to each element in a block
196    and form the IS with compressed IS */
197   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
198   iary = vary + a->mbs;
199   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
200   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
201 
202   count = 0;
203   for (i=0; i<a->mbs; i++) {
204     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
205     if (vary[i]==bs) iary[count++] = i;
206   }
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
208 
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = PetscFree(vary);CHKERRQ(ierr);
211 
212   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
213   ISDestroy(is1);
214   PetscFunctionReturn(0);
215 }
216 
217 #undef __FUNCT__
218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
220 {
221   PetscErrorCode ierr;
222   PetscInt       i;
223 
224   PetscFunctionBegin;
225   if (scall == MAT_INITIAL_MATRIX) {
226     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
227   }
228 
229   for (i=0; i<n; i++) {
230     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
231   }
232   PetscFunctionReturn(0);
233 }
234 
235 /* -------------------------------------------------------*/
236 /* Should check that shapes of vectors and matrices match */
237 /* -------------------------------------------------------*/
238 #include "petscblaslapack.h"
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
242 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
243 {
244   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
245   const PetscScalar *x;
246   PetscScalar       *z,x1,sum;
247   const MatScalar   *v;
248   PetscErrorCode    ierr;
249   PetscInt          mbs=a->mbs,i,j,nz;
250   const PetscInt    *aj=a->j,*ai=a->i,*ib;
251 
252   PetscFunctionBegin;
253   ierr = VecSet(zz,0.0);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
255   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
256 
257   v  = a->a;
258   ib = aj;
259 
260   for (i=0; i<mbs; i++) {
261     nz   = ai[i+1] - ai[i];        /* length of i_th row of A */
262     x1   = x[i];
263     sum  = v[0]*x1;                /* diagonal term */
264     for (j=1; j<nz; j++) {
265       z[ib[j]] += v[j] * x1;       /* (strict lower triangular part of A)*x  */
266       sum      += v[j] * x[ib[j]]; /* (strict upper triangular part of A)*x  */
267     }
268     z[i] += sum;
269     v    += nz;
270     ib   += nz;
271   }
272 
273   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr);
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
281 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
284   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
285   MatScalar      *v;
286   PetscErrorCode ierr;
287   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
288   PetscInt       nonzerorow=0;
289 
290   PetscFunctionBegin;
291   ierr = VecSet(zz,zero);CHKERRQ(ierr);
292   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
293   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
294 
295   v     = a->a;
296   xb = x;
297 
298   for (i=0; i<mbs; i++) {
299     n  = ai[1] - ai[0]; /* length of i_th block row of A */
300     x1 = xb[0]; x2 = xb[1];
301     ib = aj + *ai;
302     jmin = 0;
303     nonzerorow += (n>0);
304     if (*ib == i){     /* (diag of A)*x */
305       z[2*i]   += v[0]*x1 + v[2]*x2;
306       z[2*i+1] += v[2]*x1 + v[3]*x2;
307       v += 4; jmin++;
308     }
309     for (j=jmin; j<n; j++) {
310       /* (strict lower triangular part of A)*x  */
311       cval       = ib[j]*2;
312       z[cval]     += v[0]*x1 + v[1]*x2;
313       z[cval+1]   += v[2]*x1 + v[3]*x2;
314       /* (strict upper triangular part of A)*x  */
315       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
316       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
317       v  += 4;
318     }
319     xb +=2; ai++;
320   }
321 
322   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
324   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
330 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
333   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
334   MatScalar      *v;
335   PetscErrorCode ierr;
336   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
337   PetscInt       nonzerorow=0;
338 
339   PetscFunctionBegin;
340   ierr = VecSet(zz,zero);CHKERRQ(ierr);
341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
342   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
343 
344   v    = a->a;
345   xb   = x;
346 
347   for (i=0; i<mbs; i++) {
348     n  = ai[1] - ai[0]; /* length of i_th block row of A */
349     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
350     ib = aj + *ai;
351     jmin = 0;
352     nonzerorow += (n>0);
353     if (*ib == i){     /* (diag of A)*x */
354       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
355       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
356       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
357       v += 9; jmin++;
358     }
359     for (j=jmin; j<n; j++) {
360       /* (strict lower triangular part of A)*x  */
361       cval       = ib[j]*3;
362       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
363       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
364       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
365       /* (strict upper triangular part of A)*x  */
366       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
367       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
368       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
369       v  += 9;
370     }
371     xb +=3; ai++;
372   }
373 
374   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
375   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
376   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
382 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
383 {
384   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
385   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
386   MatScalar      *v;
387   PetscErrorCode ierr;
388   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
389   PetscInt       nonzerorow=0;
390 
391   PetscFunctionBegin;
392   ierr = VecSet(zz,zero);CHKERRQ(ierr);
393   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
395 
396   v     = a->a;
397   xb = x;
398 
399   for (i=0; i<mbs; i++) {
400     n  = ai[1] - ai[0]; /* length of i_th block row of A */
401     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
402     ib = aj + *ai;
403     jmin = 0;
404     nonzerorow += (n>0);
405     if (*ib == i){     /* (diag of A)*x */
406       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
407       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
408       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
409       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
410       v += 16; jmin++;
411     }
412     for (j=jmin; j<n; j++) {
413       /* (strict lower triangular part of A)*x  */
414       cval       = ib[j]*4;
415       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
416       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
417       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
418       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
419       /* (strict upper triangular part of A)*x  */
420       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
421       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
422       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
423       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
424       v  += 16;
425     }
426     xb +=4; ai++;
427   }
428 
429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
430   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
431   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNCT__
436 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
437 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
438 {
439   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
440   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
441   MatScalar      *v;
442   PetscErrorCode ierr;
443   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
444   PetscInt       nonzerorow=0;
445 
446   PetscFunctionBegin;
447   ierr = VecSet(zz,zero);CHKERRQ(ierr);
448   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
449   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
450 
451   v     = a->a;
452   xb = x;
453 
454   for (i=0; i<mbs; i++) {
455     n  = ai[1] - ai[0]; /* length of i_th block row of A */
456     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
457     ib = aj + *ai;
458     jmin = 0;
459     nonzerorow += (n>0);
460     if (*ib == i){      /* (diag of A)*x */
461       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
462       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
463       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
464       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
465       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
466       v += 25; jmin++;
467     }
468     for (j=jmin; j<n; j++) {
469       /* (strict lower triangular part of A)*x  */
470       cval       = ib[j]*5;
471       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
472       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
473       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
474       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
475       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
476       /* (strict upper triangular part of A)*x  */
477       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
478       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
479       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
480       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
481       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
482       v  += 25;
483     }
484     xb +=5; ai++;
485   }
486 
487   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
488   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
489   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
490   PetscFunctionReturn(0);
491 }
492 
493 
494 #undef __FUNCT__
495 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
496 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
497 {
498   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
499   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
500   MatScalar      *v;
501   PetscErrorCode ierr;
502   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
503   PetscInt       nonzerorow=0;
504 
505   PetscFunctionBegin;
506   ierr = VecSet(zz,zero);CHKERRQ(ierr);
507   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
508   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
509 
510   v     = a->a;
511   xb = x;
512 
513   for (i=0; i<mbs; i++) {
514     n  = ai[1] - ai[0]; /* length of i_th block row of A */
515     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
516     ib = aj + *ai;
517     jmin = 0;
518     nonzerorow += (n>0);
519     if (*ib == i){      /* (diag of A)*x */
520       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
521       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
522       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
523       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
524       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
525       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
526       v += 36; jmin++;
527     }
528     for (j=jmin; j<n; j++) {
529       /* (strict lower triangular part of A)*x  */
530       cval       = ib[j]*6;
531       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
532       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
533       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
534       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
535       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
536       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
537       /* (strict upper triangular part of A)*x  */
538       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
539       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
540       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
541       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
542       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
543       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
544       v  += 36;
545     }
546     xb +=6; ai++;
547   }
548 
549   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
550   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
551   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
552   PetscFunctionReturn(0);
553 }
554 #undef __FUNCT__
555 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
556 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
557 {
558   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
559   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
560   MatScalar      *v;
561   PetscErrorCode ierr;
562   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
563   PetscInt       nonzerorow=0;
564 
565   PetscFunctionBegin;
566   ierr = VecSet(zz,zero);CHKERRQ(ierr);
567   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
568   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
569 
570   v     = a->a;
571   xb = x;
572 
573   for (i=0; i<mbs; i++) {
574     n  = ai[1] - ai[0]; /* length of i_th block row of A */
575     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
576     ib = aj + *ai;
577     jmin = 0;
578     nonzerorow += (n>0);
579     if (*ib == i){      /* (diag of A)*x */
580       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
581       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
582       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
583       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
584       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
585       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
586       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
587       v += 49; jmin++;
588     }
589     for (j=jmin; j<n; j++) {
590       /* (strict lower triangular part of A)*x  */
591       cval       = ib[j]*7;
592       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
593       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
594       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
595       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
596       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
597       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
598       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
599       /* (strict upper triangular part of A)*x  */
600       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
601       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
602       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
603       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
604       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
605       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
606       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
607       v  += 49;
608     }
609     xb +=7; ai++;
610   }
611   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
612   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
613   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
614   PetscFunctionReturn(0);
615 }
616 
617 /*
618     This will not work with MatScalar == float because it calls the BLAS
619 */
620 #undef __FUNCT__
621 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
622 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
623 {
624   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
625   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
626   MatScalar      *v;
627   PetscErrorCode ierr;
628   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
629   PetscInt       nonzerorow=0;
630 
631   PetscFunctionBegin;
632   ierr = VecSet(zz,zero);CHKERRQ(ierr);
633   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
634   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
635 
636   aj   = a->j;
637   v    = a->a;
638   ii   = a->i;
639 
640   if (!a->mult_work) {
641     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
642   }
643   work = a->mult_work;
644 
645   for (i=0; i<mbs; i++) {
646     n     = ii[1] - ii[0]; ncols = n*bs;
647     workt = work; idx=aj+ii[0];
648     nonzerorow += (n>0);
649 
650     /* upper triangular part */
651     for (j=0; j<n; j++) {
652       xb = x_ptr + bs*(*idx++);
653       for (k=0; k<bs; k++) workt[k] = xb[k];
654       workt += bs;
655     }
656     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
657     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
658 
659     /* strict lower triangular part */
660     idx = aj+ii[0];
661     if (*idx == i){
662       ncols -= bs; v += bs2; idx++; n--;
663     }
664 
665     if (ncols > 0){
666       workt = work;
667       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
668       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
669       for (j=0; j<n; j++) {
670         zb = z_ptr + bs*(*idx++);
671         for (k=0; k<bs; k++) zb[k] += workt[k] ;
672         workt += bs;
673       }
674     }
675     x += bs; v += n*bs2; z += bs; ii++;
676   }
677 
678   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
679   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
680   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
681   PetscFunctionReturn(0);
682 }
683 
684 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
685 #undef __FUNCT__
686 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
687 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
688 {
689   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
690   PetscScalar    *x,*z,*xb,x1;
691   MatScalar      *v;
692   PetscErrorCode ierr;
693   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
694   PetscInt       nonzerorow=0;
695 
696   PetscFunctionBegin;
697   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
698   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
699   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
700   v  = a->a;
701   xb = x;
702 
703   for (i=0; i<mbs; i++) {
704     n  = ai[1] - ai[0];  /* length of i_th row of A */
705     x1 = xb[0];
706     ib = aj + *ai;
707     jmin = 0;
708     nonzerorow += (n>0);
709     if (*ib == i) {            /* (diag of A)*x */
710       z[i] += *v++ * x[*ib++]; jmin++;
711     }
712     for (j=jmin; j<n; j++) {
713       cval    = *ib;
714       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
715       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
716     }
717     xb++; ai++;
718   }
719 
720   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
721   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
722 
723   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
724   PetscFunctionReturn(0);
725 }
726 
727 #undef __FUNCT__
728 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
729 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
730 {
731   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
732   PetscScalar    *x,*z,*xb,x1,x2;
733   MatScalar      *v;
734   PetscErrorCode ierr;
735   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
736   PetscInt       nonzerorow=0;
737 
738   PetscFunctionBegin;
739   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
740   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
741   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
742 
743   v  = a->a;
744   xb = x;
745 
746   for (i=0; i<mbs; i++) {
747     n  = ai[1] - ai[0]; /* length of i_th block row of A */
748     x1 = xb[0]; x2 = xb[1];
749     ib = aj + *ai;
750     jmin = 0;
751     nonzerorow += (n>0);
752     if (*ib == i){      /* (diag of A)*x */
753       z[2*i]   += v[0]*x1 + v[2]*x2;
754       z[2*i+1] += v[2]*x1 + v[3]*x2;
755       v += 4; jmin++;
756     }
757     for (j=jmin; j<n; j++) {
758       /* (strict lower triangular part of A)*x  */
759       cval       = ib[j]*2;
760       z[cval]     += v[0]*x1 + v[1]*x2;
761       z[cval+1]   += v[2]*x1 + v[3]*x2;
762       /* (strict upper triangular part of A)*x  */
763       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
764       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
765       v  += 4;
766     }
767     xb +=2; ai++;
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771 
772   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 #undef __FUNCT__
777 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
778 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
779 {
780   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
781   PetscScalar    *x,*z,*xb,x1,x2,x3;
782   MatScalar      *v;
783   PetscErrorCode ierr;
784   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
785   PetscInt       nonzerorow=0;
786 
787   PetscFunctionBegin;
788   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
789   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
790   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
791 
792   v     = a->a;
793   xb = x;
794 
795   for (i=0; i<mbs; i++) {
796     n  = ai[1] - ai[0]; /* length of i_th block row of A */
797     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
798     ib = aj + *ai;
799     jmin = 0;
800     nonzerorow += (n>0);
801     if (*ib == i){     /* (diag of A)*x */
802      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
803      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
804      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
805      v += 9; jmin++;
806     }
807     for (j=jmin; j<n; j++) {
808       /* (strict lower triangular part of A)*x  */
809       cval       = ib[j]*3;
810       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
811       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
812       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
813       /* (strict upper triangular part of A)*x  */
814       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
815       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
816       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
817       v  += 9;
818     }
819     xb +=3; ai++;
820   }
821 
822   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
823   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
824 
825   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
826   PetscFunctionReturn(0);
827 }
828 
829 #undef __FUNCT__
830 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
831 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
832 {
833   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
834   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
835   MatScalar      *v;
836   PetscErrorCode ierr;
837   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
838   PetscInt       nonzerorow=0;
839 
840   PetscFunctionBegin;
841   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
842   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
843   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
844 
845   v     = a->a;
846   xb = x;
847 
848   for (i=0; i<mbs; i++) {
849     n  = ai[1] - ai[0]; /* length of i_th block row of A */
850     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
851     ib = aj + *ai;
852     jmin = 0;
853     nonzerorow += (n>0);
854     if (*ib == i){      /* (diag of A)*x */
855       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
856       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
857       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
858       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
859       v += 16; jmin++;
860     }
861     for (j=jmin; j<n; j++) {
862       /* (strict lower triangular part of A)*x  */
863       cval       = ib[j]*4;
864       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
865       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
866       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
867       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
868       /* (strict upper triangular part of A)*x  */
869       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
870       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
871       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
872       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
873       v  += 16;
874     }
875     xb +=4; ai++;
876   }
877 
878   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
879   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
880 
881   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
882   PetscFunctionReturn(0);
883 }
884 
885 #undef __FUNCT__
886 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
887 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
888 {
889   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
890   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
891   MatScalar      *v;
892   PetscErrorCode ierr;
893   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
894   PetscInt       nonzerorow=0;
895 
896   PetscFunctionBegin;
897   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
898   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
899   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
900 
901   v     = a->a;
902   xb = x;
903 
904   for (i=0; i<mbs; i++) {
905     n  = ai[1] - ai[0]; /* length of i_th block row of A */
906     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
907     ib = aj + *ai;
908     jmin = 0;
909     nonzerorow += (n>0);
910     if (*ib == i){      /* (diag of A)*x */
911       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
912       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
913       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
914       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
915       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
916       v += 25; jmin++;
917     }
918     for (j=jmin; j<n; j++) {
919       /* (strict lower triangular part of A)*x  */
920       cval       = ib[j]*5;
921       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
922       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
923       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
924       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
925       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
926       /* (strict upper triangular part of A)*x  */
927       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
928       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
929       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
930       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
931       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
932       v  += 25;
933     }
934     xb +=5; ai++;
935   }
936 
937   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
938   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
939 
940   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 #undef __FUNCT__
944 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
945 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
946 {
947   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
948   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
949   MatScalar      *v;
950   PetscErrorCode ierr;
951   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
952   PetscInt       nonzerorow=0;
953 
954   PetscFunctionBegin;
955   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
956   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
957   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
958 
959   v     = a->a;
960   xb = x;
961 
962   for (i=0; i<mbs; i++) {
963     n  = ai[1] - ai[0]; /* length of i_th block row of A */
964     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
965     ib = aj + *ai;
966     jmin = 0;
967     nonzerorow += (n>0);
968     if (*ib == i){     /* (diag of A)*x */
969       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
970       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
971       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
972       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
973       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
974       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
975       v += 36; jmin++;
976     }
977     for (j=jmin; j<n; j++) {
978       /* (strict lower triangular part of A)*x  */
979       cval       = ib[j]*6;
980       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
981       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
982       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
983       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
984       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
985       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
986       /* (strict upper triangular part of A)*x  */
987       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
988       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
989       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
990       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
991       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
992       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
993       v  += 36;
994     }
995     xb +=6; ai++;
996   }
997 
998   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1000 
1001   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1007 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1008 {
1009   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1010   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1011   MatScalar      *v;
1012   PetscErrorCode ierr;
1013   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1014   PetscInt       nonzerorow=0;
1015 
1016   PetscFunctionBegin;
1017   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1018   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1019   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1020 
1021   v     = a->a;
1022   xb = x;
1023 
1024   for (i=0; i<mbs; i++) {
1025     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1026     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1027     ib = aj + *ai;
1028     jmin = 0;
1029     nonzerorow += (n>0);
1030     if (*ib == i){     /* (diag of A)*x */
1031       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1032       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1033       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1034       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1035       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1036       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1037       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1038       v += 49; jmin++;
1039     }
1040     for (j=jmin; j<n; j++) {
1041       /* (strict lower triangular part of A)*x  */
1042       cval       = ib[j]*7;
1043       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1044       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1045       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1046       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1047       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1048       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1049       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1050       /* (strict upper triangular part of A)*x  */
1051       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1052       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1053       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1054       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1055       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1056       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1057       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1058       v  += 49;
1059     }
1060     xb +=7; ai++;
1061   }
1062 
1063   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1064   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1065 
1066   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1067   PetscFunctionReturn(0);
1068 }
1069 
1070 #undef __FUNCT__
1071 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1072 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1073 {
1074   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1075   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1076   MatScalar      *v;
1077   PetscErrorCode ierr;
1078   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1079   PetscInt       nonzerorow=0;
1080 
1081   PetscFunctionBegin;
1082   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1083   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1084   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1085 
1086   aj   = a->j;
1087   v    = a->a;
1088   ii   = a->i;
1089 
1090   if (!a->mult_work) {
1091     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1092   }
1093   work = a->mult_work;
1094 
1095 
1096   for (i=0; i<mbs; i++) {
1097     n     = ii[1] - ii[0]; ncols = n*bs;
1098     workt = work; idx=aj+ii[0];
1099     nonzerorow += (n>0);
1100 
1101     /* upper triangular part */
1102     for (j=0; j<n; j++) {
1103       xb = x_ptr + bs*(*idx++);
1104       for (k=0; k<bs; k++) workt[k] = xb[k];
1105       workt += bs;
1106     }
1107     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1108     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1109 
1110     /* strict lower triangular part */
1111     idx = aj+ii[0];
1112     if (*idx == i){
1113       ncols -= bs; v += bs2; idx++; n--;
1114     }
1115     if (ncols > 0){
1116       workt = work;
1117       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1118       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1119       for (j=0; j<n; j++) {
1120         zb = z_ptr + bs*(*idx++);
1121         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1122         workt += bs;
1123       }
1124     }
1125 
1126     x += bs; v += n*bs2; z += bs; ii++;
1127   }
1128 
1129   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1130   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1131 
1132   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1133   PetscFunctionReturn(0);
1134 }
1135 
1136 #undef __FUNCT__
1137 #define __FUNCT__ "MatScale_SeqSBAIJ"
1138 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1139 {
1140   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1141   PetscScalar    oalpha = alpha;
1142   PetscErrorCode ierr;
1143   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1144 
1145   PetscFunctionBegin;
1146   BLASscal_(&totalnz,&oalpha,a->a,&one);
1147   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 #undef __FUNCT__
1152 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1153 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1154 {
1155   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1156   MatScalar      *v = a->a;
1157   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1158   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1159   PetscErrorCode ierr;
1160   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1161 
1162   PetscFunctionBegin;
1163   if (type == NORM_FROBENIUS) {
1164     for (k=0; k<mbs; k++){
1165       jmin = a->i[k]; jmax = a->i[k+1];
1166       col  = aj + jmin;
1167       if (*col == k){         /* diagonal block */
1168         for (i=0; i<bs2; i++){
1169 #if defined(PETSC_USE_COMPLEX)
1170           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1171 #else
1172           sum_diag += (*v)*(*v); v++;
1173 #endif
1174         }
1175         jmin++;
1176       }
1177       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1178         for (i=0; i<bs2; i++){
1179 #if defined(PETSC_USE_COMPLEX)
1180           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1181 #else
1182           sum_off += (*v)*(*v); v++;
1183 #endif
1184         }
1185       }
1186     }
1187     *norm = sqrt(sum_diag + 2*sum_off);
1188   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1189     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1190     jl   = il + mbs;
1191     sum  = (PetscReal*)(jl + mbs);
1192     for (i=0; i<mbs; i++) jl[i] = mbs;
1193     il[0] = 0;
1194 
1195     *norm = 0.0;
1196     for (k=0; k<mbs; k++) { /* k_th block row */
1197       for (j=0; j<bs; j++) sum[j]=0.0;
1198       /*-- col sum --*/
1199       i = jl[k]; /* first |A(i,k)| to be added */
1200       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1201                   at step k */
1202       while (i<mbs){
1203         nexti = jl[i];  /* next block row to be added */
1204         ik    = il[i];  /* block index of A(i,k) in the array a */
1205         for (j=0; j<bs; j++){
1206           v = a->a + ik*bs2 + j*bs;
1207           for (k1=0; k1<bs; k1++) {
1208             sum[j] += PetscAbsScalar(*v); v++;
1209           }
1210         }
1211         /* update il, jl */
1212         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1213         jmax = a->i[i+1];
1214         if (jmin < jmax){
1215           il[i] = jmin;
1216           j   = a->j[jmin];
1217           jl[i] = jl[j]; jl[j]=i;
1218         }
1219         i = nexti;
1220       }
1221       /*-- row sum --*/
1222       jmin = a->i[k]; jmax = a->i[k+1];
1223       for (i=jmin; i<jmax; i++) {
1224         for (j=0; j<bs; j++){
1225           v = a->a + i*bs2 + j;
1226           for (k1=0; k1<bs; k1++){
1227             sum[j] += PetscAbsScalar(*v); v += bs;
1228           }
1229         }
1230       }
1231       /* add k_th block row to il, jl */
1232       col = aj+jmin;
1233       if (*col == k) jmin++;
1234       if (jmin < jmax){
1235         il[k] = jmin;
1236         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1237       }
1238       for (j=0; j<bs; j++){
1239         if (sum[j] > *norm) *norm = sum[j];
1240       }
1241     }
1242     ierr = PetscFree(il);CHKERRQ(ierr);
1243   } else {
1244     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1245   }
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 #undef __FUNCT__
1250 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1251 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1252 {
1253   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1254   PetscErrorCode ierr;
1255 
1256   PetscFunctionBegin;
1257 
1258   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1259   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1260     *flg = PETSC_FALSE;
1261     PetscFunctionReturn(0);
1262   }
1263 
1264   /* if the a->i are the same */
1265   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1266   if (!*flg) {
1267     PetscFunctionReturn(0);
1268   }
1269 
1270   /* if a->j are the same */
1271   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1272   if (!*flg) {
1273     PetscFunctionReturn(0);
1274   }
1275   /* if a->a are the same */
1276   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNCT__
1281 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1282 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1283 {
1284   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1285   PetscErrorCode ierr;
1286   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1287   PetscScalar    *x,zero = 0.0;
1288   MatScalar      *aa,*aa_j;
1289 
1290   PetscFunctionBegin;
1291   bs   = A->rmap->bs;
1292   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1293 
1294   aa   = a->a;
1295   ai   = a->i;
1296   aj   = a->j;
1297   ambs = a->mbs;
1298   bs2  = a->bs2;
1299 
1300   ierr = VecSet(v,zero);CHKERRQ(ierr);
1301   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1302   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1303   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1304   for (i=0; i<ambs; i++) {
1305     j=ai[i];
1306     if (aj[j] == i) {             /* if this is a diagonal element */
1307       row  = i*bs;
1308       aa_j = aa + j*bs2;
1309       if (A->factor && bs==1){
1310         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1311       } else {
1312         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1313       }
1314     }
1315   }
1316 
1317   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1318   PetscFunctionReturn(0);
1319 }
1320 
1321 #undef __FUNCT__
1322 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1323 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1324 {
1325   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1326   PetscScalar    *l,x,*li,*ri;
1327   MatScalar      *aa,*v;
1328   PetscErrorCode ierr;
1329   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1330   PetscTruth     flg;
1331 
1332   PetscFunctionBegin;
1333   if (ll != rr){
1334     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1335     if (!flg)
1336       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1337   }
1338   if (!ll) PetscFunctionReturn(0);
1339   ai  = a->i;
1340   aj  = a->j;
1341   aa  = a->a;
1342   m   = A->rmap->N;
1343   bs  = A->rmap->bs;
1344   mbs = a->mbs;
1345   bs2 = a->bs2;
1346 
1347   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1348   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1349   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1350   for (i=0; i<mbs; i++) { /* for each block row */
1351     M  = ai[i+1] - ai[i];
1352     li = l + i*bs;
1353     v  = aa + bs2*ai[i];
1354     for (j=0; j<M; j++) { /* for each block */
1355       ri = l + bs*aj[ai[i]+j];
1356       for (k=0; k<bs; k++) {
1357         x = ri[k];
1358         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1359       }
1360     }
1361   }
1362   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1363   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1364   PetscFunctionReturn(0);
1365 }
1366 
1367 #undef __FUNCT__
1368 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1369 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1370 {
1371   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1372 
1373   PetscFunctionBegin;
1374   info->block_size     = a->bs2;
1375   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1376   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1377   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1378   info->assemblies   = A->num_ass;
1379   info->mallocs      = a->reallocs;
1380   info->memory       = ((PetscObject)A)->mem;
1381   if (A->factor) {
1382     info->fill_ratio_given  = A->info.fill_ratio_given;
1383     info->fill_ratio_needed = A->info.fill_ratio_needed;
1384     info->factor_mallocs    = A->info.factor_mallocs;
1385   } else {
1386     info->fill_ratio_given  = 0;
1387     info->fill_ratio_needed = 0;
1388     info->factor_mallocs    = 0;
1389   }
1390   PetscFunctionReturn(0);
1391 }
1392 
1393 
1394 #undef __FUNCT__
1395 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1396 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1397 {
1398   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1399   PetscErrorCode ierr;
1400 
1401   PetscFunctionBegin;
1402   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1403   PetscFunctionReturn(0);
1404 }
1405 
1406 #undef __FUNCT__
1407 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1408 /*
1409    This code does not work since it only checks the upper triangular part of
1410   the matrix. Hence it is not listed in the function table.
1411 */
1412 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1413 {
1414   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1415   PetscErrorCode ierr;
1416   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1417   PetscReal      atmp;
1418   MatScalar      *aa;
1419   PetscScalar    *x;
1420   PetscInt       ncols,brow,bcol,krow,kcol;
1421 
1422   PetscFunctionBegin;
1423   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1424   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1425   bs   = A->rmap->bs;
1426   aa   = a->a;
1427   ai   = a->i;
1428   aj   = a->j;
1429   mbs = a->mbs;
1430 
1431   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1432   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1433   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1434   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1435   for (i=0; i<mbs; i++) {
1436     ncols = ai[1] - ai[0]; ai++;
1437     brow  = bs*i;
1438     for (j=0; j<ncols; j++){
1439       bcol = bs*(*aj);
1440       for (kcol=0; kcol<bs; kcol++){
1441         col = bcol + kcol;      /* col index */
1442         for (krow=0; krow<bs; krow++){
1443           atmp = PetscAbsScalar(*aa); aa++;
1444           row = brow + krow;    /* row index */
1445           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1446           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1447         }
1448       }
1449       aj++;
1450     }
1451   }
1452   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1453   PetscFunctionReturn(0);
1454 }
1455