xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 #include "petscblaslapack.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   const PetscInt *idx;
17   PetscBT        table,table0;
18 
19   PetscFunctionBegin;
20   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21   mbs = a->mbs;
22   ai  = a->i;
23   aj  = a->j;
24   bs  = A->rmap->bs;
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103   const PetscInt *irow,*aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscTruth     flag,sorted;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126       for (k=kstart; k<kend; k++) {
127         if (ssmap[aj[k]]) {
128           lens[i]++;
129         }
130       }
131     }
132   /* Create and fill new matrix */
133   if (scall == MAT_REUSE_MATRIX) {
134     c = (Mat_SeqSBAIJ *)((*B)->data);
135 
136     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
138     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
140     C = *B;
141   } else {
142     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
143     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
144     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
145     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
146   }
147   c = (Mat_SeqSBAIJ *)(C->data);
148   for (i=0; i<nrows; i++) {
149     row    = irow[i];
150     kstart = ai[row];
151     kend   = kstart + a->ilen[row];
152     mat_i  = c->i[i];
153     mat_j  = c->j + mat_i;
154     mat_a  = c->a + mat_i*bs2;
155     mat_ilen = c->ilen + i;
156     for (k=kstart; k<kend; k++) {
157       if ((tcol=ssmap[a->j[k]])) {
158         *mat_j++ = tcol - 1;
159         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
160         mat_a   += bs2;
161         (*mat_ilen)++;
162       }
163     }
164   }
165 
166   /* Free work space */
167   ierr = PetscFree(smap);CHKERRQ(ierr);
168   ierr = PetscFree(lens);CHKERRQ(ierr);
169   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171 
172   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
173   *B = C;
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
180 {
181   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
182   IS             is1;
183   PetscErrorCode ierr;
184   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
185   const PetscInt *irow;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
196   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
197   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
198 
199   count = 0;
200   for (i=0; i<a->mbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
202     if (vary[i]==bs) iary[count++] = i;
203   }
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(is1);CHKERRQ(ierr);
210   PetscFunctionReturn(0);
211 }
212 
213 #undef __FUNCT__
214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
216 {
217   PetscErrorCode ierr;
218   PetscInt       i;
219 
220   PetscFunctionBegin;
221   if (scall == MAT_INITIAL_MATRIX) {
222     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
223   }
224 
225   for (i=0; i<n; i++) {
226     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
227   }
228   PetscFunctionReturn(0);
229 }
230 
231 /* -------------------------------------------------------*/
232 /* Should check that shapes of vectors and matrices match */
233 /* -------------------------------------------------------*/
234 
235 #undef __FUNCT__
236 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
237 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
238 {
239   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
240   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
241   MatScalar      *v;
242   PetscErrorCode ierr;
243   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
244   PetscInt       nonzerorow=0;
245 
246   PetscFunctionBegin;
247   ierr = VecSet(zz,zero);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
250 
251   v     = a->a;
252   xb = x;
253 
254   for (i=0; i<mbs; i++) {
255     n  = ai[1] - ai[0]; /* length of i_th block row of A */
256     x1 = xb[0]; x2 = xb[1];
257     ib = aj + *ai;
258     jmin = 0;
259     nonzerorow += (n>0);
260     if (*ib == i){     /* (diag of A)*x */
261       z[2*i]   += v[0]*x1 + v[2]*x2;
262       z[2*i+1] += v[2]*x1 + v[3]*x2;
263       v += 4; jmin++;
264     }
265     for (j=jmin; j<n; j++) {
266       /* (strict lower triangular part of A)*x  */
267       cval       = ib[j]*2;
268       z[cval]     += v[0]*x1 + v[1]*x2;
269       z[cval+1]   += v[2]*x1 + v[3]*x2;
270       /* (strict upper triangular part of A)*x  */
271       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
272       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
273       v  += 4;
274     }
275     xb +=2; ai++;
276   }
277 
278   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
279   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
280   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
281   PetscFunctionReturn(0);
282 }
283 
284 #undef __FUNCT__
285 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
286 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
287 {
288   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
289   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
290   MatScalar      *v;
291   PetscErrorCode ierr;
292   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
293   PetscInt       nonzerorow=0;
294 
295   PetscFunctionBegin;
296   ierr = VecSet(zz,zero);CHKERRQ(ierr);
297   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
298   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
299 
300   v    = a->a;
301   xb   = x;
302 
303   for (i=0; i<mbs; i++) {
304     n  = ai[1] - ai[0]; /* length of i_th block row of A */
305     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
306     ib = aj + *ai;
307     jmin = 0;
308     nonzerorow += (n>0);
309     if (*ib == i){     /* (diag of A)*x */
310       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
311       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
312       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
313       v += 9; jmin++;
314     }
315     for (j=jmin; j<n; j++) {
316       /* (strict lower triangular part of A)*x  */
317       cval       = ib[j]*3;
318       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
319       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
320       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
321       /* (strict upper triangular part of A)*x  */
322       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
323       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
324       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
325       v  += 9;
326     }
327     xb +=3; ai++;
328   }
329 
330   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
331   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
332   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
338 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
339 {
340   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
341   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
342   MatScalar      *v;
343   PetscErrorCode ierr;
344   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
345   PetscInt       nonzerorow=0;
346 
347   PetscFunctionBegin;
348   ierr = VecSet(zz,zero);CHKERRQ(ierr);
349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
351 
352   v     = a->a;
353   xb = x;
354 
355   for (i=0; i<mbs; i++) {
356     n  = ai[1] - ai[0]; /* length of i_th block row of A */
357     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
358     ib = aj + *ai;
359     jmin = 0;
360     nonzerorow += (n>0);
361     if (*ib == i){     /* (diag of A)*x */
362       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
363       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
364       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
365       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
366       v += 16; jmin++;
367     }
368     for (j=jmin; j<n; j++) {
369       /* (strict lower triangular part of A)*x  */
370       cval       = ib[j]*4;
371       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
372       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
373       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
374       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
375       /* (strict upper triangular part of A)*x  */
376       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
377       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
378       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
379       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
380       v  += 16;
381     }
382     xb +=4; ai++;
383   }
384 
385   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
386   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
387   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
388   PetscFunctionReturn(0);
389 }
390 
391 #undef __FUNCT__
392 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
393 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
394 {
395   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
396   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
397   MatScalar      *v;
398   PetscErrorCode ierr;
399   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
400   PetscInt       nonzerorow=0;
401 
402   PetscFunctionBegin;
403   ierr = VecSet(zz,zero);CHKERRQ(ierr);
404   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
405   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
406 
407   v     = a->a;
408   xb = x;
409 
410   for (i=0; i<mbs; i++) {
411     n  = ai[1] - ai[0]; /* length of i_th block row of A */
412     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
413     ib = aj + *ai;
414     jmin = 0;
415     nonzerorow += (n>0);
416     if (*ib == i){      /* (diag of A)*x */
417       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
418       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
419       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
420       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
421       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
422       v += 25; jmin++;
423     }
424     for (j=jmin; j<n; j++) {
425       /* (strict lower triangular part of A)*x  */
426       cval       = ib[j]*5;
427       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
428       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
429       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
430       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
431       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
432       /* (strict upper triangular part of A)*x  */
433       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
434       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
435       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
436       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
437       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
438       v  += 25;
439     }
440     xb +=5; ai++;
441   }
442 
443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
444   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
445   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
446   PetscFunctionReturn(0);
447 }
448 
449 
450 #undef __FUNCT__
451 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
452 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
453 {
454   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
455   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
456   MatScalar      *v;
457   PetscErrorCode ierr;
458   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
459   PetscInt       nonzerorow=0;
460 
461   PetscFunctionBegin;
462   ierr = VecSet(zz,zero);CHKERRQ(ierr);
463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
464   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
465 
466   v     = a->a;
467   xb = x;
468 
469   for (i=0; i<mbs; i++) {
470     n  = ai[1] - ai[0]; /* length of i_th block row of A */
471     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
472     ib = aj + *ai;
473     jmin = 0;
474     nonzerorow += (n>0);
475     if (*ib == i){      /* (diag of A)*x */
476       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
477       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
478       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
479       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
480       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
481       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
482       v += 36; jmin++;
483     }
484     for (j=jmin; j<n; j++) {
485       /* (strict lower triangular part of A)*x  */
486       cval       = ib[j]*6;
487       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
488       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
489       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
490       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
491       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
492       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
493       /* (strict upper triangular part of A)*x  */
494       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
495       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
496       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
497       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
498       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
499       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
500       v  += 36;
501     }
502     xb +=6; ai++;
503   }
504 
505   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
506   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
507   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
508   PetscFunctionReturn(0);
509 }
510 #undef __FUNCT__
511 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
512 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
513 {
514   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
515   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
516   MatScalar      *v;
517   PetscErrorCode ierr;
518   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
519   PetscInt       nonzerorow=0;
520 
521   PetscFunctionBegin;
522   ierr = VecSet(zz,zero);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
525 
526   v     = a->a;
527   xb = x;
528 
529   for (i=0; i<mbs; i++) {
530     n  = ai[1] - ai[0]; /* length of i_th block row of A */
531     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
532     ib = aj + *ai;
533     jmin = 0;
534     nonzerorow += (n>0);
535     if (*ib == i){      /* (diag of A)*x */
536       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
537       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
538       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
539       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
540       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
541       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
542       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
543       v += 49; jmin++;
544     }
545     for (j=jmin; j<n; j++) {
546       /* (strict lower triangular part of A)*x  */
547       cval       = ib[j]*7;
548       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
549       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
550       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
551       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
552       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
553       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
554       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
555       /* (strict upper triangular part of A)*x  */
556       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
557       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
558       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
559       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
560       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
561       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
562       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
563       v  += 49;
564     }
565     xb +=7; ai++;
566   }
567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
568   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
570   PetscFunctionReturn(0);
571 }
572 
573 /*
574     This will not work with MatScalar == float because it calls the BLAS
575 */
576 #undef __FUNCT__
577 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
578 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
581   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
582   MatScalar      *v;
583   PetscErrorCode ierr;
584   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
585   PetscInt       nonzerorow=0;
586 
587   PetscFunctionBegin;
588   ierr = VecSet(zz,zero);CHKERRQ(ierr);
589   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
590   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
591 
592   aj   = a->j;
593   v    = a->a;
594   ii   = a->i;
595 
596   if (!a->mult_work) {
597     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
598   }
599   work = a->mult_work;
600 
601   for (i=0; i<mbs; i++) {
602     n     = ii[1] - ii[0]; ncols = n*bs;
603     workt = work; idx=aj+ii[0];
604     nonzerorow += (n>0);
605 
606     /* upper triangular part */
607     for (j=0; j<n; j++) {
608       xb = x_ptr + bs*(*idx++);
609       for (k=0; k<bs; k++) workt[k] = xb[k];
610       workt += bs;
611     }
612     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
613     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
614 
615     /* strict lower triangular part */
616     idx = aj+ii[0];
617     if (*idx == i){
618       ncols -= bs; v += bs2; idx++; n--;
619     }
620 
621     if (ncols > 0){
622       workt = work;
623       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
624       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
625       for (j=0; j<n; j++) {
626         zb = z_ptr + bs*(*idx++);
627         for (k=0; k<bs; k++) zb[k] += workt[k] ;
628         workt += bs;
629       }
630     }
631     x += bs; v += n*bs2; z += bs; ii++;
632   }
633 
634   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
635   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
636   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
637   PetscFunctionReturn(0);
638 }
639 
640 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
641 #undef __FUNCT__
642 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
643 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
644 {
645   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
646   PetscScalar    *x,*z,*xb,x1;
647   MatScalar      *v;
648   PetscErrorCode ierr;
649   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
650   PetscInt       nonzerorow=0;
651 
652   PetscFunctionBegin;
653   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
655   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
656   v  = a->a;
657   xb = x;
658 
659   for (i=0; i<mbs; i++) {
660     n  = ai[1] - ai[0];  /* length of i_th row of A */
661     x1 = xb[0];
662     ib = aj + *ai;
663     jmin = 0;
664     nonzerorow += (n>0);
665     if (*ib == i) {            /* (diag of A)*x */
666       z[i] += *v++ * x[*ib++]; jmin++;
667     }
668     for (j=jmin; j<n; j++) {
669       cval    = *ib;
670       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
671       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
672     }
673     xb++; ai++;
674   }
675 
676   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
677   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
678 
679   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
680   PetscFunctionReturn(0);
681 }
682 
683 #undef __FUNCT__
684 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
685 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
686 {
687   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
688   PetscScalar    *x,*z,*xb,x1,x2;
689   MatScalar      *v;
690   PetscErrorCode ierr;
691   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
692   PetscInt       nonzerorow=0;
693 
694   PetscFunctionBegin;
695   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
696   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
697   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
698 
699   v  = a->a;
700   xb = x;
701 
702   for (i=0; i<mbs; i++) {
703     n  = ai[1] - ai[0]; /* length of i_th block row of A */
704     x1 = xb[0]; x2 = xb[1];
705     ib = aj + *ai;
706     jmin = 0;
707     nonzerorow += (n>0);
708     if (*ib == i){      /* (diag of A)*x */
709       z[2*i]   += v[0]*x1 + v[2]*x2;
710       z[2*i+1] += v[2]*x1 + v[3]*x2;
711       v += 4; jmin++;
712     }
713     for (j=jmin; j<n; j++) {
714       /* (strict lower triangular part of A)*x  */
715       cval       = ib[j]*2;
716       z[cval]     += v[0]*x1 + v[1]*x2;
717       z[cval+1]   += v[2]*x1 + v[3]*x2;
718       /* (strict upper triangular part of A)*x  */
719       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
720       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
721       v  += 4;
722     }
723     xb +=2; ai++;
724   }
725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
726   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
727 
728   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
729   PetscFunctionReturn(0);
730 }
731 
732 #undef __FUNCT__
733 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
734 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
735 {
736   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
737   PetscScalar    *x,*z,*xb,x1,x2,x3;
738   MatScalar      *v;
739   PetscErrorCode ierr;
740   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
741   PetscInt       nonzerorow=0;
742 
743   PetscFunctionBegin;
744   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
746   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
747 
748   v     = a->a;
749   xb = x;
750 
751   for (i=0; i<mbs; i++) {
752     n  = ai[1] - ai[0]; /* length of i_th block row of A */
753     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
754     ib = aj + *ai;
755     jmin = 0;
756     nonzerorow += (n>0);
757     if (*ib == i){     /* (diag of A)*x */
758      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
759      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
760      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
761      v += 9; jmin++;
762     }
763     for (j=jmin; j<n; j++) {
764       /* (strict lower triangular part of A)*x  */
765       cval       = ib[j]*3;
766       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
767       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
768       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
769       /* (strict upper triangular part of A)*x  */
770       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
771       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
772       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
773       v  += 9;
774     }
775     xb +=3; ai++;
776   }
777 
778   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
779   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
780 
781   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
782   PetscFunctionReturn(0);
783 }
784 
785 #undef __FUNCT__
786 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
787 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
788 {
789   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
790   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
791   MatScalar      *v;
792   PetscErrorCode ierr;
793   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
794   PetscInt       nonzerorow=0;
795 
796   PetscFunctionBegin;
797   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
800 
801   v     = a->a;
802   xb = x;
803 
804   for (i=0; i<mbs; i++) {
805     n  = ai[1] - ai[0]; /* length of i_th block row of A */
806     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
807     ib = aj + *ai;
808     jmin = 0;
809     nonzerorow += (n>0);
810     if (*ib == i){      /* (diag of A)*x */
811       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
812       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
813       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
814       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
815       v += 16; jmin++;
816     }
817     for (j=jmin; j<n; j++) {
818       /* (strict lower triangular part of A)*x  */
819       cval       = ib[j]*4;
820       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
821       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
822       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
823       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
824       /* (strict upper triangular part of A)*x  */
825       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
826       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
827       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
828       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
829       v  += 16;
830     }
831     xb +=4; ai++;
832   }
833 
834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
835   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
836 
837   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
838   PetscFunctionReturn(0);
839 }
840 
841 #undef __FUNCT__
842 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
843 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
844 {
845   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
846   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
847   MatScalar      *v;
848   PetscErrorCode ierr;
849   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
850   PetscInt       nonzerorow=0;
851 
852   PetscFunctionBegin;
853   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
854   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
855   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
856 
857   v     = a->a;
858   xb = x;
859 
860   for (i=0; i<mbs; i++) {
861     n  = ai[1] - ai[0]; /* length of i_th block row of A */
862     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
863     ib = aj + *ai;
864     jmin = 0;
865     nonzerorow += (n>0);
866     if (*ib == i){      /* (diag of A)*x */
867       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
868       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
869       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
870       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
871       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
872       v += 25; jmin++;
873     }
874     for (j=jmin; j<n; j++) {
875       /* (strict lower triangular part of A)*x  */
876       cval       = ib[j]*5;
877       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
878       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
879       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
880       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
881       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
882       /* (strict upper triangular part of A)*x  */
883       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
884       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
885       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
886       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
887       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
888       v  += 25;
889     }
890     xb +=5; ai++;
891   }
892 
893   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
894   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
895 
896   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
897   PetscFunctionReturn(0);
898 }
899 #undef __FUNCT__
900 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
901 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
904   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
905   MatScalar      *v;
906   PetscErrorCode ierr;
907   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
908   PetscInt       nonzerorow=0;
909 
910   PetscFunctionBegin;
911   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
912   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
913   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
914 
915   v     = a->a;
916   xb = x;
917 
918   for (i=0; i<mbs; i++) {
919     n  = ai[1] - ai[0]; /* length of i_th block row of A */
920     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
921     ib = aj + *ai;
922     jmin = 0;
923     nonzerorow += (n>0);
924     if (*ib == i){     /* (diag of A)*x */
925       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
926       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
927       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
928       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
929       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
930       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
931       v += 36; jmin++;
932     }
933     for (j=jmin; j<n; j++) {
934       /* (strict lower triangular part of A)*x  */
935       cval       = ib[j]*6;
936       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
937       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
938       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
939       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
940       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
941       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
942       /* (strict upper triangular part of A)*x  */
943       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
944       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
945       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
946       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
947       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
948       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
949       v  += 36;
950     }
951     xb +=6; ai++;
952   }
953 
954   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
955   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
956 
957   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
958   PetscFunctionReturn(0);
959 }
960 
961 #undef __FUNCT__
962 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
963 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
964 {
965   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
966   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
967   MatScalar      *v;
968   PetscErrorCode ierr;
969   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
970   PetscInt       nonzerorow=0;
971 
972   PetscFunctionBegin;
973   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
974   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
975   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
976 
977   v     = a->a;
978   xb = x;
979 
980   for (i=0; i<mbs; i++) {
981     n  = ai[1] - ai[0]; /* length of i_th block row of A */
982     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
983     ib = aj + *ai;
984     jmin = 0;
985     nonzerorow += (n>0);
986     if (*ib == i){     /* (diag of A)*x */
987       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
988       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
989       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
990       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
991       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
992       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
993       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
994       v += 49; jmin++;
995     }
996     for (j=jmin; j<n; j++) {
997       /* (strict lower triangular part of A)*x  */
998       cval       = ib[j]*7;
999       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1000       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1001       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1002       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1003       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1004       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1005       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1006       /* (strict upper triangular part of A)*x  */
1007       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1008       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1009       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1010       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1011       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1012       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1013       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1014       v  += 49;
1015     }
1016     xb +=7; ai++;
1017   }
1018 
1019   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1020   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1021 
1022   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1023   PetscFunctionReturn(0);
1024 }
1025 
1026 #undef __FUNCT__
1027 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1028 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1029 {
1030   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1031   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1032   MatScalar      *v;
1033   PetscErrorCode ierr;
1034   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1035   PetscInt       nonzerorow=0;
1036 
1037   PetscFunctionBegin;
1038   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1039   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1040   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1041 
1042   aj   = a->j;
1043   v    = a->a;
1044   ii   = a->i;
1045 
1046   if (!a->mult_work) {
1047     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1048   }
1049   work = a->mult_work;
1050 
1051 
1052   for (i=0; i<mbs; i++) {
1053     n     = ii[1] - ii[0]; ncols = n*bs;
1054     workt = work; idx=aj+ii[0];
1055     nonzerorow += (n>0);
1056 
1057     /* upper triangular part */
1058     for (j=0; j<n; j++) {
1059       xb = x_ptr + bs*(*idx++);
1060       for (k=0; k<bs; k++) workt[k] = xb[k];
1061       workt += bs;
1062     }
1063     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1064     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1065 
1066     /* strict lower triangular part */
1067     idx = aj+ii[0];
1068     if (*idx == i){
1069       ncols -= bs; v += bs2; idx++; n--;
1070     }
1071     if (ncols > 0){
1072       workt = work;
1073       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1074       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1075       for (j=0; j<n; j++) {
1076         zb = z_ptr + bs*(*idx++);
1077         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1078         workt += bs;
1079       }
1080     }
1081 
1082     x += bs; v += n*bs2; z += bs; ii++;
1083   }
1084 
1085   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1086   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1087 
1088   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1089   PetscFunctionReturn(0);
1090 }
1091 
1092 #undef __FUNCT__
1093 #define __FUNCT__ "MatScale_SeqSBAIJ"
1094 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1095 {
1096   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1097   PetscScalar    oalpha = alpha;
1098   PetscErrorCode ierr;
1099   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1100 
1101   PetscFunctionBegin;
1102   BLASscal_(&totalnz,&oalpha,a->a,&one);
1103   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 #undef __FUNCT__
1108 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1109 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1110 {
1111   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1112   MatScalar      *v = a->a;
1113   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1114   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1115   PetscErrorCode ierr;
1116   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1117 
1118   PetscFunctionBegin;
1119   if (type == NORM_FROBENIUS) {
1120     for (k=0; k<mbs; k++){
1121       jmin = a->i[k]; jmax = a->i[k+1];
1122       col  = aj + jmin;
1123       if (*col == k){         /* diagonal block */
1124         for (i=0; i<bs2; i++){
1125 #if defined(PETSC_USE_COMPLEX)
1126           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1127 #else
1128           sum_diag += (*v)*(*v); v++;
1129 #endif
1130         }
1131         jmin++;
1132       }
1133       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1134         for (i=0; i<bs2; i++){
1135 #if defined(PETSC_USE_COMPLEX)
1136           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1137 #else
1138           sum_off += (*v)*(*v); v++;
1139 #endif
1140         }
1141       }
1142     }
1143     *norm = sqrt(sum_diag + 2*sum_off);
1144   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1145     ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1146     for (i=0; i<mbs; i++) jl[i] = mbs;
1147     il[0] = 0;
1148 
1149     *norm = 0.0;
1150     for (k=0; k<mbs; k++) { /* k_th block row */
1151       for (j=0; j<bs; j++) sum[j]=0.0;
1152       /*-- col sum --*/
1153       i = jl[k]; /* first |A(i,k)| to be added */
1154       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1155                   at step k */
1156       while (i<mbs){
1157         nexti = jl[i];  /* next block row to be added */
1158         ik    = il[i];  /* block index of A(i,k) in the array a */
1159         for (j=0; j<bs; j++){
1160           v = a->a + ik*bs2 + j*bs;
1161           for (k1=0; k1<bs; k1++) {
1162             sum[j] += PetscAbsScalar(*v); v++;
1163           }
1164         }
1165         /* update il, jl */
1166         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1167         jmax = a->i[i+1];
1168         if (jmin < jmax){
1169           il[i] = jmin;
1170           j   = a->j[jmin];
1171           jl[i] = jl[j]; jl[j]=i;
1172         }
1173         i = nexti;
1174       }
1175       /*-- row sum --*/
1176       jmin = a->i[k]; jmax = a->i[k+1];
1177       for (i=jmin; i<jmax; i++) {
1178         for (j=0; j<bs; j++){
1179           v = a->a + i*bs2 + j;
1180           for (k1=0; k1<bs; k1++){
1181             sum[j] += PetscAbsScalar(*v); v += bs;
1182           }
1183         }
1184       }
1185       /* add k_th block row to il, jl */
1186       col = aj+jmin;
1187       if (*col == k) jmin++;
1188       if (jmin < jmax){
1189         il[k] = jmin;
1190         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1191       }
1192       for (j=0; j<bs; j++){
1193         if (sum[j] > *norm) *norm = sum[j];
1194       }
1195     }
1196     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1197   } else {
1198     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1199   }
1200   PetscFunctionReturn(0);
1201 }
1202 
1203 #undef __FUNCT__
1204 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1205 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1206 {
1207   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1208   PetscErrorCode ierr;
1209 
1210   PetscFunctionBegin;
1211 
1212   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1213   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1214     *flg = PETSC_FALSE;
1215     PetscFunctionReturn(0);
1216   }
1217 
1218   /* if the a->i are the same */
1219   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1220   if (!*flg) {
1221     PetscFunctionReturn(0);
1222   }
1223 
1224   /* if a->j are the same */
1225   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1226   if (!*flg) {
1227     PetscFunctionReturn(0);
1228   }
1229   /* if a->a are the same */
1230   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 #undef __FUNCT__
1235 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1236 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1237 {
1238   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1239   PetscErrorCode ierr;
1240   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1241   PetscScalar    *x,zero = 0.0;
1242   MatScalar      *aa,*aa_j;
1243 
1244   PetscFunctionBegin;
1245   bs   = A->rmap->bs;
1246   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1247 
1248   aa   = a->a;
1249   ai   = a->i;
1250   aj   = a->j;
1251   ambs = a->mbs;
1252   bs2  = a->bs2;
1253 
1254   ierr = VecSet(v,zero);CHKERRQ(ierr);
1255   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1256   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1257   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1258   for (i=0; i<ambs; i++) {
1259     j=ai[i];
1260     if (aj[j] == i) {             /* if this is a diagonal element */
1261       row  = i*bs;
1262       aa_j = aa + j*bs2;
1263       if (A->factortype && bs==1){
1264         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1265       } else {
1266         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1267       }
1268     }
1269   }
1270 
1271   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 #undef __FUNCT__
1276 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1277 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1278 {
1279   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1280   PetscScalar    *l,x,*li,*ri;
1281   MatScalar      *aa,*v;
1282   PetscErrorCode ierr;
1283   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1284   PetscTruth     flg;
1285 
1286   PetscFunctionBegin;
1287   if (ll != rr){
1288     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1289     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1290   }
1291   if (!ll) PetscFunctionReturn(0);
1292   ai  = a->i;
1293   aj  = a->j;
1294   aa  = a->a;
1295   m   = A->rmap->N;
1296   bs  = A->rmap->bs;
1297   mbs = a->mbs;
1298   bs2 = a->bs2;
1299 
1300   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1301   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1302   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1303   for (i=0; i<mbs; i++) { /* for each block row */
1304     M  = ai[i+1] - ai[i];
1305     li = l + i*bs;
1306     v  = aa + bs2*ai[i];
1307     for (j=0; j<M; j++) { /* for each block */
1308       ri = l + bs*aj[ai[i]+j];
1309       for (k=0; k<bs; k++) {
1310         x = ri[k];
1311         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1312       }
1313     }
1314   }
1315   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1316   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 #undef __FUNCT__
1321 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1322 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1323 {
1324   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1325 
1326   PetscFunctionBegin;
1327   info->block_size     = a->bs2;
1328   info->nz_allocated   = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1329   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1330   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1331   info->assemblies   = A->num_ass;
1332   info->mallocs      = A->info.mallocs;
1333   info->memory       = ((PetscObject)A)->mem;
1334   if (A->factortype) {
1335     info->fill_ratio_given  = A->info.fill_ratio_given;
1336     info->fill_ratio_needed = A->info.fill_ratio_needed;
1337     info->factor_mallocs    = A->info.factor_mallocs;
1338   } else {
1339     info->fill_ratio_given  = 0;
1340     info->fill_ratio_needed = 0;
1341     info->factor_mallocs    = 0;
1342   }
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1349 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1350 {
1351   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1352   PetscErrorCode ierr;
1353 
1354   PetscFunctionBegin;
1355   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1356   PetscFunctionReturn(0);
1357 }
1358 
1359 #undef __FUNCT__
1360 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1361 /*
1362    This code does not work since it only checks the upper triangular part of
1363   the matrix. Hence it is not listed in the function table.
1364 */
1365 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1366 {
1367   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1368   PetscErrorCode ierr;
1369   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1370   PetscReal      atmp;
1371   MatScalar      *aa;
1372   PetscScalar    *x;
1373   PetscInt       ncols,brow,bcol,krow,kcol;
1374 
1375   PetscFunctionBegin;
1376   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1377   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1378   bs   = A->rmap->bs;
1379   aa   = a->a;
1380   ai   = a->i;
1381   aj   = a->j;
1382   mbs = a->mbs;
1383 
1384   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1385   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1386   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1387   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1388   for (i=0; i<mbs; i++) {
1389     ncols = ai[1] - ai[0]; ai++;
1390     brow  = bs*i;
1391     for (j=0; j<ncols; j++){
1392       bcol = bs*(*aj);
1393       for (kcol=0; kcol<bs; kcol++){
1394         col = bcol + kcol;      /* col index */
1395         for (krow=0; krow<bs; krow++){
1396           atmp = PetscAbsScalar(*aa); aa++;
1397           row = brow + krow;    /* row index */
1398           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1399           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1400         }
1401       }
1402       aj++;
1403     }
1404   }
1405   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408