xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision dedccee8f225f91da5577b2f35bba0c3cd8d7504)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "../src/mat/impls/sbaij/seq/sbaij.h"
7 #include "petscblaslapack.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   const PetscInt *idx;
17   PetscBT        table,table0;
18 
19   PetscFunctionBegin;
20   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21   mbs = a->mbs;
22   ai  = a->i;
23   aj  = a->j;
24   bs  = A->rmap->bs;
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103   const PetscInt *irow,*aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscBool      flag,sorted;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126       for (k=kstart; k<kend; k++) {
127         if (ssmap[aj[k]]) {
128           lens[i]++;
129         }
130       }
131     }
132   /* Create and fill new matrix */
133   if (scall == MAT_REUSE_MATRIX) {
134     c = (Mat_SeqSBAIJ *)((*B)->data);
135 
136     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
138     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
140     C = *B;
141   } else {
142     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
143     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
144     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
145     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
146   }
147   c = (Mat_SeqSBAIJ *)(C->data);
148   for (i=0; i<nrows; i++) {
149     row    = irow[i];
150     kstart = ai[row];
151     kend   = kstart + a->ilen[row];
152     mat_i  = c->i[i];
153     mat_j  = c->j + mat_i;
154     mat_a  = c->a + mat_i*bs2;
155     mat_ilen = c->ilen + i;
156     for (k=kstart; k<kend; k++) {
157       if ((tcol=ssmap[a->j[k]])) {
158         *mat_j++ = tcol - 1;
159         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
160         mat_a   += bs2;
161         (*mat_ilen)++;
162       }
163     }
164   }
165 
166   /* Free work space */
167   ierr = PetscFree(smap);CHKERRQ(ierr);
168   ierr = PetscFree(lens);CHKERRQ(ierr);
169   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171 
172   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
173   *B = C;
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
180 {
181   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
182   IS             is1;
183   PetscErrorCode ierr;
184   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
185   const PetscInt *irow;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
196   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
197   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
198 
199   count = 0;
200   for (i=0; i<a->mbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
202     if (vary[i]==bs) iary[count++] = i;
203   }
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(is1);CHKERRQ(ierr);
210   PetscFunctionReturn(0);
211 }
212 
213 #undef __FUNCT__
214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
216 {
217   PetscErrorCode ierr;
218   PetscInt       i;
219 
220   PetscFunctionBegin;
221   if (scall == MAT_INITIAL_MATRIX) {
222     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
223   }
224 
225   for (i=0; i<n; i++) {
226     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
227   }
228   PetscFunctionReturn(0);
229 }
230 
231 /* -------------------------------------------------------*/
232 /* Should check that shapes of vectors and matrices match */
233 /* -------------------------------------------------------*/
234 
235 #undef __FUNCT__
236 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
237 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
238 {
239   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
240   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
241   MatScalar      *v;
242   PetscErrorCode ierr;
243   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
244   PetscInt       nonzerorow=0;
245 
246   PetscFunctionBegin;
247   ierr = VecSet(zz,zero);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
250 
251   v     = a->a;
252   xb = x;
253 
254   for (i=0; i<mbs; i++) {
255     n  = ai[1] - ai[0]; /* length of i_th block row of A */
256     x1 = xb[0]; x2 = xb[1];
257     ib = aj + *ai;
258     jmin = 0;
259     nonzerorow += (n>0);
260     if (*ib == i){     /* (diag of A)*x */
261       z[2*i]   += v[0]*x1 + v[2]*x2;
262       z[2*i+1] += v[2]*x1 + v[3]*x2;
263       v += 4; jmin++;
264     }
265     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
267     for (j=jmin; j<n; j++) {
268       /* (strict lower triangular part of A)*x  */
269       cval       = ib[j]*2;
270       z[cval]     += v[0]*x1 + v[1]*x2;
271       z[cval+1]   += v[2]*x1 + v[3]*x2;
272       /* (strict upper triangular part of A)*x  */
273       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
274       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
275       v  += 4;
276     }
277     xb +=2; ai++;
278   }
279 
280   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
281   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
282   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
283   PetscFunctionReturn(0);
284 }
285 
286 #undef __FUNCT__
287 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
288 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
291   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
292   MatScalar      *v;
293   PetscErrorCode ierr;
294   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
295   PetscInt       nonzerorow=0;
296 
297   PetscFunctionBegin;
298   ierr = VecSet(zz,zero);CHKERRQ(ierr);
299   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
300   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
301 
302   v    = a->a;
303   xb   = x;
304 
305   for (i=0; i<mbs; i++) {
306     n  = ai[1] - ai[0]; /* length of i_th block row of A */
307     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
308     ib = aj + *ai;
309     jmin = 0;
310     nonzerorow += (n>0);
311     if (*ib == i){     /* (diag of A)*x */
312       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
313       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
314       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
315       v += 9; jmin++;
316     }
317     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
318     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
319     for (j=jmin; j<n; j++) {
320       /* (strict lower triangular part of A)*x  */
321       cval       = ib[j]*3;
322       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
323       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
324       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
325       /* (strict upper triangular part of A)*x  */
326       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
327       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
328       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
329       v  += 9;
330     }
331     xb +=3; ai++;
332   }
333 
334   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
335   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 #undef __FUNCT__
341 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
342 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
343 {
344   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
345   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
346   MatScalar      *v;
347   PetscErrorCode ierr;
348   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
349   PetscInt       nonzerorow=0;
350 
351   PetscFunctionBegin;
352   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
354   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
355 
356   v     = a->a;
357   xb = x;
358 
359   for (i=0; i<mbs; i++) {
360     n  = ai[1] - ai[0]; /* length of i_th block row of A */
361     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
362     ib = aj + *ai;
363     jmin = 0;
364     nonzerorow += (n>0);
365     if (*ib == i){     /* (diag of A)*x */
366       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
367       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
368       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
369       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
370       v += 16; jmin++;
371     }
372     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
374     for (j=jmin; j<n; j++) {
375       /* (strict lower triangular part of A)*x  */
376       cval       = ib[j]*4;
377       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
378       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
379       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
380       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
381       /* (strict upper triangular part of A)*x  */
382       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
383       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
384       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
385       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
386       v  += 16;
387     }
388     xb +=4; ai++;
389   }
390 
391   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
392   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
393   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
394   PetscFunctionReturn(0);
395 }
396 
397 #undef __FUNCT__
398 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
399 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
400 {
401   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
402   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
403   MatScalar      *v;
404   PetscErrorCode ierr;
405   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
406   PetscInt       nonzerorow=0;
407 
408   PetscFunctionBegin;
409   ierr = VecSet(zz,zero);CHKERRQ(ierr);
410   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
412 
413   v     = a->a;
414   xb = x;
415 
416   for (i=0; i<mbs; i++) {
417     n  = ai[1] - ai[0]; /* length of i_th block row of A */
418     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
419     ib = aj + *ai;
420     jmin = 0;
421     nonzerorow += (n>0);
422     if (*ib == i){      /* (diag of A)*x */
423       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
424       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
425       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
426       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
427       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
428       v += 25; jmin++;
429     }
430     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
431     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
432     for (j=jmin; j<n; j++) {
433       /* (strict lower triangular part of A)*x  */
434       cval       = ib[j]*5;
435       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
436       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
437       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
438       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
439       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
440       /* (strict upper triangular part of A)*x  */
441       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
442       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
443       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
444       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
445       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
446       v  += 25;
447     }
448     xb +=5; ai++;
449   }
450 
451   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
452   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
453   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
454   PetscFunctionReturn(0);
455 }
456 
457 
458 #undef __FUNCT__
459 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
460 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
461 {
462   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
463   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
464   MatScalar      *v;
465   PetscErrorCode ierr;
466   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
467   PetscInt       nonzerorow=0;
468 
469   PetscFunctionBegin;
470   ierr = VecSet(zz,zero);CHKERRQ(ierr);
471   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
472   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
473 
474   v     = a->a;
475   xb = x;
476 
477   for (i=0; i<mbs; i++) {
478     n  = ai[1] - ai[0]; /* length of i_th block row of A */
479     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
480     ib = aj + *ai;
481     jmin = 0;
482     nonzerorow += (n>0);
483     if (*ib == i){      /* (diag of A)*x */
484       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
485       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
486       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
487       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
488       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
489       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
490       v += 36; jmin++;
491     }
492     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
493     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
494     for (j=jmin; j<n; j++) {
495       /* (strict lower triangular part of A)*x  */
496       cval       = ib[j]*6;
497       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
498       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
499       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
500       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
501       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
502       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
503       /* (strict upper triangular part of A)*x  */
504       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
505       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
506       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
507       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
508       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
509       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
510       v  += 36;
511     }
512     xb +=6; ai++;
513   }
514 
515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
516   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
517   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
518   PetscFunctionReturn(0);
519 }
520 #undef __FUNCT__
521 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
522 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
523 {
524   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
525   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
526   MatScalar      *v;
527   PetscErrorCode ierr;
528   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
529   PetscInt       nonzerorow=0;
530 
531   PetscFunctionBegin;
532   ierr = VecSet(zz,zero);CHKERRQ(ierr);
533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
534   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
535 
536   v     = a->a;
537   xb = x;
538 
539   for (i=0; i<mbs; i++) {
540     n  = ai[1] - ai[0]; /* length of i_th block row of A */
541     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
542     ib = aj + *ai;
543     jmin = 0;
544     nonzerorow += (n>0);
545     if (*ib == i){      /* (diag of A)*x */
546       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
547       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
548       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
549       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
550       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
551       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
552       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
553       v += 49; jmin++;
554     }
555     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
556     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
557     for (j=jmin; j<n; j++) {
558       /* (strict lower triangular part of A)*x  */
559       cval       = ib[j]*7;
560       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
561       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
562       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
563       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
564       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
565       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
566       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
567       /* (strict upper triangular part of A)*x  */
568       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
569       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
570       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
571       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
572       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
573       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
574       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
575       v  += 49;
576     }
577     xb +=7; ai++;
578   }
579   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
581   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 
585 /*
586     This will not work with MatScalar == float because it calls the BLAS
587 */
588 #undef __FUNCT__
589 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
590 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
591 {
592   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
593   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
594   MatScalar      *v;
595   PetscErrorCode ierr;
596   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
597   PetscInt       nonzerorow=0;
598 
599   PetscFunctionBegin;
600   ierr = VecSet(zz,zero);CHKERRQ(ierr);
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
602   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
603 
604   aj   = a->j;
605   v    = a->a;
606   ii   = a->i;
607 
608   if (!a->mult_work) {
609     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
610   }
611   work = a->mult_work;
612 
613   for (i=0; i<mbs; i++) {
614     n     = ii[1] - ii[0]; ncols = n*bs;
615     workt = work; idx=aj+ii[0];
616     nonzerorow += (n>0);
617 
618     /* upper triangular part */
619     for (j=0; j<n; j++) {
620       xb = x_ptr + bs*(*idx++);
621       for (k=0; k<bs; k++) workt[k] = xb[k];
622       workt += bs;
623     }
624     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
625     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
626 
627     /* strict lower triangular part */
628     idx = aj+ii[0];
629     if (*idx == i){
630       ncols -= bs; v += bs2; idx++; n--;
631     }
632 
633     if (ncols > 0){
634       workt = work;
635       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
636       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
637       for (j=0; j<n; j++) {
638         zb = z_ptr + bs*(*idx++);
639         for (k=0; k<bs; k++) zb[k] += workt[k] ;
640         workt += bs;
641       }
642     }
643     x += bs; v += n*bs2; z += bs; ii++;
644   }
645 
646   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
647   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
648   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
649   PetscFunctionReturn(0);
650 }
651 
652 #undef __FUNCT__
653 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
654 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
655 {
656   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
657   PetscScalar    *x,*z,*xb,x1;
658   MatScalar      *v;
659   PetscErrorCode ierr;
660   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
661   PetscInt       nonzerorow=0;
662 
663   PetscFunctionBegin;
664   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
665   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
666   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
667   v  = a->a;
668   xb = x;
669 
670   for (i=0; i<mbs; i++) {
671     n  = ai[1] - ai[0];  /* length of i_th row of A */
672     x1 = xb[0];
673     ib = aj + *ai;
674     jmin = 0;
675     nonzerorow += (n>0);
676     if (*ib == i) {            /* (diag of A)*x */
677       z[i] += *v++ * x[*ib++]; jmin++;
678     }
679     for (j=jmin; j<n; j++) {
680       cval    = *ib;
681       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
682       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
683     }
684     xb++; ai++;
685   }
686 
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
689 
690   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
691   PetscFunctionReturn(0);
692 }
693 
694 #undef __FUNCT__
695 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
696 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
697 {
698   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
699   PetscScalar    *x,*z,*xb,x1,x2;
700   MatScalar      *v;
701   PetscErrorCode ierr;
702   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
703   PetscInt       nonzerorow=0;
704 
705   PetscFunctionBegin;
706   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
708   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
709 
710   v  = a->a;
711   xb = x;
712 
713   for (i=0; i<mbs; i++) {
714     n  = ai[1] - ai[0]; /* length of i_th block row of A */
715     x1 = xb[0]; x2 = xb[1];
716     ib = aj + *ai;
717     jmin = 0;
718     nonzerorow += (n>0);
719     if (*ib == i){      /* (diag of A)*x */
720       z[2*i]   += v[0]*x1 + v[2]*x2;
721       z[2*i+1] += v[2]*x1 + v[3]*x2;
722       v += 4; jmin++;
723     }
724     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
725     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
726     for (j=jmin; j<n; j++) {
727       /* (strict lower triangular part of A)*x  */
728       cval       = ib[j]*2;
729       z[cval]     += v[0]*x1 + v[1]*x2;
730       z[cval+1]   += v[2]*x1 + v[3]*x2;
731       /* (strict upper triangular part of A)*x  */
732       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
733       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
734       v  += 4;
735     }
736     xb +=2; ai++;
737   }
738   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
739   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
740 
741   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
742   PetscFunctionReturn(0);
743 }
744 
745 #undef __FUNCT__
746 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
747 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
748 {
749   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
750   PetscScalar    *x,*z,*xb,x1,x2,x3;
751   MatScalar      *v;
752   PetscErrorCode ierr;
753   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
754   PetscInt       nonzerorow=0;
755 
756   PetscFunctionBegin;
757   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
758   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
759   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
760 
761   v     = a->a;
762   xb = x;
763 
764   for (i=0; i<mbs; i++) {
765     n  = ai[1] - ai[0]; /* length of i_th block row of A */
766     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
767     ib = aj + *ai;
768     jmin = 0;
769     nonzerorow += (n>0);
770     if (*ib == i){     /* (diag of A)*x */
771      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
772      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
773      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
774      v += 9; jmin++;
775     }
776     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
777     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
778     for (j=jmin; j<n; j++) {
779       /* (strict lower triangular part of A)*x  */
780       cval       = ib[j]*3;
781       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
782       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
783       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
784       /* (strict upper triangular part of A)*x  */
785       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
786       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
787       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
788       v  += 9;
789     }
790     xb +=3; ai++;
791   }
792 
793   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
794   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
795 
796   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
797   PetscFunctionReturn(0);
798 }
799 
800 #undef __FUNCT__
801 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
802 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
803 {
804   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
805   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
806   MatScalar      *v;
807   PetscErrorCode ierr;
808   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
809   PetscInt       nonzerorow=0;
810 
811   PetscFunctionBegin;
812   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
815 
816   v     = a->a;
817   xb = x;
818 
819   for (i=0; i<mbs; i++) {
820     n  = ai[1] - ai[0]; /* length of i_th block row of A */
821     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
822     ib = aj + *ai;
823     jmin = 0;
824     nonzerorow += (n>0);
825     if (*ib == i){      /* (diag of A)*x */
826       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
827       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
828       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
829       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
830       v += 16; jmin++;
831     }
832     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
833     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
834     for (j=jmin; j<n; j++) {
835       /* (strict lower triangular part of A)*x  */
836       cval       = ib[j]*4;
837       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
838       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
839       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
840       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
841       /* (strict upper triangular part of A)*x  */
842       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
843       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
844       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
845       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
846       v  += 16;
847     }
848     xb +=4; ai++;
849   }
850 
851   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
852   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
853 
854   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
855   PetscFunctionReturn(0);
856 }
857 
858 #undef __FUNCT__
859 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
860 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
861 {
862   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
863   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
864   MatScalar      *v;
865   PetscErrorCode ierr;
866   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
867   PetscInt       nonzerorow=0;
868 
869   PetscFunctionBegin;
870   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
871   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
872   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
873 
874   v     = a->a;
875   xb = x;
876 
877   for (i=0; i<mbs; i++) {
878     n  = ai[1] - ai[0]; /* length of i_th block row of A */
879     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
880     ib = aj + *ai;
881     jmin = 0;
882     nonzerorow += (n>0);
883     if (*ib == i){      /* (diag of A)*x */
884       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
885       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
886       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
887       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
888       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
889       v += 25; jmin++;
890     }
891     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
892     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
893     for (j=jmin; j<n; j++) {
894       /* (strict lower triangular part of A)*x  */
895       cval       = ib[j]*5;
896       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
897       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
898       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
899       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
900       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
901       /* (strict upper triangular part of A)*x  */
902       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
903       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
904       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
905       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
906       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
907       v  += 25;
908     }
909     xb +=5; ai++;
910   }
911 
912   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
913   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
914 
915   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
916   PetscFunctionReturn(0);
917 }
918 #undef __FUNCT__
919 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
920 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
921 {
922   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
923   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
924   MatScalar      *v;
925   PetscErrorCode ierr;
926   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
927   PetscInt       nonzerorow=0;
928 
929   PetscFunctionBegin;
930   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
931   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
932   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
933 
934   v     = a->a;
935   xb = x;
936 
937   for (i=0; i<mbs; i++) {
938     n  = ai[1] - ai[0]; /* length of i_th block row of A */
939     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
940     ib = aj + *ai;
941     jmin = 0;
942     nonzerorow += (n>0);
943     if (*ib == i){     /* (diag of A)*x */
944       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
945       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
946       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
947       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
948       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
949       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
950       v += 36; jmin++;
951     }
952     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
953     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
954     for (j=jmin; j<n; j++) {
955       /* (strict lower triangular part of A)*x  */
956       cval       = ib[j]*6;
957       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
958       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
959       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
960       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
961       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
962       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
963       /* (strict upper triangular part of A)*x  */
964       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
965       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
966       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
967       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
968       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
969       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
970       v  += 36;
971     }
972     xb +=6; ai++;
973   }
974 
975   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
976   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
977 
978   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
979   PetscFunctionReturn(0);
980 }
981 
982 #undef __FUNCT__
983 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
984 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
985 {
986   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
987   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
988   MatScalar      *v;
989   PetscErrorCode ierr;
990   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
991   PetscInt       nonzerorow=0;
992 
993   PetscFunctionBegin;
994   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
995   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
996   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
997 
998   v     = a->a;
999   xb = x;
1000 
1001   for (i=0; i<mbs; i++) {
1002     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1003     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1004     ib = aj + *ai;
1005     jmin = 0;
1006     nonzerorow += (n>0);
1007     if (*ib == i){     /* (diag of A)*x */
1008       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1009       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1010       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1011       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1012       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1013       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1014       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1015       v += 49; jmin++;
1016     }
1017     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019     for (j=jmin; j<n; j++) {
1020       /* (strict lower triangular part of A)*x  */
1021       cval       = ib[j]*7;
1022       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1023       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1024       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1025       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1026       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1027       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1028       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1029       /* (strict upper triangular part of A)*x  */
1030       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1031       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1032       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1033       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1034       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1035       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1036       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1037       v  += 49;
1038     }
1039     xb +=7; ai++;
1040   }
1041 
1042   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1043   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1044 
1045   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1046   PetscFunctionReturn(0);
1047 }
1048 
1049 #undef __FUNCT__
1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1051 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1052 {
1053   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1054   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1055   MatScalar      *v;
1056   PetscErrorCode ierr;
1057   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1058   PetscInt       nonzerorow=0;
1059 
1060   PetscFunctionBegin;
1061   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1062   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1063   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1064 
1065   aj   = a->j;
1066   v    = a->a;
1067   ii   = a->i;
1068 
1069   if (!a->mult_work) {
1070     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1071   }
1072   work = a->mult_work;
1073 
1074 
1075   for (i=0; i<mbs; i++) {
1076     n     = ii[1] - ii[0]; ncols = n*bs;
1077     workt = work; idx=aj+ii[0];
1078     nonzerorow += (n>0);
1079 
1080     /* upper triangular part */
1081     for (j=0; j<n; j++) {
1082       xb = x_ptr + bs*(*idx++);
1083       for (k=0; k<bs; k++) workt[k] = xb[k];
1084       workt += bs;
1085     }
1086     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1087     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1088 
1089     /* strict lower triangular part */
1090     idx = aj+ii[0];
1091     if (*idx == i){
1092       ncols -= bs; v += bs2; idx++; n--;
1093     }
1094     if (ncols > 0){
1095       workt = work;
1096       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1097       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1098       for (j=0; j<n; j++) {
1099         zb = z_ptr + bs*(*idx++);
1100         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1101         workt += bs;
1102       }
1103     }
1104 
1105     x += bs; v += n*bs2; z += bs; ii++;
1106   }
1107 
1108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1109   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1110 
1111   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 #undef __FUNCT__
1116 #define __FUNCT__ "MatScale_SeqSBAIJ"
1117 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1118 {
1119   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1120   PetscScalar    oalpha = alpha;
1121   PetscErrorCode ierr;
1122   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1123 
1124   PetscFunctionBegin;
1125   BLASscal_(&totalnz,&oalpha,a->a,&one);
1126   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1127   PetscFunctionReturn(0);
1128 }
1129 
1130 #undef __FUNCT__
1131 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1132 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1133 {
1134   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1135   MatScalar      *v = a->a;
1136   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1137   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1138   PetscErrorCode ierr;
1139   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1140 
1141   PetscFunctionBegin;
1142   if (type == NORM_FROBENIUS) {
1143     for (k=0; k<mbs; k++){
1144       jmin = a->i[k]; jmax = a->i[k+1];
1145       col  = aj + jmin;
1146       if (*col == k){         /* diagonal block */
1147         for (i=0; i<bs2; i++){
1148 #if defined(PETSC_USE_COMPLEX)
1149           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1150 #else
1151           sum_diag += (*v)*(*v); v++;
1152 #endif
1153         }
1154         jmin++;
1155       }
1156       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1157         for (i=0; i<bs2; i++){
1158 #if defined(PETSC_USE_COMPLEX)
1159           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1160 #else
1161           sum_off += (*v)*(*v); v++;
1162 #endif
1163         }
1164       }
1165     }
1166     *norm = sqrt(sum_diag + 2*sum_off);
1167   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1168     ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1169     for (i=0; i<mbs; i++) jl[i] = mbs;
1170     il[0] = 0;
1171 
1172     *norm = 0.0;
1173     for (k=0; k<mbs; k++) { /* k_th block row */
1174       for (j=0; j<bs; j++) sum[j]=0.0;
1175       /*-- col sum --*/
1176       i = jl[k]; /* first |A(i,k)| to be added */
1177       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1178                   at step k */
1179       while (i<mbs){
1180         nexti = jl[i];  /* next block row to be added */
1181         ik    = il[i];  /* block index of A(i,k) in the array a */
1182         for (j=0; j<bs; j++){
1183           v = a->a + ik*bs2 + j*bs;
1184           for (k1=0; k1<bs; k1++) {
1185             sum[j] += PetscAbsScalar(*v); v++;
1186           }
1187         }
1188         /* update il, jl */
1189         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1190         jmax = a->i[i+1];
1191         if (jmin < jmax){
1192           il[i] = jmin;
1193           j   = a->j[jmin];
1194           jl[i] = jl[j]; jl[j]=i;
1195         }
1196         i = nexti;
1197       }
1198       /*-- row sum --*/
1199       jmin = a->i[k]; jmax = a->i[k+1];
1200       for (i=jmin; i<jmax; i++) {
1201         for (j=0; j<bs; j++){
1202           v = a->a + i*bs2 + j;
1203           for (k1=0; k1<bs; k1++){
1204             sum[j] += PetscAbsScalar(*v); v += bs;
1205           }
1206         }
1207       }
1208       /* add k_th block row to il, jl */
1209       col = aj+jmin;
1210       if (*col == k) jmin++;
1211       if (jmin < jmax){
1212         il[k] = jmin;
1213         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1214       }
1215       for (j=0; j<bs; j++){
1216         if (sum[j] > *norm) *norm = sum[j];
1217       }
1218     }
1219     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1220   } else {
1221     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1222   }
1223   PetscFunctionReturn(0);
1224 }
1225 
1226 #undef __FUNCT__
1227 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1228 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1229 {
1230   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1231   PetscErrorCode ierr;
1232 
1233   PetscFunctionBegin;
1234 
1235   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1236   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1237     *flg = PETSC_FALSE;
1238     PetscFunctionReturn(0);
1239   }
1240 
1241   /* if the a->i are the same */
1242   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1243   if (!*flg) {
1244     PetscFunctionReturn(0);
1245   }
1246 
1247   /* if a->j are the same */
1248   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1249   if (!*flg) {
1250     PetscFunctionReturn(0);
1251   }
1252   /* if a->a are the same */
1253   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #undef __FUNCT__
1258 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1259 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1260 {
1261   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1262   PetscErrorCode ierr;
1263   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1264   PetscScalar    *x,zero = 0.0;
1265   MatScalar      *aa,*aa_j;
1266 
1267   PetscFunctionBegin;
1268   bs   = A->rmap->bs;
1269   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1270 
1271   aa   = a->a;
1272   ai   = a->i;
1273   aj   = a->j;
1274   ambs = a->mbs;
1275   bs2  = a->bs2;
1276 
1277   ierr = VecSet(v,zero);CHKERRQ(ierr);
1278   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1279   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1280   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1281   for (i=0; i<ambs; i++) {
1282     j=ai[i];
1283     if (aj[j] == i) {             /* if this is a diagonal element */
1284       row  = i*bs;
1285       aa_j = aa + j*bs2;
1286       if (A->factortype && bs==1){
1287         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1288       } else {
1289         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1290       }
1291     }
1292   }
1293 
1294   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 #undef __FUNCT__
1299 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1300 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1301 {
1302   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1303   PetscScalar    *l,x,*li,*ri;
1304   MatScalar      *aa,*v;
1305   PetscErrorCode ierr;
1306   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1307   PetscBool      flg;
1308 
1309   PetscFunctionBegin;
1310   if (ll != rr){
1311     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1312     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1313   }
1314   if (!ll) PetscFunctionReturn(0);
1315   ai  = a->i;
1316   aj  = a->j;
1317   aa  = a->a;
1318   m   = A->rmap->N;
1319   bs  = A->rmap->bs;
1320   mbs = a->mbs;
1321   bs2 = a->bs2;
1322 
1323   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1324   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1325   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1326   for (i=0; i<mbs; i++) { /* for each block row */
1327     M  = ai[i+1] - ai[i];
1328     li = l + i*bs;
1329     v  = aa + bs2*ai[i];
1330     for (j=0; j<M; j++) { /* for each block */
1331       ri = l + bs*aj[ai[i]+j];
1332       for (k=0; k<bs; k++) {
1333         x = ri[k];
1334         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1335       }
1336     }
1337   }
1338   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1339   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1340   PetscFunctionReturn(0);
1341 }
1342 
1343 #undef __FUNCT__
1344 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1345 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1346 {
1347   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1348 
1349   PetscFunctionBegin;
1350   info->block_size     = a->bs2;
1351   info->nz_allocated   = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1352   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1353   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1354   info->assemblies   = A->num_ass;
1355   info->mallocs      = A->info.mallocs;
1356   info->memory       = ((PetscObject)A)->mem;
1357   if (A->factortype) {
1358     info->fill_ratio_given  = A->info.fill_ratio_given;
1359     info->fill_ratio_needed = A->info.fill_ratio_needed;
1360     info->factor_mallocs    = A->info.factor_mallocs;
1361   } else {
1362     info->fill_ratio_given  = 0;
1363     info->fill_ratio_needed = 0;
1364     info->factor_mallocs    = 0;
1365   }
1366   PetscFunctionReturn(0);
1367 }
1368 
1369 
1370 #undef __FUNCT__
1371 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1372 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1373 {
1374   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1375   PetscErrorCode ierr;
1376 
1377   PetscFunctionBegin;
1378   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 #undef __FUNCT__
1383 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1384 /*
1385    This code does not work since it only checks the upper triangular part of
1386   the matrix. Hence it is not listed in the function table.
1387 */
1388 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1389 {
1390   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1391   PetscErrorCode ierr;
1392   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1393   PetscReal      atmp;
1394   MatScalar      *aa;
1395   PetscScalar    *x;
1396   PetscInt       ncols,brow,bcol,krow,kcol;
1397 
1398   PetscFunctionBegin;
1399   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1400   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1401   bs   = A->rmap->bs;
1402   aa   = a->a;
1403   ai   = a->i;
1404   aj   = a->j;
1405   mbs = a->mbs;
1406 
1407   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1408   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1409   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1410   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1411   for (i=0; i<mbs; i++) {
1412     ncols = ai[1] - ai[0]; ai++;
1413     brow  = bs*i;
1414     for (j=0; j<ncols; j++){
1415       bcol = bs*(*aj);
1416       for (kcol=0; kcol<bs; kcol++){
1417         col = bcol + kcol;      /* col index */
1418         for (krow=0; krow<bs; krow++){
1419           atmp = PetscAbsScalar(*aa); aa++;
1420           row = brow + krow;    /* row index */
1421           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1422           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1423         }
1424       }
1425       aj++;
1426     }
1427   }
1428   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1429   PetscFunctionReturn(0);
1430 }
1431