xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 8f46ffca725d709c802a056bffe944fc7c220922)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) //rm
98 {
99   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
100   PetscErrorCode  ierr;
101   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
103   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104   const PetscInt  *irow,*aj = a->j,*ai = a->i;
105   MatScalar       *mat_a;
106   Mat             C;
107   PetscBool       flag,sorted;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); //rm!!!
111   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); //rm!!!
112   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); //rm!!!
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   ssmap = smap;
120   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127     for (k=kstart; k<kend; k++) {
128       if (ssmap[aj[k]]) lens[i]++;
129     }
130   }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ*)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
138     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
139     C    = *B;
140   } else {
141     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
142     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
143     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
144     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
145   }
146   c = (Mat_SeqSBAIJ*)(C->data);
147   for (i=0; i<nrows; i++) {
148     row      = irow[i];
149     kstart   = ai[row];
150     kend     = kstart + a->ilen[row];
151     mat_i    = c->i[i];
152     mat_j    = c->j + mat_i;
153     mat_a    = c->a + mat_i*bs2;
154     mat_ilen = c->ilen + i;
155     for (k=kstart; k<kend; k++) {
156       if ((tcol=ssmap[a->j[k]])) {
157         *mat_j++ = tcol - 1;
158         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
159         mat_a   += bs2;
160         (*mat_ilen)++;
161       }
162     }
163   }
164 
165   /* Free work space */
166   ierr = PetscFree(smap);CHKERRQ(ierr);
167   ierr = PetscFree(lens);CHKERRQ(ierr);
168   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170 
171   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
172   *B   = C;
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
179 {
180   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
181   IS             is1;
182   PetscErrorCode ierr;
183   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
184   const PetscInt *irow;
185 
186   PetscFunctionBegin;
187   if (isrow != iscol) {
188     PetscBool isequal;
189     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
190     if (!isequal)
191       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
192   }
193 
194   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); //isrow == iscol, so delete below!!!
195   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
196 
197   /* Verify if the indices corespond to each element in a block
198    and form the IS with compressed IS -- also sort it!!! */
199   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
200   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
201   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
202 
203   count = 0;
204   for (i=0; i<a->mbs; i++) {
205     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
206     if (vary[i]==bs) iary[count++] = i;
207   }
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
210   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
211 
212   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); //remove a is1!!!
213   ierr = ISDestroy(&is1);CHKERRQ(ierr);
214   PetscFunctionReturn(0);
215 }
216 
217 #undef __FUNCT__
218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
220 {
221   PetscErrorCode ierr;
222   PetscInt       i;
223 
224   PetscFunctionBegin;
225   if (scall == MAT_INITIAL_MATRIX) {
226     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
227   }
228 
229   for (i=0; i<n; i++) {
230     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
231   }
232   PetscFunctionReturn(0);
233 }
234 
235 /* -------------------------------------------------------*/
236 /* Should check that shapes of vectors and matrices match */
237 /* -------------------------------------------------------*/
238 
239 #undef __FUNCT__
240 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
241 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
242 {
243   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
244   PetscScalar       *z,x1,x2,zero=0.0;
245   const PetscScalar *x,*xb;
246   const MatScalar   *v;
247   PetscErrorCode    ierr;
248   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
249   const PetscInt    *aj=a->j,*ai=a->i,*ib;
250   PetscInt          nonzerorow=0;
251 
252   PetscFunctionBegin;
253   ierr = VecSet(zz,zero);CHKERRQ(ierr);
254   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
255   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
256 
257   v  = a->a;
258   xb = x;
259 
260   for (i=0; i<mbs; i++) {
261     n           = ai[1] - ai[0]; /* length of i_th block row of A */
262     x1          = xb[0]; x2 = xb[1];
263     ib          = aj + *ai;
264     jmin        = 0;
265     nonzerorow += (n>0);
266     if (*ib == i) {     /* (diag of A)*x */
267       z[2*i]   += v[0]*x1 + v[2]*x2;
268       z[2*i+1] += v[2]*x1 + v[3]*x2;
269       v        += 4; jmin++;
270     }
271     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
272     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
273     for (j=jmin; j<n; j++) {
274       /* (strict lower triangular part of A)*x  */
275       cval       = ib[j]*2;
276       z[cval]   += v[0]*x1 + v[1]*x2;
277       z[cval+1] += v[2]*x1 + v[3]*x2;
278       /* (strict upper triangular part of A)*x  */
279       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
280       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
281       v        += 4;
282     }
283     xb +=2; ai++;
284   }
285 
286   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
287   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
288   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 #undef __FUNCT__
293 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
294 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
295 {
296   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
297   PetscScalar       *z,x1,x2,x3,zero=0.0;
298   const PetscScalar *x,*xb;
299   const MatScalar   *v;
300   PetscErrorCode    ierr;
301   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
302   const PetscInt    *aj = a->j,*ai = a->i,*ib;
303   PetscInt          nonzerorow=0;
304 
305   PetscFunctionBegin;
306   ierr = VecSet(zz,zero);CHKERRQ(ierr);
307   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
309 
310   v  = a->a;
311   xb = x;
312 
313   for (i=0; i<mbs; i++) {
314     n           = ai[1] - ai[0]; /* length of i_th block row of A */
315     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
316     ib          = aj + *ai;
317     jmin        = 0;
318     nonzerorow += (n>0);
319     if (*ib == i) {     /* (diag of A)*x */
320       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
321       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
322       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
323       v        += 9; jmin++;
324     }
325     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
326     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
327     for (j=jmin; j<n; j++) {
328       /* (strict lower triangular part of A)*x  */
329       cval       = ib[j]*3;
330       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
331       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
332       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
333       /* (strict upper triangular part of A)*x  */
334       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
335       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
336       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
337       v        += 9;
338     }
339     xb +=3; ai++;
340   }
341 
342   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
350 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
351 {
352   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
353   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
354   const PetscScalar *x,*xb;
355   const MatScalar   *v;
356   PetscErrorCode    ierr;
357   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
358   const PetscInt    *aj = a->j,*ai = a->i,*ib;
359   PetscInt          nonzerorow = 0;
360 
361   PetscFunctionBegin;
362   ierr = VecSet(zz,zero);CHKERRQ(ierr);
363   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
364   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
365 
366   v  = a->a;
367   xb = x;
368 
369   for (i=0; i<mbs; i++) {
370     n           = ai[1] - ai[0]; /* length of i_th block row of A */
371     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
372     ib          = aj + *ai;
373     jmin        = 0;
374     nonzerorow += (n>0);
375     if (*ib == i) {     /* (diag of A)*x */
376       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
377       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
378       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
379       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
380       v        += 16; jmin++;
381     }
382     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
383     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
384     for (j=jmin; j<n; j++) {
385       /* (strict lower triangular part of A)*x  */
386       cval       = ib[j]*4;
387       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
388       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
389       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
390       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
391       /* (strict upper triangular part of A)*x  */
392       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
393       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
394       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
395       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
396       v        += 16;
397     }
398     xb +=4; ai++;
399   }
400 
401   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
402   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
403   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
404   PetscFunctionReturn(0);
405 }
406 
407 #undef __FUNCT__
408 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
409 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
410 {
411   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
412   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
413   const PetscScalar *x,*xb;
414   const MatScalar   *v;
415   PetscErrorCode    ierr;
416   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
417   const PetscInt    *aj = a->j,*ai = a->i,*ib;
418   PetscInt          nonzerorow=0;
419 
420   PetscFunctionBegin;
421   ierr = VecSet(zz,zero);CHKERRQ(ierr);
422   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
423   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
424 
425   v  = a->a;
426   xb = x;
427 
428   for (i=0; i<mbs; i++) {
429     n           = ai[1] - ai[0]; /* length of i_th block row of A */
430     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
431     ib          = aj + *ai;
432     jmin        = 0;
433     nonzerorow += (n>0);
434     if (*ib == i) {      /* (diag of A)*x */
435       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
436       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
437       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
438       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
439       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
440       v        += 25; jmin++;
441     }
442     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
443     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
444     for (j=jmin; j<n; j++) {
445       /* (strict lower triangular part of A)*x  */
446       cval       = ib[j]*5;
447       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
448       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
449       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
450       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
451       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
452       /* (strict upper triangular part of A)*x  */
453       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
454       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
455       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
456       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
457       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
458       v        += 25;
459     }
460     xb +=5; ai++;
461   }
462 
463   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
464   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
465   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
466   PetscFunctionReturn(0);
467 }
468 
469 
470 #undef __FUNCT__
471 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
472 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
473 {
474   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
475   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
476   const PetscScalar *x,*xb;
477   const MatScalar   *v;
478   PetscErrorCode    ierr;
479   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
480   const PetscInt    *aj=a->j,*ai=a->i,*ib;
481   PetscInt          nonzerorow=0;
482 
483   PetscFunctionBegin;
484   ierr = VecSet(zz,zero);CHKERRQ(ierr);
485   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
486   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
487 
488   v  = a->a;
489   xb = x;
490 
491   for (i=0; i<mbs; i++) {
492     n           = ai[1] - ai[0]; /* length of i_th block row of A */
493     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
494     ib          = aj + *ai;
495     jmin        = 0;
496     nonzerorow += (n>0);
497     if (*ib == i) {      /* (diag of A)*x */
498       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
499       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
500       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
501       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
502       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
503       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
504       v        += 36; jmin++;
505     }
506     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
507     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
508     for (j=jmin; j<n; j++) {
509       /* (strict lower triangular part of A)*x  */
510       cval       = ib[j]*6;
511       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
512       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
513       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
514       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
515       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
516       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
517       /* (strict upper triangular part of A)*x  */
518       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
519       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
520       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
521       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
522       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
523       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
524       v        += 36;
525     }
526     xb +=6; ai++;
527   }
528 
529   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
531   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
532   PetscFunctionReturn(0);
533 }
534 #undef __FUNCT__
535 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
536 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
537 {
538   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
539   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
540   const PetscScalar *x,*xb;
541   const MatScalar   *v;
542   PetscErrorCode    ierr;
543   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
544   const PetscInt    *aj=a->j,*ai=a->i,*ib;
545   PetscInt          nonzerorow=0;
546 
547   PetscFunctionBegin;
548   ierr = VecSet(zz,zero);CHKERRQ(ierr);
549   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
550   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
551 
552   v  = a->a;
553   xb = x;
554 
555   for (i=0; i<mbs; i++) {
556     n           = ai[1] - ai[0]; /* length of i_th block row of A */
557     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
558     ib          = aj + *ai;
559     jmin        = 0;
560     nonzerorow += (n>0);
561     if (*ib == i) {      /* (diag of A)*x */
562       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
563       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
564       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
565       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
566       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
567       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
568       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569       v        += 49; jmin++;
570     }
571     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
572     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
573     for (j=jmin; j<n; j++) {
574       /* (strict lower triangular part of A)*x  */
575       cval       = ib[j]*7;
576       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
577       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
578       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
579       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
580       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
581       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
582       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
583       /* (strict upper triangular part of A)*x  */
584       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
585       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
586       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
587       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
588       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
589       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
590       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
591       v       += 49;
592     }
593     xb +=7; ai++;
594   }
595   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
596   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
597   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
598   PetscFunctionReturn(0);
599 }
600 
601 /*
602     This will not work with MatScalar == float because it calls the BLAS
603 */
604 #undef __FUNCT__
605 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
606 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
607 {
608   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
609   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
610   const PetscScalar *x,*x_ptr,*xb;
611   const MatScalar   *v;
612   PetscErrorCode    ierr;
613   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
614   const PetscInt    *idx,*aj,*ii;
615   PetscInt          nonzerorow=0;
616 
617   PetscFunctionBegin;
618   ierr = VecSet(zz,zero);CHKERRQ(ierr);
619   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
620   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
621 
622   aj = a->j;
623   v  = a->a;
624   ii = a->i;
625 
626   if (!a->mult_work) {
627     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
628   }
629   work = a->mult_work;
630 
631   for (i=0; i<mbs; i++) {
632     n           = ii[1] - ii[0]; ncols = n*bs;
633     workt       = work; idx=aj+ii[0];
634     nonzerorow += (n>0);
635 
636     /* upper triangular part */
637     for (j=0; j<n; j++) {
638       xb = x_ptr + bs*(*idx++);
639       for (k=0; k<bs; k++) workt[k] = xb[k];
640       workt += bs;
641     }
642     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
643     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
644 
645     /* strict lower triangular part */
646     idx = aj+ii[0];
647     if (*idx == i) {
648       ncols -= bs; v += bs2; idx++; n--;
649     }
650 
651     if (ncols > 0) {
652       workt = work;
653       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
654       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
655       for (j=0; j<n; j++) {
656         zb = z_ptr + bs*(*idx++);
657         for (k=0; k<bs; k++) zb[k] += workt[k];
658         workt += bs;
659       }
660     }
661     x += bs; v += n*bs2; z += bs; ii++;
662   }
663 
664   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
665   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
666   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
667   PetscFunctionReturn(0);
668 }
669 
670 #undef __FUNCT__
671 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
672 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
673 {
674   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
675   PetscScalar       *z,x1;
676   const PetscScalar *x,*xb;
677   const MatScalar   *v;
678   PetscErrorCode    ierr;
679   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
680   const PetscInt    *aj=a->j,*ai=a->i,*ib;
681   PetscInt          nonzerorow=0;
682 
683   PetscFunctionBegin;
684   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
685   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
687   v    = a->a;
688   xb   = x;
689 
690   for (i=0; i<mbs; i++) {
691     n           = ai[1] - ai[0]; /* length of i_th row of A */
692     x1          = xb[0];
693     ib          = aj + *ai;
694     jmin        = 0;
695     nonzerorow += (n>0);
696     if (*ib == i) {            /* (diag of A)*x */
697       z[i] += *v++ * x[*ib++]; jmin++;
698     }
699     for (j=jmin; j<n; j++) {
700       cval    = *ib;
701       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
702       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
703     }
704     xb++; ai++;
705   }
706 
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
709 
710   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
711   PetscFunctionReturn(0);
712 }
713 
714 #undef __FUNCT__
715 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
716 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
717 {
718   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
719   PetscScalar       *z,x1,x2;
720   const PetscScalar *x,*xb;
721   const MatScalar   *v;
722   PetscErrorCode    ierr;
723   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
724   const PetscInt    *aj=a->j,*ai=a->i,*ib;
725   PetscInt          nonzerorow=0;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
729   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
730   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
731 
732   v  = a->a;
733   xb = x;
734 
735   for (i=0; i<mbs; i++) {
736     n           = ai[1] - ai[0]; /* length of i_th block row of A */
737     x1          = xb[0]; x2 = xb[1];
738     ib          = aj + *ai;
739     jmin        = 0;
740     nonzerorow += (n>0);
741     if (*ib == i) {      /* (diag of A)*x */
742       z[2*i]   += v[0]*x1 + v[2]*x2;
743       z[2*i+1] += v[2]*x1 + v[3]*x2;
744       v        += 4; jmin++;
745     }
746     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
747     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
748     for (j=jmin; j<n; j++) {
749       /* (strict lower triangular part of A)*x  */
750       cval       = ib[j]*2;
751       z[cval]   += v[0]*x1 + v[1]*x2;
752       z[cval+1] += v[2]*x1 + v[3]*x2;
753       /* (strict upper triangular part of A)*x  */
754       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
755       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
756       v        += 4;
757     }
758     xb +=2; ai++;
759   }
760   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
761   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
762 
763   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 #undef __FUNCT__
768 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
769 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
770 {
771   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
772   PetscScalar       *z,x1,x2,x3;
773   const PetscScalar *x,*xb;
774   const MatScalar   *v;
775   PetscErrorCode    ierr;
776   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
777   const PetscInt    *aj=a->j,*ai=a->i,*ib;
778   PetscInt          nonzerorow=0;
779 
780   PetscFunctionBegin;
781   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
782   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
783   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
784 
785   v  = a->a;
786   xb = x;
787 
788   for (i=0; i<mbs; i++) {
789     n           = ai[1] - ai[0]; /* length of i_th block row of A */
790     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
791     ib          = aj + *ai;
792     jmin        = 0;
793     nonzerorow += (n>0);
794     if (*ib == i) {     /* (diag of A)*x */
795       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
796       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
797       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
798       v        += 9; jmin++;
799     }
800     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
801     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
802     for (j=jmin; j<n; j++) {
803       /* (strict lower triangular part of A)*x  */
804       cval       = ib[j]*3;
805       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
806       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
807       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
808       /* (strict upper triangular part of A)*x  */
809       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
810       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
811       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
812       v        += 9;
813     }
814     xb +=3; ai++;
815   }
816 
817   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
818   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
819 
820   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
821   PetscFunctionReturn(0);
822 }
823 
824 #undef __FUNCT__
825 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
826 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
827 {
828   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
829   PetscScalar       *z,x1,x2,x3,x4;
830   const PetscScalar *x,*xb;
831   const MatScalar   *v;
832   PetscErrorCode    ierr;
833   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
834   const PetscInt    *aj=a->j,*ai=a->i,*ib;
835   PetscInt          nonzerorow=0;
836 
837   PetscFunctionBegin;
838   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
839   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
840   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
841 
842   v  = a->a;
843   xb = x;
844 
845   for (i=0; i<mbs; i++) {
846     n           = ai[1] - ai[0]; /* length of i_th block row of A */
847     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
848     ib          = aj + *ai;
849     jmin        = 0;
850     nonzerorow += (n>0);
851     if (*ib == i) {      /* (diag of A)*x */
852       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
853       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
854       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
855       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
856       v        += 16; jmin++;
857     }
858     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860     for (j=jmin; j<n; j++) {
861       /* (strict lower triangular part of A)*x  */
862       cval       = ib[j]*4;
863       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
864       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
865       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
866       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
867       /* (strict upper triangular part of A)*x  */
868       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
869       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
870       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
871       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
872       v        += 16;
873     }
874     xb +=4; ai++;
875   }
876 
877   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
878   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
879 
880   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
881   PetscFunctionReturn(0);
882 }
883 
884 #undef __FUNCT__
885 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
886 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
887 {
888   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
889   PetscScalar       *z,x1,x2,x3,x4,x5;
890   const PetscScalar *x,*xb;
891   const MatScalar   *v;
892   PetscErrorCode    ierr;
893   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
894   const PetscInt    *aj=a->j,*ai=a->i,*ib;
895   PetscInt          nonzerorow=0;
896 
897   PetscFunctionBegin;
898   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
899   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
900   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
901 
902   v  = a->a;
903   xb = x;
904 
905   for (i=0; i<mbs; i++) {
906     n           = ai[1] - ai[0]; /* length of i_th block row of A */
907     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
908     ib          = aj + *ai;
909     jmin        = 0;
910     nonzerorow += (n>0);
911     if (*ib == i) {      /* (diag of A)*x */
912       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
913       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
914       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
915       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
916       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
917       v        += 25; jmin++;
918     }
919     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
920     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
921     for (j=jmin; j<n; j++) {
922       /* (strict lower triangular part of A)*x  */
923       cval       = ib[j]*5;
924       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
925       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
926       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
927       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
928       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
929       /* (strict upper triangular part of A)*x  */
930       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
931       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
932       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
933       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
934       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
935       v        += 25;
936     }
937     xb +=5; ai++;
938   }
939 
940   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
941   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
942 
943   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
944   PetscFunctionReturn(0);
945 }
946 #undef __FUNCT__
947 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
948 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
949 {
950   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
951   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
952   const PetscScalar *x,*xb;
953   const MatScalar   *v;
954   PetscErrorCode    ierr;
955   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
956   const PetscInt    *aj=a->j,*ai=a->i,*ib;
957   PetscInt          nonzerorow=0;
958 
959   PetscFunctionBegin;
960   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
961   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
962   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
963 
964   v  = a->a;
965   xb = x;
966 
967   for (i=0; i<mbs; i++) {
968     n           = ai[1] - ai[0]; /* length of i_th block row of A */
969     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
970     ib          = aj + *ai;
971     jmin        = 0;
972     nonzerorow += (n>0);
973     if (*ib == i) {     /* (diag of A)*x */
974       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
975       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
976       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
977       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
978       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
979       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
980       v        += 36; jmin++;
981     }
982     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
983     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
984     for (j=jmin; j<n; j++) {
985       /* (strict lower triangular part of A)*x  */
986       cval       = ib[j]*6;
987       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
988       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
989       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
990       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
991       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
992       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
993       /* (strict upper triangular part of A)*x  */
994       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
995       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
996       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
997       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
998       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
999       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1000       v        += 36;
1001     }
1002     xb +=6; ai++;
1003   }
1004 
1005   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1006   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1007 
1008   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 #undef __FUNCT__
1013 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1014 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1015 {
1016   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1017   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1018   const PetscScalar *x,*xb;
1019   const MatScalar   *v;
1020   PetscErrorCode    ierr;
1021   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1022   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1023   PetscInt          nonzerorow=0;
1024 
1025   PetscFunctionBegin;
1026   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1027   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1028   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1029 
1030   v  = a->a;
1031   xb = x;
1032 
1033   for (i=0; i<mbs; i++) {
1034     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1035     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1036     ib          = aj + *ai;
1037     jmin        = 0;
1038     nonzerorow += (n>0);
1039     if (*ib == i) {     /* (diag of A)*x */
1040       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1041       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1042       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1043       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1044       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1045       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1046       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1047       v        += 49; jmin++;
1048     }
1049     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1050     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1051     for (j=jmin; j<n; j++) {
1052       /* (strict lower triangular part of A)*x  */
1053       cval       = ib[j]*7;
1054       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1055       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1056       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1057       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1058       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1059       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1060       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1061       /* (strict upper triangular part of A)*x  */
1062       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1063       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1064       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1065       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1066       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1067       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1068       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1069       v       += 49;
1070     }
1071     xb +=7; ai++;
1072   }
1073 
1074   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1075   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1076 
1077   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 #undef __FUNCT__
1082 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1083 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1084 {
1085   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1086   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1087   const PetscScalar *x,*x_ptr,*xb;
1088   const MatScalar   *v;
1089   PetscErrorCode    ierr;
1090   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1091   const PetscInt    *idx,*aj,*ii;
1092   PetscInt          nonzerorow=0;
1093 
1094   PetscFunctionBegin;
1095   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1096   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1097   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1098 
1099   aj = a->j;
1100   v  = a->a;
1101   ii = a->i;
1102 
1103   if (!a->mult_work) {
1104     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1105   }
1106   work = a->mult_work;
1107 
1108 
1109   for (i=0; i<mbs; i++) {
1110     n           = ii[1] - ii[0]; ncols = n*bs;
1111     workt       = work; idx=aj+ii[0];
1112     nonzerorow += (n>0);
1113 
1114     /* upper triangular part */
1115     for (j=0; j<n; j++) {
1116       xb = x_ptr + bs*(*idx++);
1117       for (k=0; k<bs; k++) workt[k] = xb[k];
1118       workt += bs;
1119     }
1120     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1121     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1122 
1123     /* strict lower triangular part */
1124     idx = aj+ii[0];
1125     if (*idx == i) {
1126       ncols -= bs; v += bs2; idx++; n--;
1127     }
1128     if (ncols > 0) {
1129       workt = work;
1130       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1131       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1132       for (j=0; j<n; j++) {
1133         zb = z_ptr + bs*(*idx++);
1134         for (k=0; k<bs; k++) zb[k] += workt[k];
1135         workt += bs;
1136       }
1137     }
1138 
1139     x += bs; v += n*bs2; z += bs; ii++;
1140   }
1141 
1142   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1143   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1144 
1145   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1146   PetscFunctionReturn(0);
1147 }
1148 
1149 #undef __FUNCT__
1150 #define __FUNCT__ "MatScale_SeqSBAIJ"
1151 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1152 {
1153   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1154   PetscScalar    oalpha = alpha;
1155   PetscErrorCode ierr;
1156   PetscBLASInt   one = 1,totalnz;
1157 
1158   PetscFunctionBegin;
1159   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1160   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1161   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 #undef __FUNCT__
1166 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1167 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1168 {
1169   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1170   const MatScalar *v       = a->a;
1171   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1172   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1173   PetscErrorCode  ierr;
1174   const PetscInt  *aj=a->j,*col;
1175 
1176   PetscFunctionBegin;
1177   if (type == NORM_FROBENIUS) {
1178     for (k=0; k<mbs; k++) {
1179       jmin = a->i[k]; jmax = a->i[k+1];
1180       col  = aj + jmin;
1181       if (*col == k) {         /* diagonal block */
1182         for (i=0; i<bs2; i++) {
1183           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1184         }
1185         jmin++;
1186       }
1187       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1188         for (i=0; i<bs2; i++) {
1189           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1190         }
1191       }
1192     }
1193     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1194   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1195     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1196     for (i=0; i<mbs; i++) jl[i] = mbs;
1197     il[0] = 0;
1198 
1199     *norm = 0.0;
1200     for (k=0; k<mbs; k++) { /* k_th block row */
1201       for (j=0; j<bs; j++) sum[j]=0.0;
1202       /*-- col sum --*/
1203       i = jl[k]; /* first |A(i,k)| to be added */
1204       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1205                   at step k */
1206       while (i<mbs) {
1207         nexti = jl[i];  /* next block row to be added */
1208         ik    = il[i];  /* block index of A(i,k) in the array a */
1209         for (j=0; j<bs; j++) {
1210           v = a->a + ik*bs2 + j*bs;
1211           for (k1=0; k1<bs; k1++) {
1212             sum[j] += PetscAbsScalar(*v); v++;
1213           }
1214         }
1215         /* update il, jl */
1216         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1217         jmax = a->i[i+1];
1218         if (jmin < jmax) {
1219           il[i] = jmin;
1220           j     = a->j[jmin];
1221           jl[i] = jl[j]; jl[j]=i;
1222         }
1223         i = nexti;
1224       }
1225       /*-- row sum --*/
1226       jmin = a->i[k]; jmax = a->i[k+1];
1227       for (i=jmin; i<jmax; i++) {
1228         for (j=0; j<bs; j++) {
1229           v = a->a + i*bs2 + j;
1230           for (k1=0; k1<bs; k1++) {
1231             sum[j] += PetscAbsScalar(*v); v += bs;
1232           }
1233         }
1234       }
1235       /* add k_th block row to il, jl */
1236       col = aj+jmin;
1237       if (*col == k) jmin++;
1238       if (jmin < jmax) {
1239         il[k] = jmin;
1240         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1241       }
1242       for (j=0; j<bs; j++) {
1243         if (sum[j] > *norm) *norm = sum[j];
1244       }
1245     }
1246     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1247   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNCT__
1252 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1253 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1254 {
1255   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1256   PetscErrorCode ierr;
1257 
1258   PetscFunctionBegin;
1259   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1260   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1261     *flg = PETSC_FALSE;
1262     PetscFunctionReturn(0);
1263   }
1264 
1265   /* if the a->i are the same */
1266   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1267   if (!*flg) PetscFunctionReturn(0);
1268 
1269   /* if a->j are the same */
1270   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1271   if (!*flg) PetscFunctionReturn(0);
1272 
1273   /* if a->a are the same */
1274   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1280 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1281 {
1282   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1283   PetscErrorCode  ierr;
1284   PetscInt        i,j,k,row,bs,ambs,bs2;
1285   const PetscInt  *ai,*aj;
1286   PetscScalar     *x,zero = 0.0;
1287   const MatScalar *aa,*aa_j;
1288 
1289   PetscFunctionBegin;
1290   bs = A->rmap->bs;
1291   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1292 
1293   aa   = a->a;
1294   ambs = a->mbs;
1295 
1296   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1297     PetscInt *diag=a->diag;
1298     aa   = a->a;
1299     ambs = a->mbs;
1300     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1301     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1302     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1303     PetscFunctionReturn(0);
1304   }
1305 
1306   ai   = a->i;
1307   aj   = a->j;
1308   bs2  = a->bs2;
1309   ierr = VecSet(v,zero);CHKERRQ(ierr);
1310   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1311   for (i=0; i<ambs; i++) {
1312     j=ai[i];
1313     if (aj[j] == i) {    /* if this is a diagonal element */
1314       row  = i*bs;
1315       aa_j = aa + j*bs2;
1316       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1317     }
1318   }
1319   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1320   PetscFunctionReturn(0);
1321 }
1322 
1323 #undef __FUNCT__
1324 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1325 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1326 {
1327   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1328   PetscScalar       x;
1329   const PetscScalar *l,*li,*ri;
1330   MatScalar         *aa,*v;
1331   PetscErrorCode    ierr;
1332   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1333   PetscBool         flg;
1334 
1335   PetscFunctionBegin;
1336   if (ll != rr) {
1337     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1338     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1339   }
1340   if (!ll) PetscFunctionReturn(0);
1341   ai  = a->i;
1342   aj  = a->j;
1343   aa  = a->a;
1344   m   = A->rmap->N;
1345   bs  = A->rmap->bs;
1346   mbs = a->mbs;
1347   bs2 = a->bs2;
1348 
1349   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1350   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1351   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1352   for (i=0; i<mbs; i++) { /* for each block row */
1353     M  = ai[i+1] - ai[i];
1354     li = l + i*bs;
1355     v  = aa + bs2*ai[i];
1356     for (j=0; j<M; j++) { /* for each block */
1357       ri = l + bs*aj[ai[i]+j];
1358       for (k=0; k<bs; k++) {
1359         x = ri[k];
1360         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1361       }
1362     }
1363   }
1364   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1365   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1366   PetscFunctionReturn(0);
1367 }
1368 
1369 #undef __FUNCT__
1370 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1371 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1372 {
1373   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1374 
1375   PetscFunctionBegin;
1376   info->block_size   = a->bs2;
1377   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1378   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1379   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1380   info->assemblies   = A->num_ass;
1381   info->mallocs      = A->info.mallocs;
1382   info->memory       = ((PetscObject)A)->mem;
1383   if (A->factortype) {
1384     info->fill_ratio_given  = A->info.fill_ratio_given;
1385     info->fill_ratio_needed = A->info.fill_ratio_needed;
1386     info->factor_mallocs    = A->info.factor_mallocs;
1387   } else {
1388     info->fill_ratio_given  = 0;
1389     info->fill_ratio_needed = 0;
1390     info->factor_mallocs    = 0;
1391   }
1392   PetscFunctionReturn(0);
1393 }
1394 
1395 
1396 #undef __FUNCT__
1397 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1398 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1399 {
1400   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1401   PetscErrorCode ierr;
1402 
1403   PetscFunctionBegin;
1404   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 #undef __FUNCT__
1409 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1410 /*
1411    This code does not work since it only checks the upper triangular part of
1412   the matrix. Hence it is not listed in the function table.
1413 */
1414 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1415 {
1416   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1417   PetscErrorCode  ierr;
1418   PetscInt        i,j,n,row,col,bs,mbs;
1419   const PetscInt  *ai,*aj;
1420   PetscReal       atmp;
1421   const MatScalar *aa;
1422   PetscScalar     *x;
1423   PetscInt        ncols,brow,bcol,krow,kcol;
1424 
1425   PetscFunctionBegin;
1426   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1427   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1428   bs  = A->rmap->bs;
1429   aa  = a->a;
1430   ai  = a->i;
1431   aj  = a->j;
1432   mbs = a->mbs;
1433 
1434   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1435   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1436   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1437   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1438   for (i=0; i<mbs; i++) {
1439     ncols = ai[1] - ai[0]; ai++;
1440     brow  = bs*i;
1441     for (j=0; j<ncols; j++) {
1442       bcol = bs*(*aj);
1443       for (kcol=0; kcol<bs; kcol++) {
1444         col = bcol + kcol;      /* col index */
1445         for (krow=0; krow<bs; krow++) {
1446           atmp = PetscAbsScalar(*aa); aa++;
1447           row  = brow + krow;   /* row index */
1448           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1449           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1450         }
1451       }
1452       aj++;
1453     }
1454   }
1455   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1456   PetscFunctionReturn(0);
1457 }
1458