xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision ae5cfb4a3679d07b6904bd2c794eee8dc00ed47b)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
100   PetscErrorCode  ierr;
101   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
103   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104   const PetscInt  *irow,*aj = a->j,*ai = a->i;
105   MatScalar       *mat_a;
106   Mat             C;
107   PetscBool       flag,sorted;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
111   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
112   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   ssmap = smap;
120   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127     for (k=kstart; k<kend; k++) {
128       if (ssmap[aj[k]]) lens[i]++;
129     }
130   }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ*)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
138     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
139     C    = *B;
140   } else {
141     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
142     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
143     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
144     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
145   }
146   c = (Mat_SeqSBAIJ*)(C->data);
147   for (i=0; i<nrows; i++) {
148     row      = irow[i];
149     kstart   = ai[row];
150     kend     = kstart + a->ilen[row];
151     mat_i    = c->i[i];
152     mat_j    = c->j + mat_i;
153     mat_a    = c->a + mat_i*bs2;
154     mat_ilen = c->ilen + i;
155     for (k=kstart; k<kend; k++) {
156       if ((tcol=ssmap[a->j[k]])) {
157         *mat_j++ = tcol - 1;
158         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
159         mat_a   += bs2;
160         (*mat_ilen)++;
161       }
162     }
163   }
164 
165   /* Free work space */
166   ierr = PetscFree(smap);CHKERRQ(ierr);
167   ierr = PetscFree(lens);CHKERRQ(ierr);
168   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170 
171   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
172   *B   = C;
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
179 {
180   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
181   IS             is1;
182   PetscErrorCode ierr;
183   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
184   const PetscInt *irow;
185 
186   PetscFunctionBegin;
187   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
188 
189   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
190   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
191 
192   /* Verify if the indices corespond to each element in a block
193    and form the IS with compressed IS */
194   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
195   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
196   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
197 
198   count = 0;
199   for (i=0; i<a->mbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
201     if (vary[i]==bs) iary[count++] = i;
202   }
203   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
205   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
206 
207   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
208   ierr = ISDestroy(&is1);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
214 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
215 {
216   PetscErrorCode ierr;
217   PetscInt       i;
218 
219   PetscFunctionBegin;
220   if (scall == MAT_INITIAL_MATRIX) {
221     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
222   }
223 
224   for (i=0; i<n; i++) {
225     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 /* -------------------------------------------------------*/
231 /* Should check that shapes of vectors and matrices match */
232 /* -------------------------------------------------------*/
233 
234 #undef __FUNCT__
235 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
236 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
237 {
238   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
239   PetscScalar       *z,x1,x2,zero=0.0;
240   const PetscScalar *x,*xb;
241   const MatScalar   *v;
242   PetscErrorCode    ierr;
243   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
244   const PetscInt    *aj=a->j,*ai=a->i,*ib;
245   PetscInt          nonzerorow=0;
246 
247   PetscFunctionBegin;
248   ierr = VecSet(zz,zero);CHKERRQ(ierr);
249   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
251 
252   v  = a->a;
253   xb = x;
254 
255   for (i=0; i<mbs; i++) {
256     n           = ai[1] - ai[0]; /* length of i_th block row of A */
257     x1          = xb[0]; x2 = xb[1];
258     ib          = aj + *ai;
259     jmin        = 0;
260     nonzerorow += (n>0);
261     if (*ib == i) {     /* (diag of A)*x */
262       z[2*i]   += v[0]*x1 + v[2]*x2;
263       z[2*i+1] += v[2]*x1 + v[3]*x2;
264       v        += 4; jmin++;
265     }
266     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
267     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
268     for (j=jmin; j<n; j++) {
269       /* (strict lower triangular part of A)*x  */
270       cval       = ib[j]*2;
271       z[cval]   += v[0]*x1 + v[1]*x2;
272       z[cval+1] += v[2]*x1 + v[3]*x2;
273       /* (strict upper triangular part of A)*x  */
274       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
275       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
276       v        += 4;
277     }
278     xb +=2; ai++;
279   }
280 
281   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
282   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
283   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
284   PetscFunctionReturn(0);
285 }
286 
287 #undef __FUNCT__
288 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
289 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
290 {
291   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
292   PetscScalar       *z,x1,x2,x3,zero=0.0;
293   const PetscScalar *x,*xb;
294   const MatScalar   *v;
295   PetscErrorCode    ierr;
296   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
297   const PetscInt    *aj = a->j,*ai = a->i,*ib;
298   PetscInt          nonzerorow=0;
299 
300   PetscFunctionBegin;
301   ierr = VecSet(zz,zero);CHKERRQ(ierr);
302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v  = a->a;
306   xb = x;
307 
308   for (i=0; i<mbs; i++) {
309     n           = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
311     ib          = aj + *ai;
312     jmin        = 0;
313     nonzerorow += (n>0);
314     if (*ib == i) {     /* (diag of A)*x */
315       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
316       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
317       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
318       v        += 9; jmin++;
319     }
320     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
321     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
322     for (j=jmin; j<n; j++) {
323       /* (strict lower triangular part of A)*x  */
324       cval       = ib[j]*3;
325       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
326       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
327       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
328       /* (strict upper triangular part of A)*x  */
329       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
330       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
331       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
332       v        += 9;
333     }
334     xb +=3; ai++;
335   }
336 
337   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
339   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
340   PetscFunctionReturn(0);
341 }
342 
343 #undef __FUNCT__
344 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
345 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
346 {
347   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
348   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
349   const PetscScalar *x,*xb;
350   const MatScalar   *v;
351   PetscErrorCode    ierr;
352   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
353   const PetscInt    *aj = a->j,*ai = a->i,*ib;
354   PetscInt          nonzerorow = 0;
355 
356   PetscFunctionBegin;
357   ierr = VecSet(zz,zero);CHKERRQ(ierr);
358   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
359   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
360 
361   v  = a->a;
362   xb = x;
363 
364   for (i=0; i<mbs; i++) {
365     n           = ai[1] - ai[0]; /* length of i_th block row of A */
366     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
367     ib          = aj + *ai;
368     jmin        = 0;
369     nonzerorow += (n>0);
370     if (*ib == i) {     /* (diag of A)*x */
371       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
372       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
373       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
374       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
375       v        += 16; jmin++;
376     }
377     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
378     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
379     for (j=jmin; j<n; j++) {
380       /* (strict lower triangular part of A)*x  */
381       cval       = ib[j]*4;
382       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
383       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
384       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
385       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
386       /* (strict upper triangular part of A)*x  */
387       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
388       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
389       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
390       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
391       v        += 16;
392     }
393     xb +=4; ai++;
394   }
395 
396   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
397   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
398   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
399   PetscFunctionReturn(0);
400 }
401 
402 #undef __FUNCT__
403 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
404 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
405 {
406   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
407   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
408   const PetscScalar *x,*xb;
409   const MatScalar   *v;
410   PetscErrorCode    ierr;
411   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
412   const PetscInt    *aj = a->j,*ai = a->i,*ib;
413   PetscInt          nonzerorow=0;
414 
415   PetscFunctionBegin;
416   ierr = VecSet(zz,zero);CHKERRQ(ierr);
417   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
418   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
419 
420   v  = a->a;
421   xb = x;
422 
423   for (i=0; i<mbs; i++) {
424     n           = ai[1] - ai[0]; /* length of i_th block row of A */
425     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
426     ib          = aj + *ai;
427     jmin        = 0;
428     nonzerorow += (n>0);
429     if (*ib == i) {      /* (diag of A)*x */
430       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
431       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
432       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
433       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
434       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
435       v        += 25; jmin++;
436     }
437     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
438     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439     for (j=jmin; j<n; j++) {
440       /* (strict lower triangular part of A)*x  */
441       cval       = ib[j]*5;
442       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
443       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
444       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
445       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
446       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
447       /* (strict upper triangular part of A)*x  */
448       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
449       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
450       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
451       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
452       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
453       v        += 25;
454     }
455     xb +=5; ai++;
456   }
457 
458   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
459   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
460   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
461   PetscFunctionReturn(0);
462 }
463 
464 
465 #undef __FUNCT__
466 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
467 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
468 {
469   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
470   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
471   const PetscScalar *x,*xb;
472   const MatScalar   *v;
473   PetscErrorCode    ierr;
474   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
475   const PetscInt    *aj=a->j,*ai=a->i,*ib;
476   PetscInt          nonzerorow=0;
477 
478   PetscFunctionBegin;
479   ierr = VecSet(zz,zero);CHKERRQ(ierr);
480   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
481   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
482 
483   v  = a->a;
484   xb = x;
485 
486   for (i=0; i<mbs; i++) {
487     n           = ai[1] - ai[0]; /* length of i_th block row of A */
488     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
489     ib          = aj + *ai;
490     jmin        = 0;
491     nonzerorow += (n>0);
492     if (*ib == i) {      /* (diag of A)*x */
493       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
494       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
495       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
496       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
497       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
498       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
499       v        += 36; jmin++;
500     }
501     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
502     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503     for (j=jmin; j<n; j++) {
504       /* (strict lower triangular part of A)*x  */
505       cval       = ib[j]*6;
506       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
507       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
508       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
509       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
510       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
511       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
512       /* (strict upper triangular part of A)*x  */
513       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
514       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
515       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
516       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
517       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
518       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
519       v        += 36;
520     }
521     xb +=6; ai++;
522   }
523 
524   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
525   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
526   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
527   PetscFunctionReturn(0);
528 }
529 #undef __FUNCT__
530 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
531 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
532 {
533   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
534   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
535   const PetscScalar *x,*xb;
536   const MatScalar   *v;
537   PetscErrorCode    ierr;
538   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
539   const PetscInt    *aj=a->j,*ai=a->i,*ib;
540   PetscInt          nonzerorow=0;
541 
542   PetscFunctionBegin;
543   ierr = VecSet(zz,zero);CHKERRQ(ierr);
544   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
545   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
546 
547   v  = a->a;
548   xb = x;
549 
550   for (i=0; i<mbs; i++) {
551     n           = ai[1] - ai[0]; /* length of i_th block row of A */
552     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
553     ib          = aj + *ai;
554     jmin        = 0;
555     nonzerorow += (n>0);
556     if (*ib == i) {      /* (diag of A)*x */
557       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
558       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
559       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
560       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
561       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
562       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
563       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
564       v        += 49; jmin++;
565     }
566     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
567     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
568     for (j=jmin; j<n; j++) {
569       /* (strict lower triangular part of A)*x  */
570       cval       = ib[j]*7;
571       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
572       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
573       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
574       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
575       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
576       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
577       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
578       /* (strict upper triangular part of A)*x  */
579       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
580       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
581       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
582       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
583       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
584       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
585       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
586       v       += 49;
587     }
588     xb +=7; ai++;
589   }
590   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
591   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
592   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
593   PetscFunctionReturn(0);
594 }
595 
596 /*
597     This will not work with MatScalar == float because it calls the BLAS
598 */
599 #undef __FUNCT__
600 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
601 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
602 {
603   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
604   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
605   const PetscScalar *x,*x_ptr,*xb;
606   const MatScalar   *v;
607   PetscErrorCode    ierr;
608   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
609   const PetscInt    *idx,*aj,*ii;
610   PetscInt          nonzerorow=0;
611 
612   PetscFunctionBegin;
613   ierr = VecSet(zz,zero);CHKERRQ(ierr);
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
615   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
616 
617   aj = a->j;
618   v  = a->a;
619   ii = a->i;
620 
621   if (!a->mult_work) {
622     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
623   }
624   work = a->mult_work;
625 
626   for (i=0; i<mbs; i++) {
627     n           = ii[1] - ii[0]; ncols = n*bs;
628     workt       = work; idx=aj+ii[0];
629     nonzerorow += (n>0);
630 
631     /* upper triangular part */
632     for (j=0; j<n; j++) {
633       xb = x_ptr + bs*(*idx++);
634       for (k=0; k<bs; k++) workt[k] = xb[k];
635       workt += bs;
636     }
637     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
638     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
639 
640     /* strict lower triangular part */
641     idx = aj+ii[0];
642     if (*idx == i) {
643       ncols -= bs; v += bs2; idx++; n--;
644     }
645 
646     if (ncols > 0) {
647       workt = work;
648       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
649       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
650       for (j=0; j<n; j++) {
651         zb = z_ptr + bs*(*idx++);
652         for (k=0; k<bs; k++) zb[k] += workt[k];
653         workt += bs;
654       }
655     }
656     x += bs; v += n*bs2; z += bs; ii++;
657   }
658 
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
661   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
662   PetscFunctionReturn(0);
663 }
664 
665 #undef __FUNCT__
666 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
667 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
668 {
669   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
670   PetscScalar       *z,x1;
671   const PetscScalar *x,*xb;
672   const MatScalar   *v;
673   PetscErrorCode    ierr;
674   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
675   const PetscInt    *aj=a->j,*ai=a->i,*ib;
676   PetscInt          nonzerorow=0;
677 
678   PetscFunctionBegin;
679   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
680   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
681   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
682   v    = a->a;
683   xb   = x;
684 
685   for (i=0; i<mbs; i++) {
686     n           = ai[1] - ai[0]; /* length of i_th row of A */
687     x1          = xb[0];
688     ib          = aj + *ai;
689     jmin        = 0;
690     nonzerorow += (n>0);
691     if (*ib == i) {            /* (diag of A)*x */
692       z[i] += *v++ * x[*ib++]; jmin++;
693     }
694     for (j=jmin; j<n; j++) {
695       cval    = *ib;
696       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
697       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
698     }
699     xb++; ai++;
700   }
701 
702   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
703   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
704 
705   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
706   PetscFunctionReturn(0);
707 }
708 
709 #undef __FUNCT__
710 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
711 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
712 {
713   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
714   PetscScalar       *z,x1,x2;
715   const PetscScalar *x,*xb;
716   const MatScalar   *v;
717   PetscErrorCode    ierr;
718   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
719   const PetscInt    *aj=a->j,*ai=a->i,*ib;
720   PetscInt          nonzerorow=0;
721 
722   PetscFunctionBegin;
723   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
724   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
725   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
726 
727   v  = a->a;
728   xb = x;
729 
730   for (i=0; i<mbs; i++) {
731     n           = ai[1] - ai[0]; /* length of i_th block row of A */
732     x1          = xb[0]; x2 = xb[1];
733     ib          = aj + *ai;
734     jmin        = 0;
735     nonzerorow += (n>0);
736     if (*ib == i) {      /* (diag of A)*x */
737       z[2*i]   += v[0]*x1 + v[2]*x2;
738       z[2*i+1] += v[2]*x1 + v[3]*x2;
739       v        += 4; jmin++;
740     }
741     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
742     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
743     for (j=jmin; j<n; j++) {
744       /* (strict lower triangular part of A)*x  */
745       cval       = ib[j]*2;
746       z[cval]   += v[0]*x1 + v[1]*x2;
747       z[cval+1] += v[2]*x1 + v[3]*x2;
748       /* (strict upper triangular part of A)*x  */
749       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
750       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
751       v        += 4;
752     }
753     xb +=2; ai++;
754   }
755   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
756   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
757 
758   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
759   PetscFunctionReturn(0);
760 }
761 
762 #undef __FUNCT__
763 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
764 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
765 {
766   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
767   PetscScalar       *z,x1,x2,x3;
768   const PetscScalar *x,*xb;
769   const MatScalar   *v;
770   PetscErrorCode    ierr;
771   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
772   const PetscInt    *aj=a->j,*ai=a->i,*ib;
773   PetscInt          nonzerorow=0;
774 
775   PetscFunctionBegin;
776   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
777   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
778   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
779 
780   v  = a->a;
781   xb = x;
782 
783   for (i=0; i<mbs; i++) {
784     n           = ai[1] - ai[0]; /* length of i_th block row of A */
785     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
786     ib          = aj + *ai;
787     jmin        = 0;
788     nonzerorow += (n>0);
789     if (*ib == i) {     /* (diag of A)*x */
790       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
791       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
792       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
793       v        += 9; jmin++;
794     }
795     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
796     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
797     for (j=jmin; j<n; j++) {
798       /* (strict lower triangular part of A)*x  */
799       cval       = ib[j]*3;
800       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
801       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
802       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
803       /* (strict upper triangular part of A)*x  */
804       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
805       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
806       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
807       v        += 9;
808     }
809     xb +=3; ai++;
810   }
811 
812   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
813   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
814 
815   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
816   PetscFunctionReturn(0);
817 }
818 
819 #undef __FUNCT__
820 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
821 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
822 {
823   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
824   PetscScalar       *z,x1,x2,x3,x4;
825   const PetscScalar *x,*xb;
826   const MatScalar   *v;
827   PetscErrorCode    ierr;
828   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
829   const PetscInt    *aj=a->j,*ai=a->i,*ib;
830   PetscInt          nonzerorow=0;
831 
832   PetscFunctionBegin;
833   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
834   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
835   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
836 
837   v  = a->a;
838   xb = x;
839 
840   for (i=0; i<mbs; i++) {
841     n           = ai[1] - ai[0]; /* length of i_th block row of A */
842     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
843     ib          = aj + *ai;
844     jmin        = 0;
845     nonzerorow += (n>0);
846     if (*ib == i) {      /* (diag of A)*x */
847       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
848       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
849       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
850       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
851       v        += 16; jmin++;
852     }
853     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
854     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
855     for (j=jmin; j<n; j++) {
856       /* (strict lower triangular part of A)*x  */
857       cval       = ib[j]*4;
858       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
859       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
860       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
861       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
862       /* (strict upper triangular part of A)*x  */
863       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
864       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
865       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
866       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
867       v        += 16;
868     }
869     xb +=4; ai++;
870   }
871 
872   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
874 
875   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
879 #undef __FUNCT__
880 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
881 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
882 {
883   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
884   PetscScalar       *z,x1,x2,x3,x4,x5;
885   const PetscScalar *x,*xb;
886   const MatScalar   *v;
887   PetscErrorCode    ierr;
888   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
889   const PetscInt    *aj=a->j,*ai=a->i,*ib;
890   PetscInt          nonzerorow=0;
891 
892   PetscFunctionBegin;
893   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
894   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
895   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
896 
897   v  = a->a;
898   xb = x;
899 
900   for (i=0; i<mbs; i++) {
901     n           = ai[1] - ai[0]; /* length of i_th block row of A */
902     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
903     ib          = aj + *ai;
904     jmin        = 0;
905     nonzerorow += (n>0);
906     if (*ib == i) {      /* (diag of A)*x */
907       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
908       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
909       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
910       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
911       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
912       v        += 25; jmin++;
913     }
914     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
915     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
916     for (j=jmin; j<n; j++) {
917       /* (strict lower triangular part of A)*x  */
918       cval       = ib[j]*5;
919       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
920       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
921       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
922       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
923       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
924       /* (strict upper triangular part of A)*x  */
925       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
926       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
927       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
928       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
929       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
930       v        += 25;
931     }
932     xb +=5; ai++;
933   }
934 
935   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
936   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
937 
938   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
939   PetscFunctionReturn(0);
940 }
941 #undef __FUNCT__
942 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
943 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
944 {
945   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
946   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
947   const PetscScalar *x,*xb;
948   const MatScalar   *v;
949   PetscErrorCode    ierr;
950   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
951   const PetscInt    *aj=a->j,*ai=a->i,*ib;
952   PetscInt          nonzerorow=0;
953 
954   PetscFunctionBegin;
955   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
956   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
957   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
958 
959   v  = a->a;
960   xb = x;
961 
962   for (i=0; i<mbs; i++) {
963     n           = ai[1] - ai[0]; /* length of i_th block row of A */
964     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
965     ib          = aj + *ai;
966     jmin        = 0;
967     nonzerorow += (n>0);
968     if (*ib == i) {     /* (diag of A)*x */
969       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
970       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
971       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
972       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
973       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
974       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
975       v        += 36; jmin++;
976     }
977     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979     for (j=jmin; j<n; j++) {
980       /* (strict lower triangular part of A)*x  */
981       cval       = ib[j]*6;
982       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
983       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
984       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
985       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
986       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
987       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
988       /* (strict upper triangular part of A)*x  */
989       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
990       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
991       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
992       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
993       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
994       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
995       v        += 36;
996     }
997     xb +=6; ai++;
998   }
999 
1000   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1001   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1002 
1003   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 #undef __FUNCT__
1008 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1009 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1010 {
1011   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1012   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1013   const PetscScalar *x,*xb;
1014   const MatScalar   *v;
1015   PetscErrorCode    ierr;
1016   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1017   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1018   PetscInt          nonzerorow=0;
1019 
1020   PetscFunctionBegin;
1021   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1022   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1023   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1024 
1025   v  = a->a;
1026   xb = x;
1027 
1028   for (i=0; i<mbs; i++) {
1029     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1030     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1031     ib          = aj + *ai;
1032     jmin        = 0;
1033     nonzerorow += (n>0);
1034     if (*ib == i) {     /* (diag of A)*x */
1035       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1036       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1037       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1038       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1039       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1040       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1041       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1042       v        += 49; jmin++;
1043     }
1044     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1045     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1046     for (j=jmin; j<n; j++) {
1047       /* (strict lower triangular part of A)*x  */
1048       cval       = ib[j]*7;
1049       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1050       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1051       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1052       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1053       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1054       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1055       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1056       /* (strict upper triangular part of A)*x  */
1057       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1058       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1059       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1060       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1061       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1062       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1063       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1064       v       += 49;
1065     }
1066     xb +=7; ai++;
1067   }
1068 
1069   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1070   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1071 
1072   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 #undef __FUNCT__
1077 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1078 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1079 {
1080   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1081   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1082   const PetscScalar *x,*x_ptr,*xb;
1083   const MatScalar   *v;
1084   PetscErrorCode    ierr;
1085   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1086   const PetscInt    *idx,*aj,*ii;
1087   PetscInt          nonzerorow=0;
1088 
1089   PetscFunctionBegin;
1090   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1091   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1092   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1093 
1094   aj = a->j;
1095   v  = a->a;
1096   ii = a->i;
1097 
1098   if (!a->mult_work) {
1099     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1100   }
1101   work = a->mult_work;
1102 
1103 
1104   for (i=0; i<mbs; i++) {
1105     n           = ii[1] - ii[0]; ncols = n*bs;
1106     workt       = work; idx=aj+ii[0];
1107     nonzerorow += (n>0);
1108 
1109     /* upper triangular part */
1110     for (j=0; j<n; j++) {
1111       xb = x_ptr + bs*(*idx++);
1112       for (k=0; k<bs; k++) workt[k] = xb[k];
1113       workt += bs;
1114     }
1115     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1116     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1117 
1118     /* strict lower triangular part */
1119     idx = aj+ii[0];
1120     if (*idx == i) {
1121       ncols -= bs; v += bs2; idx++; n--;
1122     }
1123     if (ncols > 0) {
1124       workt = work;
1125       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1126       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1127       for (j=0; j<n; j++) {
1128         zb = z_ptr + bs*(*idx++);
1129         for (k=0; k<bs; k++) zb[k] += workt[k];
1130         workt += bs;
1131       }
1132     }
1133 
1134     x += bs; v += n*bs2; z += bs; ii++;
1135   }
1136 
1137   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1139 
1140   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNCT__
1145 #define __FUNCT__ "MatScale_SeqSBAIJ"
1146 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1147 {
1148   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1149   PetscScalar    oalpha = alpha;
1150   PetscErrorCode ierr;
1151   PetscBLASInt   one = 1,totalnz;
1152 
1153   PetscFunctionBegin;
1154   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1155   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1156   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 #undef __FUNCT__
1161 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1162 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1163 {
1164   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1165   const MatScalar *v       = a->a;
1166   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1167   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1168   PetscErrorCode  ierr;
1169   const PetscInt  *aj=a->j,*col;
1170 
1171   PetscFunctionBegin;
1172   if (type == NORM_FROBENIUS) {
1173     for (k=0; k<mbs; k++) {
1174       jmin = a->i[k]; jmax = a->i[k+1];
1175       col  = aj + jmin;
1176       if (*col == k) {         /* diagonal block */
1177         for (i=0; i<bs2; i++) {
1178           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1179         }
1180         jmin++;
1181       }
1182       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1183         for (i=0; i<bs2; i++) {
1184           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1185         }
1186       }
1187     }
1188     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1189   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1190     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1191     for (i=0; i<mbs; i++) jl[i] = mbs;
1192     il[0] = 0;
1193 
1194     *norm = 0.0;
1195     for (k=0; k<mbs; k++) { /* k_th block row */
1196       for (j=0; j<bs; j++) sum[j]=0.0;
1197       /*-- col sum --*/
1198       i = jl[k]; /* first |A(i,k)| to be added */
1199       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1200                   at step k */
1201       while (i<mbs) {
1202         nexti = jl[i];  /* next block row to be added */
1203         ik    = il[i];  /* block index of A(i,k) in the array a */
1204         for (j=0; j<bs; j++) {
1205           v = a->a + ik*bs2 + j*bs;
1206           for (k1=0; k1<bs; k1++) {
1207             sum[j] += PetscAbsScalar(*v); v++;
1208           }
1209         }
1210         /* update il, jl */
1211         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1212         jmax = a->i[i+1];
1213         if (jmin < jmax) {
1214           il[i] = jmin;
1215           j     = a->j[jmin];
1216           jl[i] = jl[j]; jl[j]=i;
1217         }
1218         i = nexti;
1219       }
1220       /*-- row sum --*/
1221       jmin = a->i[k]; jmax = a->i[k+1];
1222       for (i=jmin; i<jmax; i++) {
1223         for (j=0; j<bs; j++) {
1224           v = a->a + i*bs2 + j;
1225           for (k1=0; k1<bs; k1++) {
1226             sum[j] += PetscAbsScalar(*v); v += bs;
1227           }
1228         }
1229       }
1230       /* add k_th block row to il, jl */
1231       col = aj+jmin;
1232       if (*col == k) jmin++;
1233       if (jmin < jmax) {
1234         il[k] = jmin;
1235         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1236       }
1237       for (j=0; j<bs; j++) {
1238         if (sum[j] > *norm) *norm = sum[j];
1239       }
1240     }
1241     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1242   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNCT__
1247 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1248 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1249 {
1250   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1251   PetscErrorCode ierr;
1252 
1253   PetscFunctionBegin;
1254   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1255   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1256     *flg = PETSC_FALSE;
1257     PetscFunctionReturn(0);
1258   }
1259 
1260   /* if the a->i are the same */
1261   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1262   if (!*flg) PetscFunctionReturn(0);
1263 
1264   /* if a->j are the same */
1265   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1266   if (!*flg) PetscFunctionReturn(0);
1267 
1268   /* if a->a are the same */
1269   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1270   PetscFunctionReturn(0);
1271 }
1272 
1273 #undef __FUNCT__
1274 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1275 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1276 {
1277   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1278   PetscErrorCode  ierr;
1279   PetscInt        i,j,k,row,bs,ambs,bs2;
1280   const PetscInt  *ai,*aj;
1281   PetscScalar     *x,zero = 0.0;
1282   const MatScalar *aa,*aa_j;
1283 
1284   PetscFunctionBegin;
1285   bs = A->rmap->bs;
1286   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1287 
1288   aa   = a->a;
1289   ambs = a->mbs;
1290 
1291   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1292     PetscInt *diag=a->diag;
1293     aa   = a->a;
1294     ambs = a->mbs;
1295     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1296     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1297     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1298     PetscFunctionReturn(0);
1299   }
1300 
1301   ai   = a->i;
1302   aj   = a->j;
1303   bs2  = a->bs2;
1304   ierr = VecSet(v,zero);CHKERRQ(ierr);
1305   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1306   for (i=0; i<ambs; i++) {
1307     j=ai[i];
1308     if (aj[j] == i) {    /* if this is a diagonal element */
1309       row  = i*bs;
1310       aa_j = aa + j*bs2;
1311       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1312     }
1313   }
1314   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1320 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1321 {
1322   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1323   PetscScalar       x;
1324   const PetscScalar *l,*li,*ri;
1325   MatScalar         *aa,*v;
1326   PetscErrorCode    ierr;
1327   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1328   PetscBool         flg;
1329 
1330   PetscFunctionBegin;
1331   if (ll != rr) {
1332     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1333     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1334   }
1335   if (!ll) PetscFunctionReturn(0);
1336   ai  = a->i;
1337   aj  = a->j;
1338   aa  = a->a;
1339   m   = A->rmap->N;
1340   bs  = A->rmap->bs;
1341   mbs = a->mbs;
1342   bs2 = a->bs2;
1343 
1344   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1345   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1346   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1347   for (i=0; i<mbs; i++) { /* for each block row */
1348     M  = ai[i+1] - ai[i];
1349     li = l + i*bs;
1350     v  = aa + bs2*ai[i];
1351     for (j=0; j<M; j++) { /* for each block */
1352       ri = l + bs*aj[ai[i]+j];
1353       for (k=0; k<bs; k++) {
1354         x = ri[k];
1355         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1356       }
1357     }
1358   }
1359   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1360   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1366 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1367 {
1368   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1369 
1370   PetscFunctionBegin;
1371   info->block_size   = a->bs2;
1372   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1373   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1374   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1375   info->assemblies   = A->num_ass;
1376   info->mallocs      = A->info.mallocs;
1377   info->memory       = ((PetscObject)A)->mem;
1378   if (A->factortype) {
1379     info->fill_ratio_given  = A->info.fill_ratio_given;
1380     info->fill_ratio_needed = A->info.fill_ratio_needed;
1381     info->factor_mallocs    = A->info.factor_mallocs;
1382   } else {
1383     info->fill_ratio_given  = 0;
1384     info->fill_ratio_needed = 0;
1385     info->factor_mallocs    = 0;
1386   }
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 
1391 #undef __FUNCT__
1392 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1393 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1394 {
1395   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1396   PetscErrorCode ierr;
1397 
1398   PetscFunctionBegin;
1399   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 #undef __FUNCT__
1404 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1405 /*
1406    This code does not work since it only checks the upper triangular part of
1407   the matrix. Hence it is not listed in the function table.
1408 */
1409 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1410 {
1411   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1412   PetscErrorCode  ierr;
1413   PetscInt        i,j,n,row,col,bs,mbs;
1414   const PetscInt  *ai,*aj;
1415   PetscReal       atmp;
1416   const MatScalar *aa;
1417   PetscScalar     *x;
1418   PetscInt        ncols,brow,bcol,krow,kcol;
1419 
1420   PetscFunctionBegin;
1421   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1422   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1423   bs  = A->rmap->bs;
1424   aa  = a->a;
1425   ai  = a->i;
1426   aj  = a->j;
1427   mbs = a->mbs;
1428 
1429   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1430   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1431   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1432   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1433   for (i=0; i<mbs; i++) {
1434     ncols = ai[1] - ai[0]; ai++;
1435     brow  = bs*i;
1436     for (j=0; j<ncols; j++) {
1437       bcol = bs*(*aj);
1438       for (kcol=0; kcol<bs; kcol++) {
1439         col = bcol + kcol;      /* col index */
1440         for (krow=0; krow<bs; krow++) {
1441           atmp = PetscAbsScalar(*aa); aa++;
1442           row  = brow + krow;   /* row index */
1443           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1444           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1445         }
1446       }
1447       aj++;
1448     }
1449   }
1450   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1451   PetscFunctionReturn(0);
1452 }
1453