xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 7cf71cfdedeb9639d8ef2da3c41fb734efb627ee)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
96         Zero some ops' to avoid invalid usse */
97 #undef __FUNCT__
98 #define __FUNCT__ "MatSeqSBAIJZeroOps_Private"
99 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
100 {
101   PetscErrorCode ierr;
102 
103   PetscFunctionBegin;
104   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
105   Bseq->ops->mult                   = 0;
106   Bseq->ops->multadd                = 0;
107   Bseq->ops->multtranspose          = 0;
108   Bseq->ops->multtransposeadd       = 0;
109   Bseq->ops->lufactor               = 0;
110   Bseq->ops->choleskyfactor         = 0;
111   Bseq->ops->lufactorsymbolic       = 0;
112   Bseq->ops->choleskyfactorsymbolic = 0;
113   Bseq->ops->getinertia             = 0;
114   PetscFunctionReturn(0);
115 }
116 
117 /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
118 #undef __FUNCT__
119 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
120 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
121 {
122   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
123   PetscErrorCode ierr;
124   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
125   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
126   const PetscInt *irow,*icol;
127   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
128   PetscInt       *aj = a->j,*ai = a->i;
129   MatScalar      *mat_a;
130   Mat            C;
131   PetscBool      flag;
132 
133   PetscFunctionBegin;
134 
135   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
136   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
137   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
138   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
139 
140   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
141   ssmap = smap;
142   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
143   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
144   /* determine lens of each row */
145   for (i=0; i<nrows; i++) {
146     kstart  = ai[irow[i]];
147     kend    = kstart + a->ilen[irow[i]];
148     lens[i] = 0;
149     for (k=kstart; k<kend; k++) {
150       if (ssmap[aj[k]]) lens[i]++;
151     }
152   }
153   /* Create and fill new matrix */
154   if (scall == MAT_REUSE_MATRIX) {
155     c = (Mat_SeqSBAIJ*)((*B)->data);
156 
157     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
158     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
159     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
160     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
161     C    = *B;
162   } else {
163     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
164     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
165     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
166     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
167   }
168   c = (Mat_SeqSBAIJ*)(C->data);
169   for (i=0; i<nrows; i++) {
170     row      = irow[i];
171     kstart   = ai[row];
172     kend     = kstart + a->ilen[row];
173     mat_i    = c->i[i];
174     mat_j    = c->j + mat_i;
175     mat_a    = c->a + mat_i*bs2;
176     mat_ilen = c->ilen + i;
177     for (k=kstart; k<kend; k++) {
178       if ((tcol=ssmap[a->j[k]])) {
179         *mat_j++ = tcol - 1;
180         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
181         mat_a   += bs2;
182         (*mat_ilen)++;
183       }
184     }
185   }
186   /* sort */
187   {
188     MatScalar *work;
189 
190     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
191     for (i=0; i<nrows; i++) {
192       PetscInt ilen;
193       mat_i = c->i[i];
194       mat_j = c->j + mat_i;
195       mat_a = c->a + mat_i*bs2;
196       ilen  = c->ilen[i];
197       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
198     }
199     ierr = PetscFree(work);CHKERRQ(ierr);
200   }
201 
202   /* Free work space */
203   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
204   ierr = PetscFree(smap);CHKERRQ(ierr);
205   ierr = PetscFree(lens);CHKERRQ(ierr);
206   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
207   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
208 
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   *B   = C;
211   PetscFunctionReturn(0);
212 }
213 
214 #undef __FUNCT__
215 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
216 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
217 {
218   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
219   IS             is1,is2;
220   PetscErrorCode ierr;
221   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
222   const PetscInt *irow,*icol;
223 
224   PetscFunctionBegin;
225   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
226   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
227   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
228   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
229 
230   /* Verify if the indices corespond to each element in a block
231    and form the IS with compressed IS */
232   maxmnbs = PetscMax(a->mbs,a->nbs);
233   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
234   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
235   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
236   for (i=0; i<a->mbs; i++) {
237     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
238   }
239   count = 0;
240   for (i=0; i<nrows; i++) {
241     PetscInt j = irow[i] / bs;
242     if ((vary[j]--)==bs) iary[count++] = j;
243   }
244   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
245 
246   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
247   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
248   for (i=0; i<a->nbs; i++) {
249     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
250   }
251   count = 0;
252   for (i=0; i<ncols; i++) {
253     PetscInt j = icol[i] / bs;
254     if ((vary[j]--)==bs) iary[count++] = j;
255   }
256   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
257   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
258   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
259   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
260 
261   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
262   ierr = ISDestroy(&is1);CHKERRQ(ierr);
263   ierr = ISDestroy(&is2);CHKERRQ(ierr);
264 
265   if (isrow != iscol) {
266     PetscBool isequal;
267     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
268     if (!isequal) {
269       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
270     }
271   }
272   PetscFunctionReturn(0);
273 }
274 
275 #undef __FUNCT__
276 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
277 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
278 {
279   PetscErrorCode ierr;
280   PetscInt       i;
281 
282   PetscFunctionBegin;
283   if (scall == MAT_INITIAL_MATRIX) {
284     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
285   }
286 
287   for (i=0; i<n; i++) {
288     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
289   }
290   PetscFunctionReturn(0);
291 }
292 
293 /* -------------------------------------------------------*/
294 /* Should check that shapes of vectors and matrices match */
295 /* -------------------------------------------------------*/
296 
297 #undef __FUNCT__
298 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
299 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
300 {
301   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
302   PetscScalar       *z,x1,x2,zero=0.0;
303   const PetscScalar *x,*xb;
304   const MatScalar   *v;
305   PetscErrorCode    ierr;
306   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
307   const PetscInt    *aj=a->j,*ai=a->i,*ib;
308   PetscInt          nonzerorow=0;
309 
310   PetscFunctionBegin;
311   ierr = VecSet(zz,zero);CHKERRQ(ierr);
312   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
313   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
314 
315   v  = a->a;
316   xb = x;
317 
318   for (i=0; i<mbs; i++) {
319     n           = ai[1] - ai[0]; /* length of i_th block row of A */
320     x1          = xb[0]; x2 = xb[1];
321     ib          = aj + *ai;
322     jmin        = 0;
323     nonzerorow += (n>0);
324     if (*ib == i) {     /* (diag of A)*x */
325       z[2*i]   += v[0]*x1 + v[2]*x2;
326       z[2*i+1] += v[2]*x1 + v[3]*x2;
327       v        += 4; jmin++;
328     }
329     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
330     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
331     for (j=jmin; j<n; j++) {
332       /* (strict lower triangular part of A)*x  */
333       cval       = ib[j]*2;
334       z[cval]   += v[0]*x1 + v[1]*x2;
335       z[cval+1] += v[2]*x1 + v[3]*x2;
336       /* (strict upper triangular part of A)*x  */
337       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
338       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
339       v        += 4;
340     }
341     xb +=2; ai++;
342   }
343 
344   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
345   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
346   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
347   PetscFunctionReturn(0);
348 }
349 
350 #undef __FUNCT__
351 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
352 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
353 {
354   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
355   PetscScalar       *z,x1,x2,x3,zero=0.0;
356   const PetscScalar *x,*xb;
357   const MatScalar   *v;
358   PetscErrorCode    ierr;
359   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
360   const PetscInt    *aj = a->j,*ai = a->i,*ib;
361   PetscInt          nonzerorow=0;
362 
363   PetscFunctionBegin;
364   ierr = VecSet(zz,zero);CHKERRQ(ierr);
365   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
366   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
367 
368   v  = a->a;
369   xb = x;
370 
371   for (i=0; i<mbs; i++) {
372     n           = ai[1] - ai[0]; /* length of i_th block row of A */
373     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
374     ib          = aj + *ai;
375     jmin        = 0;
376     nonzerorow += (n>0);
377     if (*ib == i) {     /* (diag of A)*x */
378       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
379       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
380       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
381       v        += 9; jmin++;
382     }
383     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
384     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
385     for (j=jmin; j<n; j++) {
386       /* (strict lower triangular part of A)*x  */
387       cval       = ib[j]*3;
388       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
389       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
390       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
391       /* (strict upper triangular part of A)*x  */
392       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
393       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
394       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
395       v        += 9;
396     }
397     xb +=3; ai++;
398   }
399 
400   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
401   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
402   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
408 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
409 {
410   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
412   const PetscScalar *x,*xb;
413   const MatScalar   *v;
414   PetscErrorCode    ierr;
415   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416   const PetscInt    *aj = a->j,*ai = a->i,*ib;
417   PetscInt          nonzerorow = 0;
418 
419   PetscFunctionBegin;
420   ierr = VecSet(zz,zero);CHKERRQ(ierr);
421   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
422   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
423 
424   v  = a->a;
425   xb = x;
426 
427   for (i=0; i<mbs; i++) {
428     n           = ai[1] - ai[0]; /* length of i_th block row of A */
429     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
430     ib          = aj + *ai;
431     jmin        = 0;
432     nonzerorow += (n>0);
433     if (*ib == i) {     /* (diag of A)*x */
434       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
435       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
436       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
437       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
438       v        += 16; jmin++;
439     }
440     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
441     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
442     for (j=jmin; j<n; j++) {
443       /* (strict lower triangular part of A)*x  */
444       cval       = ib[j]*4;
445       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
446       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
447       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
448       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
449       /* (strict upper triangular part of A)*x  */
450       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
451       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
452       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
453       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
454       v        += 16;
455     }
456     xb +=4; ai++;
457   }
458 
459   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
460   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
461   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
462   PetscFunctionReturn(0);
463 }
464 
465 #undef __FUNCT__
466 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
467 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
468 {
469   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
470   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
471   const PetscScalar *x,*xb;
472   const MatScalar   *v;
473   PetscErrorCode    ierr;
474   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
475   const PetscInt    *aj = a->j,*ai = a->i,*ib;
476   PetscInt          nonzerorow=0;
477 
478   PetscFunctionBegin;
479   ierr = VecSet(zz,zero);CHKERRQ(ierr);
480   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
481   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
482 
483   v  = a->a;
484   xb = x;
485 
486   for (i=0; i<mbs; i++) {
487     n           = ai[1] - ai[0]; /* length of i_th block row of A */
488     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
489     ib          = aj + *ai;
490     jmin        = 0;
491     nonzerorow += (n>0);
492     if (*ib == i) {      /* (diag of A)*x */
493       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
494       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
495       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
496       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
497       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
498       v        += 25; jmin++;
499     }
500     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
501     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502     for (j=jmin; j<n; j++) {
503       /* (strict lower triangular part of A)*x  */
504       cval       = ib[j]*5;
505       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
506       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
507       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
508       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
509       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
510       /* (strict upper triangular part of A)*x  */
511       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
512       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
513       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
514       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
515       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
516       v        += 25;
517     }
518     xb +=5; ai++;
519   }
520 
521   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
522   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
523   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 
528 #undef __FUNCT__
529 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
530 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
531 {
532   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
533   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
534   const PetscScalar *x,*xb;
535   const MatScalar   *v;
536   PetscErrorCode    ierr;
537   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
538   const PetscInt    *aj=a->j,*ai=a->i,*ib;
539   PetscInt          nonzerorow=0;
540 
541   PetscFunctionBegin;
542   ierr = VecSet(zz,zero);CHKERRQ(ierr);
543   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
544   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
545 
546   v  = a->a;
547   xb = x;
548 
549   for (i=0; i<mbs; i++) {
550     n           = ai[1] - ai[0]; /* length of i_th block row of A */
551     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
552     ib          = aj + *ai;
553     jmin        = 0;
554     nonzerorow += (n>0);
555     if (*ib == i) {      /* (diag of A)*x */
556       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
557       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
558       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
559       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
560       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
561       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
562       v        += 36; jmin++;
563     }
564     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
565     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
566     for (j=jmin; j<n; j++) {
567       /* (strict lower triangular part of A)*x  */
568       cval       = ib[j]*6;
569       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
570       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
571       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
572       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
573       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
574       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
575       /* (strict upper triangular part of A)*x  */
576       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
577       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
578       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
579       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
580       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
581       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
582       v        += 36;
583     }
584     xb +=6; ai++;
585   }
586 
587   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
588   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
589   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
590   PetscFunctionReturn(0);
591 }
592 #undef __FUNCT__
593 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
594 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
595 {
596   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
597   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
598   const PetscScalar *x,*xb;
599   const MatScalar   *v;
600   PetscErrorCode    ierr;
601   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
602   const PetscInt    *aj=a->j,*ai=a->i,*ib;
603   PetscInt          nonzerorow=0;
604 
605   PetscFunctionBegin;
606   ierr = VecSet(zz,zero);CHKERRQ(ierr);
607   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
608   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
609 
610   v  = a->a;
611   xb = x;
612 
613   for (i=0; i<mbs; i++) {
614     n           = ai[1] - ai[0]; /* length of i_th block row of A */
615     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
616     ib          = aj + *ai;
617     jmin        = 0;
618     nonzerorow += (n>0);
619     if (*ib == i) {      /* (diag of A)*x */
620       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
621       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
622       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
623       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
624       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
625       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
626       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
627       v        += 49; jmin++;
628     }
629     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
630     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
631     for (j=jmin; j<n; j++) {
632       /* (strict lower triangular part of A)*x  */
633       cval       = ib[j]*7;
634       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
635       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
636       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
637       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
638       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
639       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
640       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
641       /* (strict upper triangular part of A)*x  */
642       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
643       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
644       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
645       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
646       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
647       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
648       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
649       v       += 49;
650     }
651     xb +=7; ai++;
652   }
653   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
654   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
655   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
656   PetscFunctionReturn(0);
657 }
658 
659 /*
660     This will not work with MatScalar == float because it calls the BLAS
661 */
662 #undef __FUNCT__
663 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
664 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
665 {
666   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
667   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
668   const PetscScalar *x,*x_ptr,*xb;
669   const MatScalar   *v;
670   PetscErrorCode    ierr;
671   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
672   const PetscInt    *idx,*aj,*ii;
673   PetscInt          nonzerorow=0;
674 
675   PetscFunctionBegin;
676   ierr = VecSet(zz,zero);CHKERRQ(ierr);
677   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
678   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
679 
680   aj = a->j;
681   v  = a->a;
682   ii = a->i;
683 
684   if (!a->mult_work) {
685     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
686   }
687   work = a->mult_work;
688 
689   for (i=0; i<mbs; i++) {
690     n           = ii[1] - ii[0]; ncols = n*bs;
691     workt       = work; idx=aj+ii[0];
692     nonzerorow += (n>0);
693 
694     /* upper triangular part */
695     for (j=0; j<n; j++) {
696       xb = x_ptr + bs*(*idx++);
697       for (k=0; k<bs; k++) workt[k] = xb[k];
698       workt += bs;
699     }
700     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
701     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
702 
703     /* strict lower triangular part */
704     idx = aj+ii[0];
705     if (*idx == i) {
706       ncols -= bs; v += bs2; idx++; n--;
707     }
708 
709     if (ncols > 0) {
710       workt = work;
711       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
712       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
713       for (j=0; j<n; j++) {
714         zb = z_ptr + bs*(*idx++);
715         for (k=0; k<bs; k++) zb[k] += workt[k];
716         workt += bs;
717       }
718     }
719     x += bs; v += n*bs2; z += bs; ii++;
720   }
721 
722   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
723   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
724   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 
728 #undef __FUNCT__
729 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
730 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
731 {
732   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
733   PetscScalar       *z,x1;
734   const PetscScalar *x,*xb;
735   const MatScalar   *v;
736   PetscErrorCode    ierr;
737   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
738   const PetscInt    *aj=a->j,*ai=a->i,*ib;
739   PetscInt          nonzerorow=0;
740 
741   PetscFunctionBegin;
742   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
743   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
744   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
745   v    = a->a;
746   xb   = x;
747 
748   for (i=0; i<mbs; i++) {
749     n           = ai[1] - ai[0]; /* length of i_th row of A */
750     x1          = xb[0];
751     ib          = aj + *ai;
752     jmin        = 0;
753     nonzerorow += (n>0);
754     if (*ib == i) {            /* (diag of A)*x */
755       z[i] += *v++ * x[*ib++]; jmin++;
756     }
757     for (j=jmin; j<n; j++) {
758       cval    = *ib;
759       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
760       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
761     }
762     xb++; ai++;
763   }
764 
765   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
766   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
767 
768   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 #undef __FUNCT__
773 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
774 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
775 {
776   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
777   PetscScalar       *z,x1,x2;
778   const PetscScalar *x,*xb;
779   const MatScalar   *v;
780   PetscErrorCode    ierr;
781   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
782   const PetscInt    *aj=a->j,*ai=a->i,*ib;
783   PetscInt          nonzerorow=0;
784 
785   PetscFunctionBegin;
786   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
787   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
788   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
789 
790   v  = a->a;
791   xb = x;
792 
793   for (i=0; i<mbs; i++) {
794     n           = ai[1] - ai[0]; /* length of i_th block row of A */
795     x1          = xb[0]; x2 = xb[1];
796     ib          = aj + *ai;
797     jmin        = 0;
798     nonzerorow += (n>0);
799     if (*ib == i) {      /* (diag of A)*x */
800       z[2*i]   += v[0]*x1 + v[2]*x2;
801       z[2*i+1] += v[2]*x1 + v[3]*x2;
802       v        += 4; jmin++;
803     }
804     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
805     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
806     for (j=jmin; j<n; j++) {
807       /* (strict lower triangular part of A)*x  */
808       cval       = ib[j]*2;
809       z[cval]   += v[0]*x1 + v[1]*x2;
810       z[cval+1] += v[2]*x1 + v[3]*x2;
811       /* (strict upper triangular part of A)*x  */
812       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
813       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
814       v        += 4;
815     }
816     xb +=2; ai++;
817   }
818   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
819   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
820 
821   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
822   PetscFunctionReturn(0);
823 }
824 
825 #undef __FUNCT__
826 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
827 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828 {
829   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
830   PetscScalar       *z,x1,x2,x3;
831   const PetscScalar *x,*xb;
832   const MatScalar   *v;
833   PetscErrorCode    ierr;
834   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
835   const PetscInt    *aj=a->j,*ai=a->i,*ib;
836   PetscInt          nonzerorow=0;
837 
838   PetscFunctionBegin;
839   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
840   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
841   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
842 
843   v  = a->a;
844   xb = x;
845 
846   for (i=0; i<mbs; i++) {
847     n           = ai[1] - ai[0]; /* length of i_th block row of A */
848     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
849     ib          = aj + *ai;
850     jmin        = 0;
851     nonzerorow += (n>0);
852     if (*ib == i) {     /* (diag of A)*x */
853       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
854       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
855       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
856       v        += 9; jmin++;
857     }
858     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
860     for (j=jmin; j<n; j++) {
861       /* (strict lower triangular part of A)*x  */
862       cval       = ib[j]*3;
863       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
864       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
865       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
866       /* (strict upper triangular part of A)*x  */
867       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
868       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
869       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
870       v        += 9;
871     }
872     xb +=3; ai++;
873   }
874 
875   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
876   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
877 
878   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
879   PetscFunctionReturn(0);
880 }
881 
882 #undef __FUNCT__
883 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
884 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
885 {
886   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
887   PetscScalar       *z,x1,x2,x3,x4;
888   const PetscScalar *x,*xb;
889   const MatScalar   *v;
890   PetscErrorCode    ierr;
891   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
892   const PetscInt    *aj=a->j,*ai=a->i,*ib;
893   PetscInt          nonzerorow=0;
894 
895   PetscFunctionBegin;
896   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
897   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
898   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
899 
900   v  = a->a;
901   xb = x;
902 
903   for (i=0; i<mbs; i++) {
904     n           = ai[1] - ai[0]; /* length of i_th block row of A */
905     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
906     ib          = aj + *ai;
907     jmin        = 0;
908     nonzerorow += (n>0);
909     if (*ib == i) {      /* (diag of A)*x */
910       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
911       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
912       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
913       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
914       v        += 16; jmin++;
915     }
916     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
917     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
918     for (j=jmin; j<n; j++) {
919       /* (strict lower triangular part of A)*x  */
920       cval       = ib[j]*4;
921       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
922       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
923       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
924       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
925       /* (strict upper triangular part of A)*x  */
926       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
927       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
928       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
929       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
930       v        += 16;
931     }
932     xb +=4; ai++;
933   }
934 
935   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
936   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
937 
938   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
939   PetscFunctionReturn(0);
940 }
941 
942 #undef __FUNCT__
943 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
944 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
945 {
946   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
947   PetscScalar       *z,x1,x2,x3,x4,x5;
948   const PetscScalar *x,*xb;
949   const MatScalar   *v;
950   PetscErrorCode    ierr;
951   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
952   const PetscInt    *aj=a->j,*ai=a->i,*ib;
953   PetscInt          nonzerorow=0;
954 
955   PetscFunctionBegin;
956   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
957   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
958   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
959 
960   v  = a->a;
961   xb = x;
962 
963   for (i=0; i<mbs; i++) {
964     n           = ai[1] - ai[0]; /* length of i_th block row of A */
965     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
966     ib          = aj + *ai;
967     jmin        = 0;
968     nonzerorow += (n>0);
969     if (*ib == i) {      /* (diag of A)*x */
970       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
971       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
972       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
973       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
974       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
975       v        += 25; jmin++;
976     }
977     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979     for (j=jmin; j<n; j++) {
980       /* (strict lower triangular part of A)*x  */
981       cval       = ib[j]*5;
982       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
983       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
984       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
985       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
986       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
987       /* (strict upper triangular part of A)*x  */
988       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
989       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
990       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
991       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
992       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
993       v        += 25;
994     }
995     xb +=5; ai++;
996   }
997 
998   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
999   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1000 
1001   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 #undef __FUNCT__
1005 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
1006 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1007 {
1008   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1009   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1010   const PetscScalar *x,*xb;
1011   const MatScalar   *v;
1012   PetscErrorCode    ierr;
1013   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1014   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1015   PetscInt          nonzerorow=0;
1016 
1017   PetscFunctionBegin;
1018   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1019   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1020   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1021 
1022   v  = a->a;
1023   xb = x;
1024 
1025   for (i=0; i<mbs; i++) {
1026     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1027     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1028     ib          = aj + *ai;
1029     jmin        = 0;
1030     nonzerorow += (n>0);
1031     if (*ib == i) {     /* (diag of A)*x */
1032       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1033       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1034       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1035       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1036       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1037       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1038       v        += 36; jmin++;
1039     }
1040     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1041     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1042     for (j=jmin; j<n; j++) {
1043       /* (strict lower triangular part of A)*x  */
1044       cval       = ib[j]*6;
1045       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1046       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1047       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1048       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1049       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1050       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1051       /* (strict upper triangular part of A)*x  */
1052       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1053       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1054       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1055       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1056       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1057       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1058       v        += 36;
1059     }
1060     xb +=6; ai++;
1061   }
1062 
1063   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1064   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1065 
1066   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1067   PetscFunctionReturn(0);
1068 }
1069 
1070 #undef __FUNCT__
1071 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1072 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1073 {
1074   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1075   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1076   const PetscScalar *x,*xb;
1077   const MatScalar   *v;
1078   PetscErrorCode    ierr;
1079   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1080   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1081   PetscInt          nonzerorow=0;
1082 
1083   PetscFunctionBegin;
1084   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1085   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1086   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1087 
1088   v  = a->a;
1089   xb = x;
1090 
1091   for (i=0; i<mbs; i++) {
1092     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1093     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1094     ib          = aj + *ai;
1095     jmin        = 0;
1096     nonzerorow += (n>0);
1097     if (*ib == i) {     /* (diag of A)*x */
1098       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1099       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1100       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1101       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1102       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1103       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1104       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1105       v        += 49; jmin++;
1106     }
1107     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1108     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1109     for (j=jmin; j<n; j++) {
1110       /* (strict lower triangular part of A)*x  */
1111       cval       = ib[j]*7;
1112       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1113       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1114       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1115       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1116       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1117       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1118       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1119       /* (strict upper triangular part of A)*x  */
1120       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1121       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1122       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1123       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1124       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1125       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1126       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1127       v       += 49;
1128     }
1129     xb +=7; ai++;
1130   }
1131 
1132   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1133   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1134 
1135   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1136   PetscFunctionReturn(0);
1137 }
1138 
1139 #undef __FUNCT__
1140 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1141 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1142 {
1143   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1144   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1145   const PetscScalar *x,*x_ptr,*xb;
1146   const MatScalar   *v;
1147   PetscErrorCode    ierr;
1148   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1149   const PetscInt    *idx,*aj,*ii;
1150   PetscInt          nonzerorow=0;
1151 
1152   PetscFunctionBegin;
1153   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1154   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1155   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1156 
1157   aj = a->j;
1158   v  = a->a;
1159   ii = a->i;
1160 
1161   if (!a->mult_work) {
1162     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1163   }
1164   work = a->mult_work;
1165 
1166 
1167   for (i=0; i<mbs; i++) {
1168     n           = ii[1] - ii[0]; ncols = n*bs;
1169     workt       = work; idx=aj+ii[0];
1170     nonzerorow += (n>0);
1171 
1172     /* upper triangular part */
1173     for (j=0; j<n; j++) {
1174       xb = x_ptr + bs*(*idx++);
1175       for (k=0; k<bs; k++) workt[k] = xb[k];
1176       workt += bs;
1177     }
1178     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1179     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1180 
1181     /* strict lower triangular part */
1182     idx = aj+ii[0];
1183     if (*idx == i) {
1184       ncols -= bs; v += bs2; idx++; n--;
1185     }
1186     if (ncols > 0) {
1187       workt = work;
1188       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1189       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1190       for (j=0; j<n; j++) {
1191         zb = z_ptr + bs*(*idx++);
1192         for (k=0; k<bs; k++) zb[k] += workt[k];
1193         workt += bs;
1194       }
1195     }
1196 
1197     x += bs; v += n*bs2; z += bs; ii++;
1198   }
1199 
1200   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1201   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1202 
1203   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 #undef __FUNCT__
1208 #define __FUNCT__ "MatScale_SeqSBAIJ"
1209 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1210 {
1211   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1212   PetscScalar    oalpha = alpha;
1213   PetscErrorCode ierr;
1214   PetscBLASInt   one = 1,totalnz;
1215 
1216   PetscFunctionBegin;
1217   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1218   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1219   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1220   PetscFunctionReturn(0);
1221 }
1222 
1223 #undef __FUNCT__
1224 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1225 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1226 {
1227   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1228   const MatScalar *v       = a->a;
1229   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1230   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1231   PetscErrorCode  ierr;
1232   const PetscInt  *aj=a->j,*col;
1233 
1234   PetscFunctionBegin;
1235   if (type == NORM_FROBENIUS) {
1236     for (k=0; k<mbs; k++) {
1237       jmin = a->i[k]; jmax = a->i[k+1];
1238       col  = aj + jmin;
1239       if (*col == k) {         /* diagonal block */
1240         for (i=0; i<bs2; i++) {
1241           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242         }
1243         jmin++;
1244       }
1245       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1246         for (i=0; i<bs2; i++) {
1247           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1248         }
1249       }
1250     }
1251     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1252     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1253   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1254     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1255     for (i=0; i<mbs; i++) jl[i] = mbs;
1256     il[0] = 0;
1257 
1258     *norm = 0.0;
1259     for (k=0; k<mbs; k++) { /* k_th block row */
1260       for (j=0; j<bs; j++) sum[j]=0.0;
1261       /*-- col sum --*/
1262       i = jl[k]; /* first |A(i,k)| to be added */
1263       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1264                   at step k */
1265       while (i<mbs) {
1266         nexti = jl[i];  /* next block row to be added */
1267         ik    = il[i];  /* block index of A(i,k) in the array a */
1268         for (j=0; j<bs; j++) {
1269           v = a->a + ik*bs2 + j*bs;
1270           for (k1=0; k1<bs; k1++) {
1271             sum[j] += PetscAbsScalar(*v); v++;
1272           }
1273         }
1274         /* update il, jl */
1275         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1276         jmax = a->i[i+1];
1277         if (jmin < jmax) {
1278           il[i] = jmin;
1279           j     = a->j[jmin];
1280           jl[i] = jl[j]; jl[j]=i;
1281         }
1282         i = nexti;
1283       }
1284       /*-- row sum --*/
1285       jmin = a->i[k]; jmax = a->i[k+1];
1286       for (i=jmin; i<jmax; i++) {
1287         for (j=0; j<bs; j++) {
1288           v = a->a + i*bs2 + j;
1289           for (k1=0; k1<bs; k1++) {
1290             sum[j] += PetscAbsScalar(*v); v += bs;
1291           }
1292         }
1293       }
1294       /* add k_th block row to il, jl */
1295       col = aj+jmin;
1296       if (*col == k) jmin++;
1297       if (jmin < jmax) {
1298         il[k] = jmin;
1299         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1300       }
1301       for (j=0; j<bs; j++) {
1302         if (sum[j] > *norm) *norm = sum[j];
1303       }
1304     }
1305     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1306     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1307   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 #undef __FUNCT__
1312 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1313 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1314 {
1315   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1316   PetscErrorCode ierr;
1317 
1318   PetscFunctionBegin;
1319   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1320   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1321     *flg = PETSC_FALSE;
1322     PetscFunctionReturn(0);
1323   }
1324 
1325   /* if the a->i are the same */
1326   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1327   if (!*flg) PetscFunctionReturn(0);
1328 
1329   /* if a->j are the same */
1330   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1331   if (!*flg) PetscFunctionReturn(0);
1332 
1333   /* if a->a are the same */
1334   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 #undef __FUNCT__
1339 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1340 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1341 {
1342   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1343   PetscErrorCode  ierr;
1344   PetscInt        i,j,k,row,bs,ambs,bs2;
1345   const PetscInt  *ai,*aj;
1346   PetscScalar     *x,zero = 0.0;
1347   const MatScalar *aa,*aa_j;
1348 
1349   PetscFunctionBegin;
1350   bs = A->rmap->bs;
1351   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1352 
1353   aa   = a->a;
1354   ambs = a->mbs;
1355 
1356   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1357     PetscInt *diag=a->diag;
1358     aa   = a->a;
1359     ambs = a->mbs;
1360     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1361     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1362     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1363     PetscFunctionReturn(0);
1364   }
1365 
1366   ai   = a->i;
1367   aj   = a->j;
1368   bs2  = a->bs2;
1369   ierr = VecSet(v,zero);CHKERRQ(ierr);
1370   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1371   for (i=0; i<ambs; i++) {
1372     j=ai[i];
1373     if (aj[j] == i) {    /* if this is a diagonal element */
1374       row  = i*bs;
1375       aa_j = aa + j*bs2;
1376       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1377     }
1378   }
1379   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 #undef __FUNCT__
1384 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1385 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1386 {
1387   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1388   PetscScalar       x;
1389   const PetscScalar *l,*li,*ri;
1390   MatScalar         *aa,*v;
1391   PetscErrorCode    ierr;
1392   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1393   PetscBool         flg;
1394 
1395   PetscFunctionBegin;
1396   if (ll != rr) {
1397     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1398     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1399   }
1400   if (!ll) PetscFunctionReturn(0);
1401   ai  = a->i;
1402   aj  = a->j;
1403   aa  = a->a;
1404   m   = A->rmap->N;
1405   bs  = A->rmap->bs;
1406   mbs = a->mbs;
1407   bs2 = a->bs2;
1408 
1409   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1410   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1411   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1412   for (i=0; i<mbs; i++) { /* for each block row */
1413     M  = ai[i+1] - ai[i];
1414     li = l + i*bs;
1415     v  = aa + bs2*ai[i];
1416     for (j=0; j<M; j++) { /* for each block */
1417       ri = l + bs*aj[ai[i]+j];
1418       for (k=0; k<bs; k++) {
1419         x = ri[k];
1420         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1421       }
1422     }
1423   }
1424   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1425   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 #undef __FUNCT__
1430 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1431 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1432 {
1433   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1434 
1435   PetscFunctionBegin;
1436   info->block_size   = a->bs2;
1437   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1438   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1439   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1440   info->assemblies   = A->num_ass;
1441   info->mallocs      = A->info.mallocs;
1442   info->memory       = ((PetscObject)A)->mem;
1443   if (A->factortype) {
1444     info->fill_ratio_given  = A->info.fill_ratio_given;
1445     info->fill_ratio_needed = A->info.fill_ratio_needed;
1446     info->factor_mallocs    = A->info.factor_mallocs;
1447   } else {
1448     info->fill_ratio_given  = 0;
1449     info->fill_ratio_needed = 0;
1450     info->factor_mallocs    = 0;
1451   }
1452   PetscFunctionReturn(0);
1453 }
1454 
1455 
1456 #undef __FUNCT__
1457 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1458 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1459 {
1460   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1461   PetscErrorCode ierr;
1462 
1463   PetscFunctionBegin;
1464   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNCT__
1469 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1470 /*
1471    This code does not work since it only checks the upper triangular part of
1472   the matrix. Hence it is not listed in the function table.
1473 */
1474 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1475 {
1476   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1477   PetscErrorCode  ierr;
1478   PetscInt        i,j,n,row,col,bs,mbs;
1479   const PetscInt  *ai,*aj;
1480   PetscReal       atmp;
1481   const MatScalar *aa;
1482   PetscScalar     *x;
1483   PetscInt        ncols,brow,bcol,krow,kcol;
1484 
1485   PetscFunctionBegin;
1486   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1487   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1488   bs  = A->rmap->bs;
1489   aa  = a->a;
1490   ai  = a->i;
1491   aj  = a->j;
1492   mbs = a->mbs;
1493 
1494   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1495   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1496   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1497   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1498   for (i=0; i<mbs; i++) {
1499     ncols = ai[1] - ai[0]; ai++;
1500     brow  = bs*i;
1501     for (j=0; j<ncols; j++) {
1502       bcol = bs*(*aj);
1503       for (kcol=0; kcol<bs; kcol++) {
1504         col = bcol + kcol;      /* col index */
1505         for (krow=0; krow<bs; krow++) {
1506           atmp = PetscAbsScalar(*aa); aa++;
1507           row  = brow + krow;   /* row index */
1508           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1509           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1510         }
1511       }
1512       aj++;
1513     }
1514   }
1515   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1516   PetscFunctionReturn(0);
1517 }
1518