xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 1873fc9fedfc3bd8fc2f82ccd29ab8b44a80f1db)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14   const PetscInt *idx;
15   PetscBT        table_out,table_in;
16 
17   PetscFunctionBegin;
18   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19   mbs  = a->mbs;
20   ai   = a->i;
21   aj   = a->j;
22   bs   = A->rmap->bs;
23   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
26   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) { /* for each is */
29     isz  = 0;
30     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31 
32     /* Extract the indices, assume there can be duplicate entries */
33     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35 
36     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37     bcol_max = 0;
38     for (j=0; j<n; ++j) {
39       brow = idx[j]/bs; /* convert the indices into block indices */
40       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table_out,brow)) {
42         nidx[isz++] = brow;
43         if (bcol_max < brow) bcol_max = brow;
44       }
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap */
51       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++) {
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end; l++) {
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table_out,bcol)) {
62               nidx[isz++] = bcol;
63               if (bcol_max < bcol) bcol_max = bcol;
64             }
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l<end; l++) {
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table_in,bcol)) {
73               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j=0; j<isz; j++) {
83       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95         Zero some ops' to avoid invalid usse */
96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97 {
98   PetscErrorCode ierr;
99 
100   PetscFunctionBegin;
101   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102   Bseq->ops->mult                   = 0;
103   Bseq->ops->multadd                = 0;
104   Bseq->ops->multtranspose          = 0;
105   Bseq->ops->multtransposeadd       = 0;
106   Bseq->ops->lufactor               = 0;
107   Bseq->ops->choleskyfactor         = 0;
108   Bseq->ops->lufactorsymbolic       = 0;
109   Bseq->ops->choleskyfactorsymbolic = 0;
110   Bseq->ops->getinertia             = 0;
111   PetscFunctionReturn(0);
112 }
113 
114 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
115 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116 {
117   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118   PetscErrorCode ierr;
119   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121   const PetscInt *irow,*icol;
122   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123   PetscInt       *aj = a->j,*ai = a->i;
124   MatScalar      *mat_a;
125   Mat            C;
126   PetscBool      flag;
127 
128   PetscFunctionBegin;
129 
130   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134 
135   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136   ssmap = smap;
137   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139   /* determine lens of each row */
140   for (i=0; i<nrows; i++) {
141     kstart  = ai[irow[i]];
142     kend    = kstart + a->ilen[irow[i]];
143     lens[i] = 0;
144     for (k=kstart; k<kend; k++) {
145       if (ssmap[aj[k]]) lens[i]++;
146     }
147   }
148   /* Create and fill new matrix */
149   if (scall == MAT_REUSE_MATRIX) {
150     c = (Mat_SeqSBAIJ*)((*B)->data);
151 
152     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156     C    = *B;
157   } else {
158     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162   }
163   c = (Mat_SeqSBAIJ*)(C->data);
164   for (i=0; i<nrows; i++) {
165     row      = irow[i];
166     kstart   = ai[row];
167     kend     = kstart + a->ilen[row];
168     mat_i    = c->i[i];
169     mat_j    = c->j + mat_i;
170     mat_a    = c->a + mat_i*bs2;
171     mat_ilen = c->ilen + i;
172     for (k=kstart; k<kend; k++) {
173       if ((tcol=ssmap[a->j[k]])) {
174         *mat_j++ = tcol - 1;
175         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176         mat_a   += bs2;
177         (*mat_ilen)++;
178       }
179     }
180   }
181   /* sort */
182   {
183     MatScalar *work;
184 
185     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186     for (i=0; i<nrows; i++) {
187       PetscInt ilen;
188       mat_i = c->i[i];
189       mat_j = c->j + mat_i;
190       mat_a = c->a + mat_i*bs2;
191       ilen  = c->ilen[i];
192       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193     }
194     ierr = PetscFree(work);CHKERRQ(ierr);
195   }
196 
197   /* Free work space */
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree(smap);CHKERRQ(ierr);
200   ierr = PetscFree(lens);CHKERRQ(ierr);
201   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203 
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   *B   = C;
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210 {
211   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212   IS             is1,is2;
213   PetscErrorCode ierr;
214   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215   const PetscInt *irow,*icol;
216 
217   PetscFunctionBegin;
218   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222 
223   /* Verify if the indices corespond to each element in a block
224    and form the IS with compressed IS */
225   maxmnbs = PetscMax(a->mbs,a->nbs);
226   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229   for (i=0; i<a->mbs; i++) {
230     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231   }
232   count = 0;
233   for (i=0; i<nrows; i++) {
234     PetscInt j = irow[i] / bs;
235     if ((vary[j]--)==bs) iary[count++] = j;
236   }
237   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238 
239   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241   for (i=0; i<a->nbs; i++) {
242     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243   }
244   count = 0;
245   for (i=0; i<ncols; i++) {
246     PetscInt j = icol[i] / bs;
247     if ((vary[j]--)==bs) iary[count++] = j;
248   }
249   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253 
254   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257 
258   if (isrow != iscol) {
259     PetscBool isequal;
260     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261     if (!isequal) {
262       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
263     }
264   }
265   PetscFunctionReturn(0);
266 }
267 
268 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269 {
270   PetscErrorCode ierr;
271   PetscInt       i;
272 
273   PetscFunctionBegin;
274   if (scall == MAT_INITIAL_MATRIX) {
275     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276   }
277 
278   for (i=0; i<n; i++) {
279     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
280   }
281   PetscFunctionReturn(0);
282 }
283 
284 /* -------------------------------------------------------*/
285 /* Should check that shapes of vectors and matrices match */
286 /* -------------------------------------------------------*/
287 
288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291   PetscScalar       *z,x1,x2,zero=0.0;
292   const PetscScalar *x,*xb;
293   const MatScalar   *v;
294   PetscErrorCode    ierr;
295   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296   const PetscInt    *aj=a->j,*ai=a->i,*ib;
297   PetscInt          nonzerorow=0;
298 
299   PetscFunctionBegin;
300   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301   if (!a->nz) PetscFunctionReturn(0);
302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v  = a->a;
306   xb = x;
307 
308   for (i=0; i<mbs; i++) {
309     n           = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1          = xb[0]; x2 = xb[1];
311     ib          = aj + *ai;
312     jmin        = 0;
313     nonzerorow += (n>0);
314     if (*ib == i) {     /* (diag of A)*x */
315       z[2*i]   += v[0]*x1 + v[2]*x2;
316       z[2*i+1] += v[2]*x1 + v[3]*x2;
317       v        += 4; jmin++;
318     }
319     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321     for (j=jmin; j<n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval       = ib[j]*2;
324       z[cval]   += v[0]*x1 + v[1]*x2;
325       z[cval+1] += v[2]*x1 + v[3]*x2;
326       /* (strict upper triangular part of A)*x  */
327       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
328       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329       v        += 4;
330     }
331     xb +=2; ai++;
332   }
333 
334   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341 {
342   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343   PetscScalar       *z,x1,x2,x3,zero=0.0;
344   const PetscScalar *x,*xb;
345   const MatScalar   *v;
346   PetscErrorCode    ierr;
347   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348   const PetscInt    *aj = a->j,*ai = a->i,*ib;
349   PetscInt          nonzerorow=0;
350 
351   PetscFunctionBegin;
352   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353   if (!a->nz) PetscFunctionReturn(0);
354   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
355   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
356 
357   v  = a->a;
358   xb = x;
359 
360   for (i=0; i<mbs; i++) {
361     n           = ai[1] - ai[0]; /* length of i_th block row of A */
362     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
363     ib          = aj + *ai;
364     jmin        = 0;
365     nonzerorow += (n>0);
366     if (*ib == i) {     /* (diag of A)*x */
367       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370       v        += 9; jmin++;
371     }
372     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374     for (j=jmin; j<n; j++) {
375       /* (strict lower triangular part of A)*x  */
376       cval       = ib[j]*3;
377       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
378       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380       /* (strict upper triangular part of A)*x  */
381       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384       v        += 9;
385     }
386     xb +=3; ai++;
387   }
388 
389   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
392   PetscFunctionReturn(0);
393 }
394 
395 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396 {
397   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399   const PetscScalar *x,*xb;
400   const MatScalar   *v;
401   PetscErrorCode    ierr;
402   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403   const PetscInt    *aj = a->j,*ai = a->i,*ib;
404   PetscInt          nonzerorow = 0;
405 
406   PetscFunctionBegin;
407   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408   if (!a->nz) PetscFunctionReturn(0);
409   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v  = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418     ib          = aj + *ai;
419     jmin        = 0;
420     nonzerorow += (n>0);
421     if (*ib == i) {     /* (diag of A)*x */
422       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
423       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
424       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426       v        += 16; jmin++;
427     }
428     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430     for (j=jmin; j<n; j++) {
431       /* (strict lower triangular part of A)*x  */
432       cval       = ib[j]*4;
433       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437       /* (strict upper triangular part of A)*x  */
438       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442       v        += 16;
443     }
444     xb +=4; ai++;
445   }
446 
447   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
448   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
450   PetscFunctionReturn(0);
451 }
452 
453 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454 {
455   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457   const PetscScalar *x,*xb;
458   const MatScalar   *v;
459   PetscErrorCode    ierr;
460   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461   const PetscInt    *aj = a->j,*ai = a->i,*ib;
462   PetscInt          nonzerorow=0;
463 
464   PetscFunctionBegin;
465   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466   if (!a->nz) PetscFunctionReturn(0);
467   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
469 
470   v  = a->a;
471   xb = x;
472 
473   for (i=0; i<mbs; i++) {
474     n           = ai[1] - ai[0]; /* length of i_th block row of A */
475     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476     ib          = aj + *ai;
477     jmin        = 0;
478     nonzerorow += (n>0);
479     if (*ib == i) {      /* (diag of A)*x */
480       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485       v        += 25; jmin++;
486     }
487     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489     for (j=jmin; j<n; j++) {
490       /* (strict lower triangular part of A)*x  */
491       cval       = ib[j]*5;
492       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497       /* (strict upper triangular part of A)*x  */
498       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503       v        += 25;
504     }
505     xb +=5; ai++;
506   }
507 
508   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
511   PetscFunctionReturn(0);
512 }
513 
514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523   PetscInt          nonzerorow=0;
524 
525   PetscFunctionBegin;
526   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527   if (!a->nz) PetscFunctionReturn(0);
528   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530 
531   v  = a->a;
532   xb = x;
533 
534   for (i=0; i<mbs; i++) {
535     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537     ib          = aj + *ai;
538     jmin        = 0;
539     nonzerorow += (n>0);
540     if (*ib == i) {      /* (diag of A)*x */
541       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547       v        += 36; jmin++;
548     }
549     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551     for (j=jmin; j<n; j++) {
552       /* (strict lower triangular part of A)*x  */
553       cval       = ib[j]*6;
554       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560       /* (strict upper triangular part of A)*x  */
561       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567       v        += 36;
568     }
569     xb +=6; ai++;
570   }
571 
572   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 
578 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582   const PetscScalar *x,*xb;
583   const MatScalar   *v;
584   PetscErrorCode    ierr;
585   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586   const PetscInt    *aj=a->j,*ai=a->i,*ib;
587   PetscInt          nonzerorow=0;
588 
589   PetscFunctionBegin;
590   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591   if (!a->nz) PetscFunctionReturn(0);
592   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
593   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
594 
595   v  = a->a;
596   xb = x;
597 
598   for (i=0; i<mbs; i++) {
599     n           = ai[1] - ai[0]; /* length of i_th block row of A */
600     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601     ib          = aj + *ai;
602     jmin        = 0;
603     nonzerorow += (n>0);
604     if (*ib == i) {      /* (diag of A)*x */
605       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612       v        += 49; jmin++;
613     }
614     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616     for (j=jmin; j<n; j++) {
617       /* (strict lower triangular part of A)*x  */
618       cval       = ib[j]*7;
619       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626       /* (strict upper triangular part of A)*x  */
627       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634       v       += 49;
635     }
636     xb +=7; ai++;
637   }
638   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
639   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
641   PetscFunctionReturn(0);
642 }
643 
644 /*
645     This will not work with MatScalar == float because it calls the BLAS
646 */
647 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648 {
649   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651   const PetscScalar *x,*x_ptr,*xb;
652   const MatScalar   *v;
653   PetscErrorCode    ierr;
654   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655   const PetscInt    *idx,*aj,*ii;
656   PetscInt          nonzerorow=0;
657 
658   PetscFunctionBegin;
659   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660   if (!a->nz) PetscFunctionReturn(0);
661   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
662   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
663 
664   aj = a->j;
665   v  = a->a;
666   ii = a->i;
667 
668   if (!a->mult_work) {
669     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
670   }
671   work = a->mult_work;
672 
673   for (i=0; i<mbs; i++) {
674     n           = ii[1] - ii[0]; ncols = n*bs;
675     workt       = work; idx=aj+ii[0];
676     nonzerorow += (n>0);
677 
678     /* upper triangular part */
679     for (j=0; j<n; j++) {
680       xb = x_ptr + bs*(*idx++);
681       for (k=0; k<bs; k++) workt[k] = xb[k];
682       workt += bs;
683     }
684     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
685     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
686 
687     /* strict lower triangular part */
688     idx = aj+ii[0];
689     if (*idx == i) {
690       ncols -= bs; v += bs2; idx++; n--;
691     }
692 
693     if (ncols > 0) {
694       workt = work;
695       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
696       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
697       for (j=0; j<n; j++) {
698         zb = z_ptr + bs*(*idx++);
699         for (k=0; k<bs; k++) zb[k] += workt[k];
700         workt += bs;
701       }
702     }
703     x += bs; v += n*bs2; z += bs; ii++;
704   }
705 
706   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
707   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
708   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
713 {
714   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
715   PetscScalar       *z,x1;
716   const PetscScalar *x,*xb;
717   const MatScalar   *v;
718   PetscErrorCode    ierr;
719   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
720   const PetscInt    *aj=a->j,*ai=a->i,*ib;
721   PetscInt          nonzerorow=0;
722 
723   PetscFunctionBegin;
724   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
725   if (!a->nz) PetscFunctionReturn(0);
726   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
727   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
728   v    = a->a;
729   xb   = x;
730 
731   for (i=0; i<mbs; i++) {
732     n           = ai[1] - ai[0]; /* length of i_th row of A */
733     x1          = xb[0];
734     ib          = aj + *ai;
735     jmin        = 0;
736     nonzerorow += (n>0);
737     if (*ib == i) {            /* (diag of A)*x */
738       z[i] += *v++ * x[*ib++]; jmin++;
739     }
740     for (j=jmin; j<n; j++) {
741       cval    = *ib;
742       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
743       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
744     }
745     xb++; ai++;
746   }
747 
748   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
749   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
750 
751   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
752   PetscFunctionReturn(0);
753 }
754 
755 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
756 {
757   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
758   PetscScalar       *z,x1,x2;
759   const PetscScalar *x,*xb;
760   const MatScalar   *v;
761   PetscErrorCode    ierr;
762   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
763   const PetscInt    *aj=a->j,*ai=a->i,*ib;
764   PetscInt          nonzerorow=0;
765 
766   PetscFunctionBegin;
767   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
768   if (!a->nz) PetscFunctionReturn(0);
769   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
770   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
771 
772   v  = a->a;
773   xb = x;
774 
775   for (i=0; i<mbs; i++) {
776     n           = ai[1] - ai[0]; /* length of i_th block row of A */
777     x1          = xb[0]; x2 = xb[1];
778     ib          = aj + *ai;
779     jmin        = 0;
780     nonzerorow += (n>0);
781     if (*ib == i) {      /* (diag of A)*x */
782       z[2*i]   += v[0]*x1 + v[2]*x2;
783       z[2*i+1] += v[2]*x1 + v[3]*x2;
784       v        += 4; jmin++;
785     }
786     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
787     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
788     for (j=jmin; j<n; j++) {
789       /* (strict lower triangular part of A)*x  */
790       cval       = ib[j]*2;
791       z[cval]   += v[0]*x1 + v[1]*x2;
792       z[cval+1] += v[2]*x1 + v[3]*x2;
793       /* (strict upper triangular part of A)*x  */
794       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
795       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
796       v        += 4;
797     }
798     xb +=2; ai++;
799   }
800   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
801   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
802 
803   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
804   PetscFunctionReturn(0);
805 }
806 
807 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
808 {
809   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
810   PetscScalar       *z,x1,x2,x3;
811   const PetscScalar *x,*xb;
812   const MatScalar   *v;
813   PetscErrorCode    ierr;
814   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
815   const PetscInt    *aj=a->j,*ai=a->i,*ib;
816   PetscInt          nonzerorow=0;
817 
818   PetscFunctionBegin;
819   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
820   if (!a->nz) PetscFunctionReturn(0);
821   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
822   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
823 
824   v  = a->a;
825   xb = x;
826 
827   for (i=0; i<mbs; i++) {
828     n           = ai[1] - ai[0]; /* length of i_th block row of A */
829     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
830     ib          = aj + *ai;
831     jmin        = 0;
832     nonzerorow += (n>0);
833     if (*ib == i) {     /* (diag of A)*x */
834       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
835       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
836       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
837       v        += 9; jmin++;
838     }
839     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
841     for (j=jmin; j<n; j++) {
842       /* (strict lower triangular part of A)*x  */
843       cval       = ib[j]*3;
844       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
845       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
846       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
847       /* (strict upper triangular part of A)*x  */
848       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
849       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
850       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
851       v        += 9;
852     }
853     xb +=3; ai++;
854   }
855 
856   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
857   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
858 
859   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
863 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
864 {
865   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
866   PetscScalar       *z,x1,x2,x3,x4;
867   const PetscScalar *x,*xb;
868   const MatScalar   *v;
869   PetscErrorCode    ierr;
870   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
871   const PetscInt    *aj=a->j,*ai=a->i,*ib;
872   PetscInt          nonzerorow=0;
873 
874   PetscFunctionBegin;
875   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
876   if (!a->nz) PetscFunctionReturn(0);
877   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
878   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
879 
880   v  = a->a;
881   xb = x;
882 
883   for (i=0; i<mbs; i++) {
884     n           = ai[1] - ai[0]; /* length of i_th block row of A */
885     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
886     ib          = aj + *ai;
887     jmin        = 0;
888     nonzerorow += (n>0);
889     if (*ib == i) {      /* (diag of A)*x */
890       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
891       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
892       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
893       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
894       v        += 16; jmin++;
895     }
896     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
897     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
898     for (j=jmin; j<n; j++) {
899       /* (strict lower triangular part of A)*x  */
900       cval       = ib[j]*4;
901       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
902       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
903       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
904       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
905       /* (strict upper triangular part of A)*x  */
906       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
907       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
908       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
909       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
910       v        += 16;
911     }
912     xb +=4; ai++;
913   }
914 
915   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
916   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
917 
918   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
923 {
924   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
925   PetscScalar       *z,x1,x2,x3,x4,x5;
926   const PetscScalar *x,*xb;
927   const MatScalar   *v;
928   PetscErrorCode    ierr;
929   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
930   const PetscInt    *aj=a->j,*ai=a->i,*ib;
931   PetscInt          nonzerorow=0;
932 
933   PetscFunctionBegin;
934   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
935   if (!a->nz) PetscFunctionReturn(0);
936   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
937   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
938 
939   v  = a->a;
940   xb = x;
941 
942   for (i=0; i<mbs; i++) {
943     n           = ai[1] - ai[0]; /* length of i_th block row of A */
944     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
945     ib          = aj + *ai;
946     jmin        = 0;
947     nonzerorow += (n>0);
948     if (*ib == i) {      /* (diag of A)*x */
949       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
950       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
951       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
952       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
953       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
954       v        += 25; jmin++;
955     }
956     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
957     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
958     for (j=jmin; j<n; j++) {
959       /* (strict lower triangular part of A)*x  */
960       cval       = ib[j]*5;
961       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
962       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
963       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
964       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
965       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
966       /* (strict upper triangular part of A)*x  */
967       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
968       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
969       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
970       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
971       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
972       v        += 25;
973     }
974     xb +=5; ai++;
975   }
976 
977   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
978   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
979 
980   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
981   PetscFunctionReturn(0);
982 }
983 
984 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
985 {
986   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
987   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
988   const PetscScalar *x,*xb;
989   const MatScalar   *v;
990   PetscErrorCode    ierr;
991   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
992   const PetscInt    *aj=a->j,*ai=a->i,*ib;
993   PetscInt          nonzerorow=0;
994 
995   PetscFunctionBegin;
996   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
997   if (!a->nz) PetscFunctionReturn(0);
998   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
999   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1000 
1001   v  = a->a;
1002   xb = x;
1003 
1004   for (i=0; i<mbs; i++) {
1005     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1006     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1007     ib          = aj + *ai;
1008     jmin        = 0;
1009     nonzerorow += (n>0);
1010     if (*ib == i) {     /* (diag of A)*x */
1011       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1012       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1013       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1014       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1015       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1016       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1017       v        += 36; jmin++;
1018     }
1019     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1020     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021     for (j=jmin; j<n; j++) {
1022       /* (strict lower triangular part of A)*x  */
1023       cval       = ib[j]*6;
1024       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1025       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1026       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1027       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1028       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1029       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1030       /* (strict upper triangular part of A)*x  */
1031       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1032       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1033       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1034       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1035       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1036       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1037       v        += 36;
1038     }
1039     xb +=6; ai++;
1040   }
1041 
1042   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1043   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1044 
1045   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1046   PetscFunctionReturn(0);
1047 }
1048 
1049 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1050 {
1051   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1052   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1053   const PetscScalar *x,*xb;
1054   const MatScalar   *v;
1055   PetscErrorCode    ierr;
1056   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1057   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1058   PetscInt          nonzerorow=0;
1059 
1060   PetscFunctionBegin;
1061   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1062   if (!a->nz) PetscFunctionReturn(0);
1063   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1064   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1065 
1066   v  = a->a;
1067   xb = x;
1068 
1069   for (i=0; i<mbs; i++) {
1070     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1071     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1072     ib          = aj + *ai;
1073     jmin        = 0;
1074     nonzerorow += (n>0);
1075     if (*ib == i) {     /* (diag of A)*x */
1076       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1077       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1078       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1079       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1080       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1081       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1082       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1083       v        += 49; jmin++;
1084     }
1085     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1086     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1087     for (j=jmin; j<n; j++) {
1088       /* (strict lower triangular part of A)*x  */
1089       cval       = ib[j]*7;
1090       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1091       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1092       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1093       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1094       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1095       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1096       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1097       /* (strict upper triangular part of A)*x  */
1098       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1099       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1100       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1101       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1102       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1103       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1104       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1105       v       += 49;
1106     }
1107     xb +=7; ai++;
1108   }
1109 
1110   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1112 
1113   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1114   PetscFunctionReturn(0);
1115 }
1116 
1117 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1118 {
1119   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1120   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1121   const PetscScalar *x,*x_ptr,*xb;
1122   const MatScalar   *v;
1123   PetscErrorCode    ierr;
1124   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1125   const PetscInt    *idx,*aj,*ii;
1126   PetscInt          nonzerorow=0;
1127 
1128   PetscFunctionBegin;
1129   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1130   if (!a->nz) PetscFunctionReturn(0);
1131   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1132   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1133 
1134   aj = a->j;
1135   v  = a->a;
1136   ii = a->i;
1137 
1138   if (!a->mult_work) {
1139     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1140   }
1141   work = a->mult_work;
1142 
1143 
1144   for (i=0; i<mbs; i++) {
1145     n           = ii[1] - ii[0]; ncols = n*bs;
1146     workt       = work; idx=aj+ii[0];
1147     nonzerorow += (n>0);
1148 
1149     /* upper triangular part */
1150     for (j=0; j<n; j++) {
1151       xb = x_ptr + bs*(*idx++);
1152       for (k=0; k<bs; k++) workt[k] = xb[k];
1153       workt += bs;
1154     }
1155     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1156     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1157 
1158     /* strict lower triangular part */
1159     idx = aj+ii[0];
1160     if (*idx == i) {
1161       ncols -= bs; v += bs2; idx++; n--;
1162     }
1163     if (ncols > 0) {
1164       workt = work;
1165       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
1166       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1167       for (j=0; j<n; j++) {
1168         zb = z_ptr + bs*(*idx++);
1169         for (k=0; k<bs; k++) zb[k] += workt[k];
1170         workt += bs;
1171       }
1172     }
1173 
1174     x += bs; v += n*bs2; z += bs; ii++;
1175   }
1176 
1177   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1178   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1179 
1180   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1185 {
1186   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1187   PetscScalar    oalpha = alpha;
1188   PetscErrorCode ierr;
1189   PetscBLASInt   one = 1,totalnz;
1190 
1191   PetscFunctionBegin;
1192   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1193   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1194   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1199 {
1200   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1201   const MatScalar *v       = a->a;
1202   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1203   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1204   PetscErrorCode  ierr;
1205   const PetscInt  *aj=a->j,*col;
1206 
1207   PetscFunctionBegin;
1208   if (!a->nz) {
1209     *norm = 0.0;
1210     PetscFunctionReturn(0);
1211   }
1212   if (type == NORM_FROBENIUS) {
1213     for (k=0; k<mbs; k++) {
1214       jmin = a->i[k]; jmax = a->i[k+1];
1215       col  = aj + jmin;
1216       if (*col == k) {         /* diagonal block */
1217         for (i=0; i<bs2; i++) {
1218           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1219         }
1220         jmin++;
1221       }
1222       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1223         for (i=0; i<bs2; i++) {
1224           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1225         }
1226       }
1227     }
1228     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1229     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1230   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1231     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1232     for (i=0; i<mbs; i++) jl[i] = mbs;
1233     il[0] = 0;
1234 
1235     *norm = 0.0;
1236     for (k=0; k<mbs; k++) { /* k_th block row */
1237       for (j=0; j<bs; j++) sum[j]=0.0;
1238       /*-- col sum --*/
1239       i = jl[k]; /* first |A(i,k)| to be added */
1240       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1241                   at step k */
1242       while (i<mbs) {
1243         nexti = jl[i];  /* next block row to be added */
1244         ik    = il[i];  /* block index of A(i,k) in the array a */
1245         for (j=0; j<bs; j++) {
1246           v = a->a + ik*bs2 + j*bs;
1247           for (k1=0; k1<bs; k1++) {
1248             sum[j] += PetscAbsScalar(*v); v++;
1249           }
1250         }
1251         /* update il, jl */
1252         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1253         jmax = a->i[i+1];
1254         if (jmin < jmax) {
1255           il[i] = jmin;
1256           j     = a->j[jmin];
1257           jl[i] = jl[j]; jl[j]=i;
1258         }
1259         i = nexti;
1260       }
1261       /*-- row sum --*/
1262       jmin = a->i[k]; jmax = a->i[k+1];
1263       for (i=jmin; i<jmax; i++) {
1264         for (j=0; j<bs; j++) {
1265           v = a->a + i*bs2 + j;
1266           for (k1=0; k1<bs; k1++) {
1267             sum[j] += PetscAbsScalar(*v); v += bs;
1268           }
1269         }
1270       }
1271       /* add k_th block row to il, jl */
1272       col = aj+jmin;
1273       if (*col == k) jmin++;
1274       if (jmin < jmax) {
1275         il[k] = jmin;
1276         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1277       }
1278       for (j=0; j<bs; j++) {
1279         if (sum[j] > *norm) *norm = sum[j];
1280       }
1281     }
1282     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1283     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1284   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1285   PetscFunctionReturn(0);
1286 }
1287 
1288 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1289 {
1290   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1291   PetscErrorCode ierr;
1292 
1293   PetscFunctionBegin;
1294   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1295   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1296     *flg = PETSC_FALSE;
1297     PetscFunctionReturn(0);
1298   }
1299 
1300   /* if the a->i are the same */
1301   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
1302   if (!*flg) PetscFunctionReturn(0);
1303 
1304   /* if a->j are the same */
1305   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
1306   if (!*flg) PetscFunctionReturn(0);
1307 
1308   /* if a->a are the same */
1309   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1310   PetscFunctionReturn(0);
1311 }
1312 
1313 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1314 {
1315   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1316   PetscErrorCode  ierr;
1317   PetscInt        i,j,k,row,bs,ambs,bs2;
1318   const PetscInt  *ai,*aj;
1319   PetscScalar     *x,zero = 0.0;
1320   const MatScalar *aa,*aa_j;
1321 
1322   PetscFunctionBegin;
1323   bs = A->rmap->bs;
1324   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1325 
1326   aa   = a->a;
1327   ambs = a->mbs;
1328 
1329   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1330     PetscInt *diag=a->diag;
1331     aa   = a->a;
1332     ambs = a->mbs;
1333     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1334     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1335     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1336     PetscFunctionReturn(0);
1337   }
1338 
1339   ai   = a->i;
1340   aj   = a->j;
1341   bs2  = a->bs2;
1342   ierr = VecSet(v,zero);CHKERRQ(ierr);
1343   if (!a->nz) PetscFunctionReturn(0);
1344   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1345   for (i=0; i<ambs; i++) {
1346     j=ai[i];
1347     if (aj[j] == i) {    /* if this is a diagonal element */
1348       row  = i*bs;
1349       aa_j = aa + j*bs2;
1350       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1351     }
1352   }
1353   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1358 {
1359   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1360   PetscScalar       x;
1361   const PetscScalar *l,*li,*ri;
1362   MatScalar         *aa,*v;
1363   PetscErrorCode    ierr;
1364   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1365   const PetscInt    *ai,*aj;
1366   PetscBool         flg;
1367 
1368   PetscFunctionBegin;
1369   if (ll != rr) {
1370     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1371     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1372   }
1373   if (!ll) PetscFunctionReturn(0);
1374   ai  = a->i;
1375   aj  = a->j;
1376   aa  = a->a;
1377   m   = A->rmap->N;
1378   bs  = A->rmap->bs;
1379   mbs = a->mbs;
1380   bs2 = a->bs2;
1381 
1382   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1383   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1384   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1385   for (i=0; i<mbs; i++) { /* for each block row */
1386     M  = ai[i+1] - ai[i];
1387     li = l + i*bs;
1388     v  = aa + bs2*ai[i];
1389     for (j=0; j<M; j++) { /* for each block */
1390       ri = l + bs*aj[ai[i]+j];
1391       for (k=0; k<bs; k++) {
1392         x = ri[k];
1393         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1394       }
1395     }
1396   }
1397   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1398   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1399   PetscFunctionReturn(0);
1400 }
1401 
1402 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1403 {
1404   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1405 
1406   PetscFunctionBegin;
1407   info->block_size   = a->bs2;
1408   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1409   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1410   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1411   info->assemblies   = A->num_ass;
1412   info->mallocs      = A->info.mallocs;
1413   info->memory       = ((PetscObject)A)->mem;
1414   if (A->factortype) {
1415     info->fill_ratio_given  = A->info.fill_ratio_given;
1416     info->fill_ratio_needed = A->info.fill_ratio_needed;
1417     info->factor_mallocs    = A->info.factor_mallocs;
1418   } else {
1419     info->fill_ratio_given  = 0;
1420     info->fill_ratio_needed = 0;
1421     info->factor_mallocs    = 0;
1422   }
1423   PetscFunctionReturn(0);
1424 }
1425 
1426 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1427 {
1428   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1429   PetscErrorCode ierr;
1430 
1431   PetscFunctionBegin;
1432   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 /*
1437    This code does not work since it only checks the upper triangular part of
1438   the matrix. Hence it is not listed in the function table.
1439 */
1440 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1441 {
1442   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1443   PetscErrorCode  ierr;
1444   PetscInt        i,j,n,row,col,bs,mbs;
1445   const PetscInt  *ai,*aj;
1446   PetscReal       atmp;
1447   const MatScalar *aa;
1448   PetscScalar     *x;
1449   PetscInt        ncols,brow,bcol,krow,kcol;
1450 
1451   PetscFunctionBegin;
1452   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1453   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1454   bs  = A->rmap->bs;
1455   aa  = a->a;
1456   ai  = a->i;
1457   aj  = a->j;
1458   mbs = a->mbs;
1459 
1460   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1461   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1462   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1463   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1464   for (i=0; i<mbs; i++) {
1465     ncols = ai[1] - ai[0]; ai++;
1466     brow  = bs*i;
1467     for (j=0; j<ncols; j++) {
1468       bcol = bs*(*aj);
1469       for (kcol=0; kcol<bs; kcol++) {
1470         col = bcol + kcol;      /* col index */
1471         for (krow=0; krow<bs; krow++) {
1472           atmp = PetscAbsScalar(*aa); aa++;
1473           row  = brow + krow;   /* row index */
1474           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1475           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1476         }
1477       }
1478       aj++;
1479     }
1480   }
1481   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1482   PetscFunctionReturn(0);
1483 }
1484 
1485 PETSC_INTERN PetscErrorCode MatMatMult_SeqSBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1486 {
1487   PetscErrorCode ierr;
1488 
1489   PetscFunctionBegin;
1490   if (scall == MAT_INITIAL_MATRIX) {
1491     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1492     ierr = MatMatMultSymbolic_SeqSBAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1493     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1494   }
1495   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1496   ierr = MatMatMultNumeric_SeqSBAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1497   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1502 {
1503   PetscErrorCode ierr;
1504 
1505   PetscFunctionBegin;
1506   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1507 
1508   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1509   PetscFunctionReturn(0);
1510 }
1511 
1512 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1513 {
1514   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1515   PetscScalar       *z = c;
1516   const PetscScalar *xb;
1517   PetscScalar       x1;
1518   const MatScalar   *v = a->a,*vv;
1519   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1520 
1521   PetscFunctionBegin;
1522   for (i=0; i<mbs; i++) {
1523     n = ii[1] - ii[0]; ii++;
1524     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1525     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1526     jj = idx;
1527     vv = v;
1528     for (k=0; k<cn; k++) {
1529       idx = jj;
1530       v = vv;
1531       for (j=0; j<n; j++) {
1532         xb = b + (*idx); x1 = xb[0+k*bm];
1533         z[0+k*cm] += v[0]*x1;
1534         if (*idx != i) c[(*idx)+k*cm] += v[0]*b[i+k*bm];
1535         v += 1;
1536         ++idx;
1537       }
1538     }
1539     z += 1;
1540   }
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1545 {
1546   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1547   PetscScalar       *z = c;
1548   const PetscScalar *xb;
1549   PetscScalar       x1,x2;
1550   const MatScalar   *v = a->a,*vv;
1551   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1552 
1553   PetscFunctionBegin;
1554   for (i=0; i<mbs; i++) {
1555     n = ii[1] - ii[0]; ii++;
1556     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1557     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1558     jj = idx;
1559     vv = v;
1560     for (k=0; k<cn; k++) {
1561       idx = jj;
1562       v = vv;
1563       for (j=0; j<n; j++) {
1564         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1565         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1566         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1567         if (*idx != i) {
1568           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1569           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1570         }
1571         v += 4;
1572         ++idx;
1573       }
1574     }
1575     z += 2;
1576   }
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1581 {
1582   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1583   PetscScalar       *z = c;
1584   const PetscScalar *xb;
1585   PetscScalar       x1,x2,x3;
1586   const MatScalar   *v = a->a,*vv;
1587   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1588 
1589   PetscFunctionBegin;
1590   for (i=0; i<mbs; i++) {
1591     n = ii[1] - ii[0]; ii++;
1592     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1593     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1594     jj = idx;
1595     vv = v;
1596     for (k=0; k<cn; k++) {
1597       idx = jj;
1598       v = vv;
1599       for (j=0; j<n; j++) {
1600         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1601         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1602         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1603         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1604         if (*idx != i) {
1605           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1606           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1607           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1608         }
1609         v += 9;
1610         ++idx;
1611       }
1612     }
1613     z += 3;
1614   }
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1619 {
1620   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1621   PetscScalar       *z = c;
1622   const PetscScalar *xb;
1623   PetscScalar       x1,x2,x3,x4;
1624   const MatScalar   *v = a->a,*vv;
1625   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1626 
1627   PetscFunctionBegin;
1628   for (i=0; i<mbs; i++) {
1629     n = ii[1] - ii[0]; ii++;
1630     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1631     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1632     jj = idx;
1633     vv = v;
1634     for (k=0; k<cn; k++) {
1635       idx = jj;
1636       v = vv;
1637       for (j=0; j<n; j++) {
1638         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1639         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1640         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1641         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1642         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1643         if (*idx != i) {
1644           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1645           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1646           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1647           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1648         }
1649         v += 16;
1650         ++idx;
1651       }
1652     }
1653     z += 4;
1654   }
1655   PetscFunctionReturn(0);
1656 }
1657 
1658 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1659 {
1660   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1661   PetscScalar       *z = c;
1662   const PetscScalar *xb;
1663   PetscScalar       x1,x2,x3,x4,x5;
1664   const MatScalar   *v = a->a,*vv;
1665   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1666 
1667   PetscFunctionBegin;
1668   for (i=0; i<mbs; i++) {
1669     n = ii[1] - ii[0]; ii++;
1670     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1671     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1672     jj = idx;
1673     vv = v;
1674     for (k=0; k<cn; k++) {
1675       idx = jj;
1676       v = vv;
1677       for (j=0; j<n; j++) {
1678         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1679         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1680         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1681         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1682         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1683         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1684         if (*idx != i) {
1685           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1686           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1687           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1688           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1689           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1690         }
1691         v += 25;
1692         ++idx;
1693       }
1694     }
1695     z += 5;
1696   }
1697   PetscFunctionReturn(0);
1698 }
1699 
1700 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1701 {
1702   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1703   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1704   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
1705   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1706   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1707   PetscBLASInt      bbs,bcn,bbm,bcm;
1708   PetscScalar       *z = 0;
1709   PetscScalar       *c,*b;
1710   const MatScalar   *v;
1711   const PetscInt    *idx,*ii;
1712   PetscScalar       _DOne=1.0;
1713   PetscErrorCode    ierr;
1714 
1715   PetscFunctionBegin;
1716   if (!cm || !cn) PetscFunctionReturn(0);
1717   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1718   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1719   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1720   b = bd->v;
1721   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1722   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1723   switch (bs) {
1724   case 1:
1725     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1726     break;
1727   case 2:
1728     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1729     break;
1730   case 3:
1731     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1732     break;
1733   case 4:
1734     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1735     break;
1736   case 5:
1737     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1738     break;
1739   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1740     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1741     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1742     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1743     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1744     idx = a->j;
1745     v   = a->a;
1746     mbs = a->mbs;
1747     ii  = a->i;
1748     z   = c;
1749     for (i=0; i<mbs; i++) {
1750       n = ii[1] - ii[0]; ii++;
1751       for (j=0; j<n; j++) {
1752         if (*idx != i)
1753           PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1754         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1755         v += bs2;
1756       }
1757       z += bs;
1758     }
1759   }
1760   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1761   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1762   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1763   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1764   PetscFunctionReturn(0);
1765 }
1766