xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision fdf6c4e30aafdbc795e4f76379caa977fd5cdf5a)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
12   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13   const PetscInt *idx;
14   PetscBT        table_out,table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18   mbs  = a->mbs;
19   ai   = a->i;
20   aj   = a->j;
21   bs   = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs,&table_out));
23   PetscCall(PetscMalloc1(mbs+1,&nidx));
24   PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2));
25   PetscCall(PetscBTCreate(mbs,&table_in));
26 
27   for (i=0; i<is_max; i++) { /* for each is */
28     isz  = 0;
29     PetscCall(PetscBTMemzero(mbs,table_out));
30 
31     /* Extract the indices, assume there can be duplicate entries */
32     PetscCall(ISGetIndices(is[i],&idx));
33     PetscCall(ISGetLocalSize(is[i],&n));
34 
35     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36     bcol_max = 0;
37     for (j=0; j<n; ++j) {
38       brow = idx[j]/bs; /* convert the indices into block indices */
39       PetscCheck(brow < mbs,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40       if (!PetscBTLookupSet(table_out,brow)) {
41         nidx[isz++] = brow;
42         if (bcol_max < brow) bcol_max = brow;
43       }
44     }
45     PetscCall(ISRestoreIndices(is[i],&idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap */
50       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51       PetscCall(PetscBTMemzero(mbs,table_in));
52       for (l=k; l<isz; l++) PetscCall(PetscBTSet(table_in,nidx[l]));
53 
54       n = isz;  /* length of the updated is[i] */
55       for (brow=0; brow<mbs; brow++) {
56         start = ai[brow]; end   = ai[brow+1];
57         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l<end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out,bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in,bcol)) {
72               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i));
85   }
86   PetscCall(PetscBTDestroy(&table_out));
87   PetscCall(PetscFree(nidx));
88   PetscCall(PetscFree(nidx2));
89   PetscCall(PetscBTDestroy(&table_in));
90   PetscFunctionReturn(0);
91 }
92 
93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94         Zero some ops' to avoid invalid usse */
95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96 {
97   PetscFunctionBegin;
98   PetscCall(MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE));
99   Bseq->ops->mult                   = NULL;
100   Bseq->ops->multadd                = NULL;
101   Bseq->ops->multtranspose          = NULL;
102   Bseq->ops->multtransposeadd       = NULL;
103   Bseq->ops->lufactor               = NULL;
104   Bseq->ops->choleskyfactor         = NULL;
105   Bseq->ops->lufactorsymbolic       = NULL;
106   Bseq->ops->choleskyfactorsymbolic = NULL;
107   Bseq->ops->getinertia             = NULL;
108   PetscFunctionReturn(0);
109 }
110 
111 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
112 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
113 {
114   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
115   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
116   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
117   const PetscInt *irow,*icol;
118   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
119   PetscInt       *aj = a->j,*ai = a->i;
120   MatScalar      *mat_a;
121   Mat            C;
122   PetscBool      flag;
123 
124   PetscFunctionBegin;
125 
126   PetscCall(ISGetIndices(isrow,&irow));
127   PetscCall(ISGetIndices(iscol,&icol));
128   PetscCall(ISGetLocalSize(isrow,&nrows));
129   PetscCall(ISGetLocalSize(iscol,&ncols));
130 
131   PetscCall(PetscCalloc1(1+oldcols,&smap));
132   ssmap = smap;
133   PetscCall(PetscMalloc1(1+nrows,&lens));
134   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
135   /* determine lens of each row */
136   for (i=0; i<nrows; i++) {
137     kstart  = ai[irow[i]];
138     kend    = kstart + a->ilen[irow[i]];
139     lens[i] = 0;
140     for (k=kstart; k<kend; k++) {
141       if (ssmap[aj[k]]) lens[i]++;
142     }
143   }
144   /* Create and fill new matrix */
145   if (scall == MAT_REUSE_MATRIX) {
146     c = (Mat_SeqSBAIJ*)((*B)->data);
147 
148     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
149     PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag));
150     PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
151     PetscCall(PetscArrayzero(c->ilen,c->mbs));
152     C    = *B;
153   } else {
154     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C));
155     PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE));
156     PetscCall(MatSetType(C,((PetscObject)A)->type_name));
157     PetscCall(MatSeqSBAIJSetPreallocation(C,bs,0,lens));
158   }
159   c = (Mat_SeqSBAIJ*)(C->data);
160   for (i=0; i<nrows; i++) {
161     row      = irow[i];
162     kstart   = ai[row];
163     kend     = kstart + a->ilen[row];
164     mat_i    = c->i[i];
165     mat_j    = c->j + mat_i;
166     mat_a    = c->a + mat_i*bs2;
167     mat_ilen = c->ilen + i;
168     for (k=kstart; k<kend; k++) {
169       if ((tcol=ssmap[a->j[k]])) {
170         *mat_j++ = tcol - 1;
171         PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2));
172         mat_a   += bs2;
173         (*mat_ilen)++;
174       }
175     }
176   }
177   /* sort */
178   {
179     MatScalar *work;
180 
181     PetscCall(PetscMalloc1(bs2,&work));
182     for (i=0; i<nrows; i++) {
183       PetscInt ilen;
184       mat_i = c->i[i];
185       mat_j = c->j + mat_i;
186       mat_a = c->a + mat_i*bs2;
187       ilen  = c->ilen[i];
188       PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
189     }
190     PetscCall(PetscFree(work));
191   }
192 
193   /* Free work space */
194   PetscCall(ISRestoreIndices(iscol,&icol));
195   PetscCall(PetscFree(smap));
196   PetscCall(PetscFree(lens));
197   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
198   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
199 
200   PetscCall(ISRestoreIndices(isrow,&irow));
201   *B   = C;
202   PetscFunctionReturn(0);
203 }
204 
205 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
206 {
207   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
208   IS             is1,is2;
209   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
210   const PetscInt *irow,*icol;
211 
212   PetscFunctionBegin;
213   PetscCall(ISGetIndices(isrow,&irow));
214   PetscCall(ISGetIndices(iscol,&icol));
215   PetscCall(ISGetLocalSize(isrow,&nrows));
216   PetscCall(ISGetLocalSize(iscol,&ncols));
217 
218   /* Verify if the indices corespond to each element in a block
219    and form the IS with compressed IS */
220   maxmnbs = PetscMax(a->mbs,a->nbs);
221   PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary));
222   PetscCall(PetscArrayzero(vary,a->mbs));
223   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
224   for (i=0; i<a->mbs; i++) {
225     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
226   }
227   count = 0;
228   for (i=0; i<nrows; i++) {
229     PetscInt j = irow[i] / bs;
230     if ((vary[j]--)==bs) iary[count++] = j;
231   }
232   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1));
233 
234   PetscCall(PetscArrayzero(vary,a->nbs));
235   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
236   for (i=0; i<a->nbs; i++) {
237     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
238   }
239   count = 0;
240   for (i=0; i<ncols; i++) {
241     PetscInt j = icol[i] / bs;
242     if ((vary[j]--)==bs) iary[count++] = j;
243   }
244   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2));
245   PetscCall(ISRestoreIndices(isrow,&irow));
246   PetscCall(ISRestoreIndices(iscol,&icol));
247   PetscCall(PetscFree2(vary,iary));
248 
249   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B));
250   PetscCall(ISDestroy(&is1));
251   PetscCall(ISDestroy(&is2));
252 
253   if (isrow != iscol) {
254     PetscBool isequal;
255     PetscCall(ISEqual(isrow,iscol,&isequal));
256     if (!isequal) {
257       PetscCall(MatSeqSBAIJZeroOps_Private(*B));
258     }
259   }
260   PetscFunctionReturn(0);
261 }
262 
263 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
264 {
265   PetscInt       i;
266 
267   PetscFunctionBegin;
268   if (scall == MAT_INITIAL_MATRIX) {
269     PetscCall(PetscCalloc1(n+1,B));
270   }
271 
272   for (i=0; i<n; i++) {
273     PetscCall(MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]));
274   }
275   PetscFunctionReturn(0);
276 }
277 
278 /* -------------------------------------------------------*/
279 /* Should check that shapes of vectors and matrices match */
280 /* -------------------------------------------------------*/
281 
282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
283 {
284   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
285   PetscScalar       *z,x1,x2,zero=0.0;
286   const PetscScalar *x,*xb;
287   const MatScalar   *v;
288   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
289   const PetscInt    *aj=a->j,*ai=a->i,*ib;
290   PetscInt          nonzerorow=0;
291 
292   PetscFunctionBegin;
293   PetscCall(VecSet(zz,zero));
294   if (!a->nz) PetscFunctionReturn(0);
295   PetscCall(VecGetArrayRead(xx,&x));
296   PetscCall(VecGetArray(zz,&z));
297 
298   v  = a->a;
299   xb = x;
300 
301   for (i=0; i<mbs; i++) {
302     n           = ai[1] - ai[0]; /* length of i_th block row of A */
303     x1          = xb[0]; x2 = xb[1];
304     ib          = aj + *ai;
305     jmin        = 0;
306     nonzerorow += (n>0);
307     if (*ib == i) {     /* (diag of A)*x */
308       z[2*i]   += v[0]*x1 + v[2]*x2;
309       z[2*i+1] += v[2]*x1 + v[3]*x2;
310       v        += 4; jmin++;
311     }
312     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
313     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
314     for (j=jmin; j<n; j++) {
315       /* (strict lower triangular part of A)*x  */
316       cval       = ib[j]*2;
317       z[cval]   += v[0]*x1 + v[1]*x2;
318       z[cval+1] += v[2]*x1 + v[3]*x2;
319       /* (strict upper triangular part of A)*x  */
320       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
321       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
322       v        += 4;
323     }
324     xb +=2; ai++;
325   }
326 
327   PetscCall(VecRestoreArrayRead(xx,&x));
328   PetscCall(VecRestoreArray(zz,&z));
329   PetscCall(PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
330   PetscFunctionReturn(0);
331 }
332 
333 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
334 {
335   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
336   PetscScalar       *z,x1,x2,x3,zero=0.0;
337   const PetscScalar *x,*xb;
338   const MatScalar   *v;
339   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
340   const PetscInt    *aj = a->j,*ai = a->i,*ib;
341   PetscInt          nonzerorow=0;
342 
343   PetscFunctionBegin;
344   PetscCall(VecSet(zz,zero));
345   if (!a->nz) PetscFunctionReturn(0);
346   PetscCall(VecGetArrayRead(xx,&x));
347   PetscCall(VecGetArray(zz,&z));
348 
349   v  = a->a;
350   xb = x;
351 
352   for (i=0; i<mbs; i++) {
353     n           = ai[1] - ai[0]; /* length of i_th block row of A */
354     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
355     ib          = aj + *ai;
356     jmin        = 0;
357     nonzerorow += (n>0);
358     if (*ib == i) {     /* (diag of A)*x */
359       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
360       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
361       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
362       v        += 9; jmin++;
363     }
364     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
365     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
366     for (j=jmin; j<n; j++) {
367       /* (strict lower triangular part of A)*x  */
368       cval       = ib[j]*3;
369       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
370       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
371       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
372       /* (strict upper triangular part of A)*x  */
373       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
374       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
375       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
376       v        += 9;
377     }
378     xb +=3; ai++;
379   }
380 
381   PetscCall(VecRestoreArrayRead(xx,&x));
382   PetscCall(VecRestoreArray(zz,&z));
383   PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
384   PetscFunctionReturn(0);
385 }
386 
387 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
388 {
389   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
390   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
391   const PetscScalar *x,*xb;
392   const MatScalar   *v;
393   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
394   const PetscInt    *aj = a->j,*ai = a->i,*ib;
395   PetscInt          nonzerorow = 0;
396 
397   PetscFunctionBegin;
398   PetscCall(VecSet(zz,zero));
399   if (!a->nz) PetscFunctionReturn(0);
400   PetscCall(VecGetArrayRead(xx,&x));
401   PetscCall(VecGetArray(zz,&z));
402 
403   v  = a->a;
404   xb = x;
405 
406   for (i=0; i<mbs; i++) {
407     n           = ai[1] - ai[0]; /* length of i_th block row of A */
408     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
409     ib          = aj + *ai;
410     jmin        = 0;
411     nonzerorow += (n>0);
412     if (*ib == i) {     /* (diag of A)*x */
413       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
414       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
415       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
416       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
417       v        += 16; jmin++;
418     }
419     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
420     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
421     for (j=jmin; j<n; j++) {
422       /* (strict lower triangular part of A)*x  */
423       cval       = ib[j]*4;
424       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
425       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
426       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
427       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
428       /* (strict upper triangular part of A)*x  */
429       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
430       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
431       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
432       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
433       v        += 16;
434     }
435     xb +=4; ai++;
436   }
437 
438   PetscCall(VecRestoreArrayRead(xx,&x));
439   PetscCall(VecRestoreArray(zz,&z));
440   PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
441   PetscFunctionReturn(0);
442 }
443 
444 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
445 {
446   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
447   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
448   const PetscScalar *x,*xb;
449   const MatScalar   *v;
450   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
451   const PetscInt    *aj = a->j,*ai = a->i,*ib;
452   PetscInt          nonzerorow=0;
453 
454   PetscFunctionBegin;
455   PetscCall(VecSet(zz,zero));
456   if (!a->nz) PetscFunctionReturn(0);
457   PetscCall(VecGetArrayRead(xx,&x));
458   PetscCall(VecGetArray(zz,&z));
459 
460   v  = a->a;
461   xb = x;
462 
463   for (i=0; i<mbs; i++) {
464     n           = ai[1] - ai[0]; /* length of i_th block row of A */
465     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
466     ib          = aj + *ai;
467     jmin        = 0;
468     nonzerorow += (n>0);
469     if (*ib == i) {      /* (diag of A)*x */
470       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
471       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
472       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
473       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
474       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
475       v        += 25; jmin++;
476     }
477     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
478     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479     for (j=jmin; j<n; j++) {
480       /* (strict lower triangular part of A)*x  */
481       cval       = ib[j]*5;
482       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
483       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
484       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
485       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
486       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
487       /* (strict upper triangular part of A)*x  */
488       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
489       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
490       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
491       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
492       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
493       v        += 25;
494     }
495     xb +=5; ai++;
496   }
497 
498   PetscCall(VecRestoreArrayRead(xx,&x));
499   PetscCall(VecRestoreArray(zz,&z));
500   PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
501   PetscFunctionReturn(0);
502 }
503 
504 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
505 {
506   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
507   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
508   const PetscScalar *x,*xb;
509   const MatScalar   *v;
510   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
511   const PetscInt    *aj=a->j,*ai=a->i,*ib;
512   PetscInt          nonzerorow=0;
513 
514   PetscFunctionBegin;
515   PetscCall(VecSet(zz,zero));
516   if (!a->nz) PetscFunctionReturn(0);
517   PetscCall(VecGetArrayRead(xx,&x));
518   PetscCall(VecGetArray(zz,&z));
519 
520   v  = a->a;
521   xb = x;
522 
523   for (i=0; i<mbs; i++) {
524     n           = ai[1] - ai[0]; /* length of i_th block row of A */
525     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
526     ib          = aj + *ai;
527     jmin        = 0;
528     nonzerorow += (n>0);
529     if (*ib == i) {      /* (diag of A)*x */
530       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
531       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
532       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
533       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
534       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
535       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
536       v        += 36; jmin++;
537     }
538     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
539     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
540     for (j=jmin; j<n; j++) {
541       /* (strict lower triangular part of A)*x  */
542       cval       = ib[j]*6;
543       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
544       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
545       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
546       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
547       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
548       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
549       /* (strict upper triangular part of A)*x  */
550       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
551       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
552       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
553       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
554       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
555       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
556       v        += 36;
557     }
558     xb +=6; ai++;
559   }
560 
561   PetscCall(VecRestoreArrayRead(xx,&x));
562   PetscCall(VecRestoreArray(zz,&z));
563   PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
564   PetscFunctionReturn(0);
565 }
566 
567 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
568 {
569   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
570   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
571   const PetscScalar *x,*xb;
572   const MatScalar   *v;
573   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
574   const PetscInt    *aj=a->j,*ai=a->i,*ib;
575   PetscInt          nonzerorow=0;
576 
577   PetscFunctionBegin;
578   PetscCall(VecSet(zz,zero));
579   if (!a->nz) PetscFunctionReturn(0);
580   PetscCall(VecGetArrayRead(xx,&x));
581   PetscCall(VecGetArray(zz,&z));
582 
583   v  = a->a;
584   xb = x;
585 
586   for (i=0; i<mbs; i++) {
587     n           = ai[1] - ai[0]; /* length of i_th block row of A */
588     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
589     ib          = aj + *ai;
590     jmin        = 0;
591     nonzerorow += (n>0);
592     if (*ib == i) {      /* (diag of A)*x */
593       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
594       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
595       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
596       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
597       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
598       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
599       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
600       v        += 49; jmin++;
601     }
602     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
603     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
604     for (j=jmin; j<n; j++) {
605       /* (strict lower triangular part of A)*x  */
606       cval       = ib[j]*7;
607       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
608       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
609       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
610       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
611       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
612       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
613       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
614       /* (strict upper triangular part of A)*x  */
615       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
616       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
617       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
618       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
619       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
620       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
621       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
622       v       += 49;
623     }
624     xb +=7; ai++;
625   }
626   PetscCall(VecRestoreArrayRead(xx,&x));
627   PetscCall(VecRestoreArray(zz,&z));
628   PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
629   PetscFunctionReturn(0);
630 }
631 
632 /*
633     This will not work with MatScalar == float because it calls the BLAS
634 */
635 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
636 {
637   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
638   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
639   const PetscScalar *x,*x_ptr,*xb;
640   const MatScalar   *v;
641   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
642   const PetscInt    *idx,*aj,*ii;
643   PetscInt          nonzerorow=0;
644 
645   PetscFunctionBegin;
646   PetscCall(VecSet(zz,zero));
647   if (!a->nz) PetscFunctionReturn(0);
648   PetscCall(VecGetArrayRead(xx,&x));
649   PetscCall(VecGetArray(zz,&z));
650 
651   x_ptr = x;
652   z_ptr = z;
653 
654   aj = a->j;
655   v  = a->a;
656   ii = a->i;
657 
658   if (!a->mult_work) {
659     PetscCall(PetscMalloc1(A->rmap->N+1,&a->mult_work));
660   }
661   work = a->mult_work;
662 
663   for (i=0; i<mbs; i++) {
664     n           = ii[1] - ii[0]; ncols = n*bs;
665     workt       = work; idx=aj+ii[0];
666     nonzerorow += (n>0);
667 
668     /* upper triangular part */
669     for (j=0; j<n; j++) {
670       xb = x_ptr + bs*(*idx++);
671       for (k=0; k<bs; k++) workt[k] = xb[k];
672       workt += bs;
673     }
674     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
675     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
676 
677     /* strict lower triangular part */
678     idx = aj+ii[0];
679     if (n && *idx == i) {
680       ncols -= bs; v += bs2; idx++; n--;
681     }
682 
683     if (ncols > 0) {
684       workt = work;
685       PetscCall(PetscArrayzero(workt,ncols));
686       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
687       for (j=0; j<n; j++) {
688         zb = z_ptr + bs*(*idx++);
689         for (k=0; k<bs; k++) zb[k] += workt[k];
690         workt += bs;
691       }
692     }
693     x += bs; v += n*bs2; z += bs; ii++;
694   }
695 
696   PetscCall(VecRestoreArrayRead(xx,&x));
697   PetscCall(VecRestoreArray(zz,&z));
698   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow));
699   PetscFunctionReturn(0);
700 }
701 
702 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
703 {
704   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
705   PetscScalar       *z,x1;
706   const PetscScalar *x,*xb;
707   const MatScalar   *v;
708   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
709   const PetscInt    *aj=a->j,*ai=a->i,*ib;
710   PetscInt          nonzerorow=0;
711 #if defined(PETSC_USE_COMPLEX)
712   const int         aconj = A->hermitian == PETSC_BOOL3_TRUE;
713 #else
714   const int         aconj = 0;
715 #endif
716 
717   PetscFunctionBegin;
718   PetscCall(VecCopy(yy,zz));
719   if (!a->nz) PetscFunctionReturn(0);
720   PetscCall(VecGetArrayRead(xx,&x));
721   PetscCall(VecGetArray(zz,&z));
722   v    = a->a;
723   xb   = x;
724 
725   for (i=0; i<mbs; i++) {
726     n           = ai[1] - ai[0]; /* length of i_th row of A */
727     x1          = xb[0];
728     ib          = aj + *ai;
729     jmin        = 0;
730     nonzerorow += (n>0);
731     if (n && *ib == i) {            /* (diag of A)*x */
732       z[i] += *v++ * x[*ib++]; jmin++;
733     }
734     if (aconj) {
735       for (j=jmin; j<n; j++) {
736         cval    = *ib;
737         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
738         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
739       }
740     } else {
741       for (j=jmin; j<n; j++) {
742         cval    = *ib;
743         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
744         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
745       }
746     }
747     xb++; ai++;
748   }
749 
750   PetscCall(VecRestoreArrayRead(xx,&x));
751   PetscCall(VecRestoreArray(zz,&z));
752 
753   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)));
754   PetscFunctionReturn(0);
755 }
756 
757 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
758 {
759   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
760   PetscScalar       *z,x1,x2;
761   const PetscScalar *x,*xb;
762   const MatScalar   *v;
763   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
764   const PetscInt    *aj=a->j,*ai=a->i,*ib;
765   PetscInt          nonzerorow=0;
766 
767   PetscFunctionBegin;
768   PetscCall(VecCopy(yy,zz));
769   if (!a->nz) PetscFunctionReturn(0);
770   PetscCall(VecGetArrayRead(xx,&x));
771   PetscCall(VecGetArray(zz,&z));
772 
773   v  = a->a;
774   xb = x;
775 
776   for (i=0; i<mbs; i++) {
777     n           = ai[1] - ai[0]; /* length of i_th block row of A */
778     x1          = xb[0]; x2 = xb[1];
779     ib          = aj + *ai;
780     jmin        = 0;
781     nonzerorow += (n>0);
782     if (n && *ib == i) {      /* (diag of A)*x */
783       z[2*i]   += v[0]*x1 + v[2]*x2;
784       z[2*i+1] += v[2]*x1 + v[3]*x2;
785       v        += 4; jmin++;
786     }
787     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
788     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
789     for (j=jmin; j<n; j++) {
790       /* (strict lower triangular part of A)*x  */
791       cval       = ib[j]*2;
792       z[cval]   += v[0]*x1 + v[1]*x2;
793       z[cval+1] += v[2]*x1 + v[3]*x2;
794       /* (strict upper triangular part of A)*x  */
795       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
796       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
797       v        += 4;
798     }
799     xb +=2; ai++;
800   }
801   PetscCall(VecRestoreArrayRead(xx,&x));
802   PetscCall(VecRestoreArray(zz,&z));
803 
804   PetscCall(PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow)));
805   PetscFunctionReturn(0);
806 }
807 
808 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
809 {
810   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
811   PetscScalar       *z,x1,x2,x3;
812   const PetscScalar *x,*xb;
813   const MatScalar   *v;
814   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
815   const PetscInt    *aj=a->j,*ai=a->i,*ib;
816   PetscInt          nonzerorow=0;
817 
818   PetscFunctionBegin;
819   PetscCall(VecCopy(yy,zz));
820   if (!a->nz) PetscFunctionReturn(0);
821   PetscCall(VecGetArrayRead(xx,&x));
822   PetscCall(VecGetArray(zz,&z));
823 
824   v  = a->a;
825   xb = x;
826 
827   for (i=0; i<mbs; i++) {
828     n           = ai[1] - ai[0]; /* length of i_th block row of A */
829     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
830     ib          = aj + *ai;
831     jmin        = 0;
832     nonzerorow += (n>0);
833     if (n && *ib == i) {     /* (diag of A)*x */
834       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
835       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
836       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
837       v        += 9; jmin++;
838     }
839     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
841     for (j=jmin; j<n; j++) {
842       /* (strict lower triangular part of A)*x  */
843       cval       = ib[j]*3;
844       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
845       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
846       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
847       /* (strict upper triangular part of A)*x  */
848       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
849       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
850       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
851       v        += 9;
852     }
853     xb +=3; ai++;
854   }
855 
856   PetscCall(VecRestoreArrayRead(xx,&x));
857   PetscCall(VecRestoreArray(zz,&z));
858 
859   PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow)));
860   PetscFunctionReturn(0);
861 }
862 
863 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
864 {
865   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
866   PetscScalar       *z,x1,x2,x3,x4;
867   const PetscScalar *x,*xb;
868   const MatScalar   *v;
869   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
870   const PetscInt    *aj=a->j,*ai=a->i,*ib;
871   PetscInt          nonzerorow=0;
872 
873   PetscFunctionBegin;
874   PetscCall(VecCopy(yy,zz));
875   if (!a->nz) PetscFunctionReturn(0);
876   PetscCall(VecGetArrayRead(xx,&x));
877   PetscCall(VecGetArray(zz,&z));
878 
879   v  = a->a;
880   xb = x;
881 
882   for (i=0; i<mbs; i++) {
883     n           = ai[1] - ai[0]; /* length of i_th block row of A */
884     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885     ib          = aj + *ai;
886     jmin        = 0;
887     nonzerorow += (n>0);
888     if (n && *ib == i) {      /* (diag of A)*x */
889       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
890       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
891       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
892       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893       v        += 16; jmin++;
894     }
895     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897     for (j=jmin; j<n; j++) {
898       /* (strict lower triangular part of A)*x  */
899       cval       = ib[j]*4;
900       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
901       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
902       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
903       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
904       /* (strict upper triangular part of A)*x  */
905       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
906       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
907       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
908       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
909       v        += 16;
910     }
911     xb +=4; ai++;
912   }
913 
914   PetscCall(VecRestoreArrayRead(xx,&x));
915   PetscCall(VecRestoreArray(zz,&z));
916 
917   PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow)));
918   PetscFunctionReturn(0);
919 }
920 
921 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
922 {
923   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
924   PetscScalar       *z,x1,x2,x3,x4,x5;
925   const PetscScalar *x,*xb;
926   const MatScalar   *v;
927   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
928   const PetscInt    *aj=a->j,*ai=a->i,*ib;
929   PetscInt          nonzerorow=0;
930 
931   PetscFunctionBegin;
932   PetscCall(VecCopy(yy,zz));
933   if (!a->nz) PetscFunctionReturn(0);
934   PetscCall(VecGetArrayRead(xx,&x));
935   PetscCall(VecGetArray(zz,&z));
936 
937   v  = a->a;
938   xb = x;
939 
940   for (i=0; i<mbs; i++) {
941     n           = ai[1] - ai[0]; /* length of i_th block row of A */
942     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
943     ib          = aj + *ai;
944     jmin        = 0;
945     nonzerorow += (n>0);
946     if (n && *ib == i) {      /* (diag of A)*x */
947       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
948       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
949       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
950       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
951       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
952       v        += 25; jmin++;
953     }
954     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
955     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956     for (j=jmin; j<n; j++) {
957       /* (strict lower triangular part of A)*x  */
958       cval       = ib[j]*5;
959       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
960       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
961       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
962       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
963       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
964       /* (strict upper triangular part of A)*x  */
965       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
966       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
967       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
968       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
969       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
970       v        += 25;
971     }
972     xb +=5; ai++;
973   }
974 
975   PetscCall(VecRestoreArrayRead(xx,&x));
976   PetscCall(VecRestoreArray(zz,&z));
977 
978   PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow)));
979   PetscFunctionReturn(0);
980 }
981 
982 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
983 {
984   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
985   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
986   const PetscScalar *x,*xb;
987   const MatScalar   *v;
988   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
989   const PetscInt    *aj=a->j,*ai=a->i,*ib;
990   PetscInt          nonzerorow=0;
991 
992   PetscFunctionBegin;
993   PetscCall(VecCopy(yy,zz));
994   if (!a->nz) PetscFunctionReturn(0);
995   PetscCall(VecGetArrayRead(xx,&x));
996   PetscCall(VecGetArray(zz,&z));
997 
998   v  = a->a;
999   xb = x;
1000 
1001   for (i=0; i<mbs; i++) {
1002     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1003     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1004     ib          = aj + *ai;
1005     jmin        = 0;
1006     nonzerorow += (n>0);
1007     if (n && *ib == i) {     /* (diag of A)*x */
1008       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1009       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1010       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1011       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1012       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1013       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1014       v        += 36; jmin++;
1015     }
1016     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1017     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1018     for (j=jmin; j<n; j++) {
1019       /* (strict lower triangular part of A)*x  */
1020       cval       = ib[j]*6;
1021       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1022       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1023       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1024       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1025       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1026       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1027       /* (strict upper triangular part of A)*x  */
1028       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1029       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1030       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1031       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1032       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1033       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1034       v        += 36;
1035     }
1036     xb +=6; ai++;
1037   }
1038 
1039   PetscCall(VecRestoreArrayRead(xx,&x));
1040   PetscCall(VecRestoreArray(zz,&z));
1041 
1042   PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow)));
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1047 {
1048   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1049   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1050   const PetscScalar *x,*xb;
1051   const MatScalar   *v;
1052   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1053   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1054   PetscInt          nonzerorow=0;
1055 
1056   PetscFunctionBegin;
1057   PetscCall(VecCopy(yy,zz));
1058   if (!a->nz) PetscFunctionReturn(0);
1059   PetscCall(VecGetArrayRead(xx,&x));
1060   PetscCall(VecGetArray(zz,&z));
1061 
1062   v  = a->a;
1063   xb = x;
1064 
1065   for (i=0; i<mbs; i++) {
1066     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1067     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1068     ib          = aj + *ai;
1069     jmin        = 0;
1070     nonzerorow += (n>0);
1071     if (n && *ib == i) {     /* (diag of A)*x */
1072       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1073       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1074       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1075       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1076       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1077       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1078       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1079       v        += 49; jmin++;
1080     }
1081     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1082     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1083     for (j=jmin; j<n; j++) {
1084       /* (strict lower triangular part of A)*x  */
1085       cval       = ib[j]*7;
1086       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1087       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1088       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1089       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1090       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1091       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1092       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1093       /* (strict upper triangular part of A)*x  */
1094       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1095       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1096       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1097       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1098       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1099       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1100       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1101       v       += 49;
1102     }
1103     xb +=7; ai++;
1104   }
1105 
1106   PetscCall(VecRestoreArrayRead(xx,&x));
1107   PetscCall(VecRestoreArray(zz,&z));
1108 
1109   PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow)));
1110   PetscFunctionReturn(0);
1111 }
1112 
1113 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1114 {
1115   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1116   PetscScalar       *z,*z_ptr=NULL,*zb,*work,*workt;
1117   const PetscScalar *x,*x_ptr,*xb;
1118   const MatScalar   *v;
1119   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1120   const PetscInt    *idx,*aj,*ii;
1121   PetscInt          nonzerorow=0;
1122 
1123   PetscFunctionBegin;
1124   PetscCall(VecCopy(yy,zz));
1125   if (!a->nz) PetscFunctionReturn(0);
1126   PetscCall(VecGetArrayRead(xx,&x)); x_ptr=x;
1127   PetscCall(VecGetArray(zz,&z)); z_ptr=z;
1128 
1129   aj = a->j;
1130   v  = a->a;
1131   ii = a->i;
1132 
1133   if (!a->mult_work) {
1134     PetscCall(PetscMalloc1(A->rmap->n+1,&a->mult_work));
1135   }
1136   work = a->mult_work;
1137 
1138   for (i=0; i<mbs; i++) {
1139     n           = ii[1] - ii[0]; ncols = n*bs;
1140     workt       = work; idx=aj+ii[0];
1141     nonzerorow += (n>0);
1142 
1143     /* upper triangular part */
1144     for (j=0; j<n; j++) {
1145       xb = x_ptr + bs*(*idx++);
1146       for (k=0; k<bs; k++) workt[k] = xb[k];
1147       workt += bs;
1148     }
1149     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1150     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1151 
1152     /* strict lower triangular part */
1153     idx = aj+ii[0];
1154     if (n && *idx == i) {
1155       ncols -= bs; v += bs2; idx++; n--;
1156     }
1157     if (ncols > 0) {
1158       workt = work;
1159       PetscCall(PetscArrayzero(workt,ncols));
1160       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1161       for (j=0; j<n; j++) {
1162         zb = z_ptr + bs*(*idx++);
1163         for (k=0; k<bs; k++) zb[k] += workt[k];
1164         workt += bs;
1165       }
1166     }
1167 
1168     x += bs; v += n*bs2; z += bs; ii++;
1169   }
1170 
1171   PetscCall(VecRestoreArrayRead(xx,&x));
1172   PetscCall(VecRestoreArray(zz,&z));
1173 
1174   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)));
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1179 {
1180   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1181   PetscScalar    oalpha = alpha;
1182   PetscBLASInt   one = 1,totalnz;
1183 
1184   PetscFunctionBegin;
1185   PetscCall(PetscBLASIntCast(a->bs2*a->nz,&totalnz));
1186   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1187   PetscCall(PetscLogFlops(totalnz));
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1192 {
1193   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1194   const MatScalar *v       = a->a;
1195   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1196   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1197   const PetscInt  *aj=a->j,*col;
1198 
1199   PetscFunctionBegin;
1200   if (!a->nz) {
1201     *norm = 0.0;
1202     PetscFunctionReturn(0);
1203   }
1204   if (type == NORM_FROBENIUS) {
1205     for (k=0; k<mbs; k++) {
1206       jmin = a->i[k];
1207       jmax = a->i[k+1];
1208       col  = aj + jmin;
1209       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
1210         for (i=0; i<bs2; i++) {
1211           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1212         }
1213         jmin++;
1214       }
1215       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1216         for (i=0; i<bs2; i++) {
1217           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1218         }
1219       }
1220     }
1221     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1222     PetscCall(PetscLogFlops(2.0*bs2*a->nz));
1223   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1224     PetscCall(PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl));
1225     for (i=0; i<mbs; i++) jl[i] = mbs;
1226     il[0] = 0;
1227 
1228     *norm = 0.0;
1229     for (k=0; k<mbs; k++) { /* k_th block row */
1230       for (j=0; j<bs; j++) sum[j]=0.0;
1231       /*-- col sum --*/
1232       i = jl[k]; /* first |A(i,k)| to be added */
1233       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1234                   at step k */
1235       while (i<mbs) {
1236         nexti = jl[i];  /* next block row to be added */
1237         ik    = il[i];  /* block index of A(i,k) in the array a */
1238         for (j=0; j<bs; j++) {
1239           v = a->a + ik*bs2 + j*bs;
1240           for (k1=0; k1<bs; k1++) {
1241             sum[j] += PetscAbsScalar(*v); v++;
1242           }
1243         }
1244         /* update il, jl */
1245         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1246         jmax = a->i[i+1];
1247         if (jmin < jmax) {
1248           il[i] = jmin;
1249           j     = a->j[jmin];
1250           jl[i] = jl[j]; jl[j]=i;
1251         }
1252         i = nexti;
1253       }
1254       /*-- row sum --*/
1255       jmin = a->i[k];
1256       jmax = a->i[k+1];
1257       for (i=jmin; i<jmax; i++) {
1258         for (j=0; j<bs; j++) {
1259           v = a->a + i*bs2 + j;
1260           for (k1=0; k1<bs; k1++) {
1261             sum[j] += PetscAbsScalar(*v); v += bs;
1262           }
1263         }
1264       }
1265       /* add k_th block row to il, jl */
1266       col = aj+jmin;
1267       if (jmax - jmin > 0 && *col == k) jmin++;
1268       if (jmin < jmax) {
1269         il[k] = jmin;
1270         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1271       }
1272       for (j=0; j<bs; j++) {
1273         if (sum[j] > *norm) *norm = sum[j];
1274       }
1275     }
1276     PetscCall(PetscFree3(sum,il,jl));
1277     PetscCall(PetscLogFlops(PetscMax(mbs*a->nz-1,0)));
1278   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1279   PetscFunctionReturn(0);
1280 }
1281 
1282 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1283 {
1284   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1285 
1286   PetscFunctionBegin;
1287   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1288   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1289     *flg = PETSC_FALSE;
1290     PetscFunctionReturn(0);
1291   }
1292 
1293   /* if the a->i are the same */
1294   PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg));
1295   if (!*flg) PetscFunctionReturn(0);
1296 
1297   /* if a->j are the same */
1298   PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg));
1299   if (!*flg) PetscFunctionReturn(0);
1300 
1301   /* if a->a are the same */
1302   PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg));
1303   PetscFunctionReturn(0);
1304 }
1305 
1306 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1307 {
1308   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1309   PetscInt        i,j,k,row,bs,ambs,bs2;
1310   const PetscInt  *ai,*aj;
1311   PetscScalar     *x,zero = 0.0;
1312   const MatScalar *aa,*aa_j;
1313 
1314   PetscFunctionBegin;
1315   bs = A->rmap->bs;
1316   PetscCheck(!A->factortype || bs <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1317 
1318   aa   = a->a;
1319   ambs = a->mbs;
1320 
1321   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1322     PetscInt *diag=a->diag;
1323     aa   = a->a;
1324     ambs = a->mbs;
1325     PetscCall(VecGetArray(v,&x));
1326     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1327     PetscCall(VecRestoreArray(v,&x));
1328     PetscFunctionReturn(0);
1329   }
1330 
1331   ai   = a->i;
1332   aj   = a->j;
1333   bs2  = a->bs2;
1334   PetscCall(VecSet(v,zero));
1335   if (!a->nz) PetscFunctionReturn(0);
1336   PetscCall(VecGetArray(v,&x));
1337   for (i=0; i<ambs; i++) {
1338     j = ai[i];
1339     if (aj[j] == i) {    /* if this is a diagonal element */
1340       row  = i*bs;
1341       aa_j = aa + j*bs2;
1342       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1343     }
1344   }
1345   PetscCall(VecRestoreArray(v,&x));
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1350 {
1351   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1352   PetscScalar       x;
1353   const PetscScalar *l,*li,*ri;
1354   MatScalar         *aa,*v;
1355   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1356   const PetscInt    *ai,*aj;
1357   PetscBool         flg;
1358 
1359   PetscFunctionBegin;
1360   if (ll != rr) {
1361     PetscCall(VecEqual(ll,rr,&flg));
1362     PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same");
1363   }
1364   if (!ll) PetscFunctionReturn(0);
1365   ai  = a->i;
1366   aj  = a->j;
1367   aa  = a->a;
1368   m   = A->rmap->N;
1369   bs  = A->rmap->bs;
1370   mbs = a->mbs;
1371   bs2 = a->bs2;
1372 
1373   PetscCall(VecGetArrayRead(ll,&l));
1374   PetscCall(VecGetLocalSize(ll,&lm));
1375   PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1376   for (i=0; i<mbs; i++) { /* for each block row */
1377     M  = ai[i+1] - ai[i];
1378     li = l + i*bs;
1379     v  = aa + bs2*ai[i];
1380     for (j=0; j<M; j++) { /* for each block */
1381       ri = l + bs*aj[ai[i]+j];
1382       for (k=0; k<bs; k++) {
1383         x = ri[k];
1384         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1385       }
1386     }
1387   }
1388   PetscCall(VecRestoreArrayRead(ll,&l));
1389   PetscCall(PetscLogFlops(2.0*a->nz));
1390   PetscFunctionReturn(0);
1391 }
1392 
1393 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1394 {
1395   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1396 
1397   PetscFunctionBegin;
1398   info->block_size   = a->bs2;
1399   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1400   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1401   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1402   info->assemblies   = A->num_ass;
1403   info->mallocs      = A->info.mallocs;
1404   info->memory       = ((PetscObject)A)->mem;
1405   if (A->factortype) {
1406     info->fill_ratio_given  = A->info.fill_ratio_given;
1407     info->fill_ratio_needed = A->info.fill_ratio_needed;
1408     info->factor_mallocs    = A->info.factor_mallocs;
1409   } else {
1410     info->fill_ratio_given  = 0;
1411     info->fill_ratio_needed = 0;
1412     info->factor_mallocs    = 0;
1413   }
1414   PetscFunctionReturn(0);
1415 }
1416 
1417 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1418 {
1419   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1420 
1421   PetscFunctionBegin;
1422   PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs]));
1423   PetscFunctionReturn(0);
1424 }
1425 
1426 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1427 {
1428   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1429   PetscInt        i,j,n,row,col,bs,mbs;
1430   const PetscInt  *ai,*aj;
1431   PetscReal       atmp;
1432   const MatScalar *aa;
1433   PetscScalar     *x;
1434   PetscInt        ncols,brow,bcol,krow,kcol;
1435 
1436   PetscFunctionBegin;
1437   PetscCheck(!idx,PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1438   PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1439   bs  = A->rmap->bs;
1440   aa  = a->a;
1441   ai  = a->i;
1442   aj  = a->j;
1443   mbs = a->mbs;
1444 
1445   PetscCall(VecSet(v,0.0));
1446   PetscCall(VecGetArray(v,&x));
1447   PetscCall(VecGetLocalSize(v,&n));
1448   PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1449   for (i=0; i<mbs; i++) {
1450     ncols = ai[1] - ai[0]; ai++;
1451     brow  = bs*i;
1452     for (j=0; j<ncols; j++) {
1453       bcol = bs*(*aj);
1454       for (kcol=0; kcol<bs; kcol++) {
1455         col = bcol + kcol;      /* col index */
1456         for (krow=0; krow<bs; krow++) {
1457           atmp = PetscAbsScalar(*aa); aa++;
1458           row  = brow + krow;   /* row index */
1459           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1460           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1461         }
1462       }
1463       aj++;
1464     }
1465   }
1466   PetscCall(VecRestoreArray(v,&x));
1467   PetscFunctionReturn(0);
1468 }
1469 
1470 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1471 {
1472   PetscFunctionBegin;
1473   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1474   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1479 {
1480   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1481   PetscScalar       *z = c;
1482   const PetscScalar *xb;
1483   PetscScalar       x1;
1484   const MatScalar   *v = a->a,*vv;
1485   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1486 #if defined(PETSC_USE_COMPLEX)
1487   const int         aconj = A->hermitian == PETSC_BOOL3_TRUE;
1488 #else
1489   const int         aconj = 0;
1490 #endif
1491 
1492   PetscFunctionBegin;
1493   for (i=0; i<mbs; i++) {
1494     n = ii[1] - ii[0]; ii++;
1495     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1496     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1497     jj = idx;
1498     vv = v;
1499     for (k=0; k<cn; k++) {
1500       idx = jj;
1501       v = vv;
1502       for (j=0; j<n; j++) {
1503         xb = b + (*idx); x1 = xb[0+k*bm];
1504         z[0+k*cm] += v[0]*x1;
1505         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1506         v += 1;
1507         ++idx;
1508       }
1509     }
1510     z += 1;
1511   }
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1516 {
1517   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1518   PetscScalar       *z = c;
1519   const PetscScalar *xb;
1520   PetscScalar       x1,x2;
1521   const MatScalar   *v = a->a,*vv;
1522   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1523 
1524   PetscFunctionBegin;
1525   for (i=0; i<mbs; i++) {
1526     n = ii[1] - ii[0]; ii++;
1527     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1528     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1529     jj = idx;
1530     vv = v;
1531     for (k=0; k<cn; k++) {
1532       idx = jj;
1533       v = vv;
1534       for (j=0; j<n; j++) {
1535         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1536         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1537         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1538         if (*idx != i) {
1539           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1540           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1541         }
1542         v += 4;
1543         ++idx;
1544       }
1545     }
1546     z += 2;
1547   }
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1552 {
1553   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1554   PetscScalar       *z = c;
1555   const PetscScalar *xb;
1556   PetscScalar       x1,x2,x3;
1557   const MatScalar   *v = a->a,*vv;
1558   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1559 
1560   PetscFunctionBegin;
1561   for (i=0; i<mbs; i++) {
1562     n = ii[1] - ii[0]; ii++;
1563     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1564     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1565     jj = idx;
1566     vv = v;
1567     for (k=0; k<cn; k++) {
1568       idx = jj;
1569       v = vv;
1570       for (j=0; j<n; j++) {
1571         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1572         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1573         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1574         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1575         if (*idx != i) {
1576           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1577           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1578           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1579         }
1580         v += 9;
1581         ++idx;
1582       }
1583     }
1584     z += 3;
1585   }
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1590 {
1591   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1592   PetscScalar       *z = c;
1593   const PetscScalar *xb;
1594   PetscScalar       x1,x2,x3,x4;
1595   const MatScalar   *v = a->a,*vv;
1596   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1597 
1598   PetscFunctionBegin;
1599   for (i=0; i<mbs; i++) {
1600     n = ii[1] - ii[0]; ii++;
1601     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1602     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1603     jj = idx;
1604     vv = v;
1605     for (k=0; k<cn; k++) {
1606       idx = jj;
1607       v = vv;
1608       for (j=0; j<n; j++) {
1609         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1610         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1611         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1612         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1613         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1614         if (*idx != i) {
1615           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1616           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1617           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1618           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1619         }
1620         v += 16;
1621         ++idx;
1622       }
1623     }
1624     z += 4;
1625   }
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1630 {
1631   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1632   PetscScalar       *z = c;
1633   const PetscScalar *xb;
1634   PetscScalar       x1,x2,x3,x4,x5;
1635   const MatScalar   *v = a->a,*vv;
1636   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1637 
1638   PetscFunctionBegin;
1639   for (i=0; i<mbs; i++) {
1640     n = ii[1] - ii[0]; ii++;
1641     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1642     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1643     jj = idx;
1644     vv = v;
1645     for (k=0; k<cn; k++) {
1646       idx = jj;
1647       v = vv;
1648       for (j=0; j<n; j++) {
1649         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1650         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1651         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1652         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1653         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1654         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1655         if (*idx != i) {
1656           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1657           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1658           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1659           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1660           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1661         }
1662         v += 25;
1663         ++idx;
1664       }
1665     }
1666     z += 5;
1667   }
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1672 {
1673   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1674   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1675   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1676   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1677   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1678   PetscBLASInt      bbs,bcn,bbm,bcm;
1679   PetscScalar       *z = NULL;
1680   PetscScalar       *c,*b;
1681   const MatScalar   *v;
1682   const PetscInt    *idx,*ii;
1683   PetscScalar       _DOne=1.0;
1684 
1685   PetscFunctionBegin;
1686   if (!cm || !cn) PetscFunctionReturn(0);
1687   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1688   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1689   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1690   b = bd->v;
1691   PetscCall(MatZeroEntries(C));
1692   PetscCall(MatDenseGetArray(C,&c));
1693   switch (bs) {
1694   case 1:
1695     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1696     break;
1697   case 2:
1698     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1699     break;
1700   case 3:
1701     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1702     break;
1703   case 4:
1704     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1705     break;
1706   case 5:
1707     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1708     break;
1709   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1710     PetscCall(PetscBLASIntCast(bs,&bbs));
1711     PetscCall(PetscBLASIntCast(cn,&bcn));
1712     PetscCall(PetscBLASIntCast(bm,&bbm));
1713     PetscCall(PetscBLASIntCast(cm,&bcm));
1714     idx = a->j;
1715     v   = a->a;
1716     mbs = a->mbs;
1717     ii  = a->i;
1718     z   = c;
1719     for (i=0; i<mbs; i++) {
1720       n = ii[1] - ii[0]; ii++;
1721       for (j=0; j<n; j++) {
1722         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1723         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1724         v += bs2;
1725       }
1726       z += bs;
1727     }
1728   }
1729   PetscCall(MatDenseRestoreArray(C,&c));
1730   PetscCall(PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn));
1731   PetscFunctionReturn(0);
1732 }
1733