xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 6c2b77d522d8aa5c8b27f04fddd7150d0d6755fb)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
12   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2;
13   const PetscInt *idx;
14   PetscBT         table_out, table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs, &table_out));
23   PetscCall(PetscMalloc1(mbs + 1, &nidx));
24   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
25   PetscCall(PetscBTCreate(mbs, &table_in));
26 
27   for (i = 0; i < is_max; i++) { /* for each is */
28     isz = 0;
29     PetscCall(PetscBTMemzero(mbs, table_out));
30 
31     /* Extract the indices, assume there can be duplicate entries */
32     PetscCall(ISGetIndices(is[i], &idx));
33     PetscCall(ISGetLocalSize(is[i], &n));
34 
35     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36     bcol_max = 0;
37     for (j = 0; j < n; ++j) {
38       brow = idx[j] / bs; /* convert the indices into block indices */
39       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
40       if (!PetscBTLookupSet(table_out, brow)) {
41         nidx[isz++] = brow;
42         if (bcol_max < brow) bcol_max = brow;
43       }
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap */
50       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51       PetscCall(PetscBTMemzero(mbs, table_in));
52       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
53 
54       n = isz; /* length of the updated is[i] */
55       for (brow = 0; brow < mbs; brow++) {
56         start = ai[brow];
57         end   = ai[brow + 1];
58         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l < end; l++) {
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table_out, bcol)) {
62               nidx[isz++] = bcol;
63               if (bcol_max < bcol) bcol_max = bcol;
64             }
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else {             /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l < end; l++) {
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table_in, bcol)) {
73               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j = 0; j < isz; j++) {
83       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
84     }
85     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
86   }
87   PetscCall(PetscBTDestroy(&table_out));
88   PetscCall(PetscFree(nidx));
89   PetscCall(PetscFree(nidx2));
90   PetscCall(PetscBTDestroy(&table_in));
91   PetscFunctionReturn(0);
92 }
93 
94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95         Zero some ops' to avoid invalid usse */
96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97 {
98   PetscFunctionBegin;
99   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
100   Bseq->ops->mult                   = NULL;
101   Bseq->ops->multadd                = NULL;
102   Bseq->ops->multtranspose          = NULL;
103   Bseq->ops->multtransposeadd       = NULL;
104   Bseq->ops->lufactor               = NULL;
105   Bseq->ops->choleskyfactor         = NULL;
106   Bseq->ops->lufactorsymbolic       = NULL;
107   Bseq->ops->choleskyfactorsymbolic = NULL;
108   Bseq->ops->getinertia             = NULL;
109   PetscFunctionReturn(0);
110 }
111 
112 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
113 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
114 {
115   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
116   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
117   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
118   const PetscInt *irow, *icol;
119   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
120   PetscInt       *aj = a->j, *ai = a->i;
121   MatScalar      *mat_a;
122   Mat             C;
123   PetscBool       flag;
124 
125   PetscFunctionBegin;
126 
127   PetscCall(ISGetIndices(isrow, &irow));
128   PetscCall(ISGetIndices(iscol, &icol));
129   PetscCall(ISGetLocalSize(isrow, &nrows));
130   PetscCall(ISGetLocalSize(iscol, &ncols));
131 
132   PetscCall(PetscCalloc1(1 + oldcols, &smap));
133   ssmap = smap;
134   PetscCall(PetscMalloc1(1 + nrows, &lens));
135   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
136   /* determine lens of each row */
137   for (i = 0; i < nrows; i++) {
138     kstart  = ai[irow[i]];
139     kend    = kstart + a->ilen[irow[i]];
140     lens[i] = 0;
141     for (k = kstart; k < kend; k++) {
142       if (ssmap[aj[k]]) lens[i]++;
143     }
144   }
145   /* Create and fill new matrix */
146   if (scall == MAT_REUSE_MATRIX) {
147     c = (Mat_SeqSBAIJ *)((*B)->data);
148 
149     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
151     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
152     PetscCall(PetscArrayzero(c->ilen, c->mbs));
153     C = *B;
154   } else {
155     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
156     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
157     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
158     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
159   }
160   c = (Mat_SeqSBAIJ *)(C->data);
161   for (i = 0; i < nrows; i++) {
162     row      = irow[i];
163     kstart   = ai[row];
164     kend     = kstart + a->ilen[row];
165     mat_i    = c->i[i];
166     mat_j    = c->j + mat_i;
167     mat_a    = c->a + mat_i * bs2;
168     mat_ilen = c->ilen + i;
169     for (k = kstart; k < kend; k++) {
170       if ((tcol = ssmap[a->j[k]])) {
171         *mat_j++ = tcol - 1;
172         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
173         mat_a += bs2;
174         (*mat_ilen)++;
175       }
176     }
177   }
178   /* sort */
179   {
180     MatScalar *work;
181 
182     PetscCall(PetscMalloc1(bs2, &work));
183     for (i = 0; i < nrows; i++) {
184       PetscInt ilen;
185       mat_i = c->i[i];
186       mat_j = c->j + mat_i;
187       mat_a = c->a + mat_i * bs2;
188       ilen  = c->ilen[i];
189       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
190     }
191     PetscCall(PetscFree(work));
192   }
193 
194   /* Free work space */
195   PetscCall(ISRestoreIndices(iscol, &icol));
196   PetscCall(PetscFree(smap));
197   PetscCall(PetscFree(lens));
198   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
199   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
200 
201   PetscCall(ISRestoreIndices(isrow, &irow));
202   *B = C;
203   PetscFunctionReturn(0);
204 }
205 
206 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
207 {
208   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
209   IS              is1, is2;
210   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
211   const PetscInt *irow, *icol;
212 
213   PetscFunctionBegin;
214   PetscCall(ISGetIndices(isrow, &irow));
215   PetscCall(ISGetIndices(iscol, &icol));
216   PetscCall(ISGetLocalSize(isrow, &nrows));
217   PetscCall(ISGetLocalSize(iscol, &ncols));
218 
219   /* Verify if the indices corespond to each element in a block
220    and form the IS with compressed IS */
221   maxmnbs = PetscMax(a->mbs, a->nbs);
222   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
223   PetscCall(PetscArrayzero(vary, a->mbs));
224   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
225   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
226   count = 0;
227   for (i = 0; i < nrows; i++) {
228     PetscInt j = irow[i] / bs;
229     if ((vary[j]--) == bs) iary[count++] = j;
230   }
231   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
232 
233   PetscCall(PetscArrayzero(vary, a->nbs));
234   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
235   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
236   count = 0;
237   for (i = 0; i < ncols; i++) {
238     PetscInt j = icol[i] / bs;
239     if ((vary[j]--) == bs) iary[count++] = j;
240   }
241   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
242   PetscCall(ISRestoreIndices(isrow, &irow));
243   PetscCall(ISRestoreIndices(iscol, &icol));
244   PetscCall(PetscFree2(vary, iary));
245 
246   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
247   PetscCall(ISDestroy(&is1));
248   PetscCall(ISDestroy(&is2));
249 
250   if (isrow != iscol) {
251     PetscBool isequal;
252     PetscCall(ISEqual(isrow, iscol, &isequal));
253     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
254   }
255   PetscFunctionReturn(0);
256 }
257 
258 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
259 {
260   PetscInt i;
261 
262   PetscFunctionBegin;
263   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
264 
265   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
266   PetscFunctionReturn(0);
267 }
268 
269 /* -------------------------------------------------------*/
270 /* Should check that shapes of vectors and matrices match */
271 /* -------------------------------------------------------*/
272 
273 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
274 {
275   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
276   PetscScalar       *z, x1, x2, zero = 0.0;
277   const PetscScalar *x, *xb;
278   const MatScalar   *v;
279   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
280   const PetscInt    *aj = a->j, *ai = a->i, *ib;
281   PetscInt           nonzerorow = 0;
282 
283   PetscFunctionBegin;
284   PetscCall(VecSet(zz, zero));
285   if (!a->nz) PetscFunctionReturn(0);
286   PetscCall(VecGetArrayRead(xx, &x));
287   PetscCall(VecGetArray(zz, &z));
288 
289   v  = a->a;
290   xb = x;
291 
292   for (i = 0; i < mbs; i++) {
293     n    = ai[1] - ai[0]; /* length of i_th block row of A */
294     x1   = xb[0];
295     x2   = xb[1];
296     ib   = aj + *ai;
297     jmin = 0;
298     nonzerorow += (n > 0);
299     if (*ib == i) { /* (diag of A)*x */
300       z[2 * i] += v[0] * x1 + v[2] * x2;
301       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
302       v += 4;
303       jmin++;
304     }
305     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
306     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
307     for (j = jmin; j < n; j++) {
308       /* (strict lower triangular part of A)*x  */
309       cval = ib[j] * 2;
310       z[cval] += v[0] * x1 + v[1] * x2;
311       z[cval + 1] += v[2] * x1 + v[3] * x2;
312       /* (strict upper triangular part of A)*x  */
313       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
314       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
315       v += 4;
316     }
317     xb += 2;
318     ai++;
319   }
320 
321   PetscCall(VecRestoreArrayRead(xx, &x));
322   PetscCall(VecRestoreArray(zz, &z));
323   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
324   PetscFunctionReturn(0);
325 }
326 
327 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
328 {
329   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
330   PetscScalar       *z, x1, x2, x3, zero = 0.0;
331   const PetscScalar *x, *xb;
332   const MatScalar   *v;
333   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
334   const PetscInt    *aj = a->j, *ai = a->i, *ib;
335   PetscInt           nonzerorow = 0;
336 
337   PetscFunctionBegin;
338   PetscCall(VecSet(zz, zero));
339   if (!a->nz) PetscFunctionReturn(0);
340   PetscCall(VecGetArrayRead(xx, &x));
341   PetscCall(VecGetArray(zz, &z));
342 
343   v  = a->a;
344   xb = x;
345 
346   for (i = 0; i < mbs; i++) {
347     n    = ai[1] - ai[0]; /* length of i_th block row of A */
348     x1   = xb[0];
349     x2   = xb[1];
350     x3   = xb[2];
351     ib   = aj + *ai;
352     jmin = 0;
353     nonzerorow += (n > 0);
354     if (*ib == i) { /* (diag of A)*x */
355       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
356       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
357       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
358       v += 9;
359       jmin++;
360     }
361     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
362     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
363     for (j = jmin; j < n; j++) {
364       /* (strict lower triangular part of A)*x  */
365       cval = ib[j] * 3;
366       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
367       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
368       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
369       /* (strict upper triangular part of A)*x  */
370       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
371       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
372       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
373       v += 9;
374     }
375     xb += 3;
376     ai++;
377   }
378 
379   PetscCall(VecRestoreArrayRead(xx, &x));
380   PetscCall(VecRestoreArray(zz, &z));
381   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
382   PetscFunctionReturn(0);
383 }
384 
385 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
386 {
387   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
388   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
389   const PetscScalar *x, *xb;
390   const MatScalar   *v;
391   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
392   const PetscInt    *aj = a->j, *ai = a->i, *ib;
393   PetscInt           nonzerorow = 0;
394 
395   PetscFunctionBegin;
396   PetscCall(VecSet(zz, zero));
397   if (!a->nz) PetscFunctionReturn(0);
398   PetscCall(VecGetArrayRead(xx, &x));
399   PetscCall(VecGetArray(zz, &z));
400 
401   v  = a->a;
402   xb = x;
403 
404   for (i = 0; i < mbs; i++) {
405     n    = ai[1] - ai[0]; /* length of i_th block row of A */
406     x1   = xb[0];
407     x2   = xb[1];
408     x3   = xb[2];
409     x4   = xb[3];
410     ib   = aj + *ai;
411     jmin = 0;
412     nonzerorow += (n > 0);
413     if (*ib == i) { /* (diag of A)*x */
414       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
415       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
416       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
417       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
418       v += 16;
419       jmin++;
420     }
421     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
422     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
423     for (j = jmin; j < n; j++) {
424       /* (strict lower triangular part of A)*x  */
425       cval = ib[j] * 4;
426       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
427       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
428       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
429       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
430       /* (strict upper triangular part of A)*x  */
431       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
432       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
433       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
434       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
435       v += 16;
436     }
437     xb += 4;
438     ai++;
439   }
440 
441   PetscCall(VecRestoreArrayRead(xx, &x));
442   PetscCall(VecRestoreArray(zz, &z));
443   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
444   PetscFunctionReturn(0);
445 }
446 
447 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
448 {
449   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
450   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
451   const PetscScalar *x, *xb;
452   const MatScalar   *v;
453   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
454   const PetscInt    *aj = a->j, *ai = a->i, *ib;
455   PetscInt           nonzerorow = 0;
456 
457   PetscFunctionBegin;
458   PetscCall(VecSet(zz, zero));
459   if (!a->nz) PetscFunctionReturn(0);
460   PetscCall(VecGetArrayRead(xx, &x));
461   PetscCall(VecGetArray(zz, &z));
462 
463   v  = a->a;
464   xb = x;
465 
466   for (i = 0; i < mbs; i++) {
467     n    = ai[1] - ai[0]; /* length of i_th block row of A */
468     x1   = xb[0];
469     x2   = xb[1];
470     x3   = xb[2];
471     x4   = xb[3];
472     x5   = xb[4];
473     ib   = aj + *ai;
474     jmin = 0;
475     nonzerorow += (n > 0);
476     if (*ib == i) { /* (diag of A)*x */
477       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
478       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
479       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
480       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
481       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
482       v += 25;
483       jmin++;
484     }
485     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
486     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
487     for (j = jmin; j < n; j++) {
488       /* (strict lower triangular part of A)*x  */
489       cval = ib[j] * 5;
490       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
491       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
492       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
493       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
494       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
495       /* (strict upper triangular part of A)*x  */
496       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
497       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
498       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
499       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
500       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
501       v += 25;
502     }
503     xb += 5;
504     ai++;
505   }
506 
507   PetscCall(VecRestoreArrayRead(xx, &x));
508   PetscCall(VecRestoreArray(zz, &z));
509   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
510   PetscFunctionReturn(0);
511 }
512 
513 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
514 {
515   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
516   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
517   const PetscScalar *x, *xb;
518   const MatScalar   *v;
519   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
520   const PetscInt    *aj = a->j, *ai = a->i, *ib;
521   PetscInt           nonzerorow = 0;
522 
523   PetscFunctionBegin;
524   PetscCall(VecSet(zz, zero));
525   if (!a->nz) PetscFunctionReturn(0);
526   PetscCall(VecGetArrayRead(xx, &x));
527   PetscCall(VecGetArray(zz, &z));
528 
529   v  = a->a;
530   xb = x;
531 
532   for (i = 0; i < mbs; i++) {
533     n    = ai[1] - ai[0]; /* length of i_th block row of A */
534     x1   = xb[0];
535     x2   = xb[1];
536     x3   = xb[2];
537     x4   = xb[3];
538     x5   = xb[4];
539     x6   = xb[5];
540     ib   = aj + *ai;
541     jmin = 0;
542     nonzerorow += (n > 0);
543     if (*ib == i) { /* (diag of A)*x */
544       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
545       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
546       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
547       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
548       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
549       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
550       v += 36;
551       jmin++;
552     }
553     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
554     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
555     for (j = jmin; j < n; j++) {
556       /* (strict lower triangular part of A)*x  */
557       cval = ib[j] * 6;
558       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
559       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
560       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
561       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
562       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
563       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564       /* (strict upper triangular part of A)*x  */
565       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
566       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
567       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
568       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
569       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
570       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
571       v += 36;
572     }
573     xb += 6;
574     ai++;
575   }
576 
577   PetscCall(VecRestoreArrayRead(xx, &x));
578   PetscCall(VecRestoreArray(zz, &z));
579   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
580   PetscFunctionReturn(0);
581 }
582 
583 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
584 {
585   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
586   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
587   const PetscScalar *x, *xb;
588   const MatScalar   *v;
589   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
590   const PetscInt    *aj = a->j, *ai = a->i, *ib;
591   PetscInt           nonzerorow = 0;
592 
593   PetscFunctionBegin;
594   PetscCall(VecSet(zz, zero));
595   if (!a->nz) PetscFunctionReturn(0);
596   PetscCall(VecGetArrayRead(xx, &x));
597   PetscCall(VecGetArray(zz, &z));
598 
599   v  = a->a;
600   xb = x;
601 
602   for (i = 0; i < mbs; i++) {
603     n    = ai[1] - ai[0]; /* length of i_th block row of A */
604     x1   = xb[0];
605     x2   = xb[1];
606     x3   = xb[2];
607     x4   = xb[3];
608     x5   = xb[4];
609     x6   = xb[5];
610     x7   = xb[6];
611     ib   = aj + *ai;
612     jmin = 0;
613     nonzerorow += (n > 0);
614     if (*ib == i) { /* (diag of A)*x */
615       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
616       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
617       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
618       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
619       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
620       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
621       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
622       v += 49;
623       jmin++;
624     }
625     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
626     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
627     for (j = jmin; j < n; j++) {
628       /* (strict lower triangular part of A)*x  */
629       cval = ib[j] * 7;
630       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
631       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
632       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
633       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
634       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
635       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
636       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
637       /* (strict upper triangular part of A)*x  */
638       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
639       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
640       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
641       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
642       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
643       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
644       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
645       v += 49;
646     }
647     xb += 7;
648     ai++;
649   }
650   PetscCall(VecRestoreArrayRead(xx, &x));
651   PetscCall(VecRestoreArray(zz, &z));
652   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
653   PetscFunctionReturn(0);
654 }
655 
656 /*
657     This will not work with MatScalar == float because it calls the BLAS
658 */
659 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
660 {
661   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
662   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
663   const PetscScalar *x, *x_ptr, *xb;
664   const MatScalar   *v;
665   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
666   const PetscInt    *idx, *aj, *ii;
667   PetscInt           nonzerorow = 0;
668 
669   PetscFunctionBegin;
670   PetscCall(VecSet(zz, zero));
671   if (!a->nz) PetscFunctionReturn(0);
672   PetscCall(VecGetArrayRead(xx, &x));
673   PetscCall(VecGetArray(zz, &z));
674 
675   x_ptr = x;
676   z_ptr = z;
677 
678   aj = a->j;
679   v  = a->a;
680   ii = a->i;
681 
682   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
683   work = a->mult_work;
684 
685   for (i = 0; i < mbs; i++) {
686     n     = ii[1] - ii[0];
687     ncols = n * bs;
688     workt = work;
689     idx   = aj + ii[0];
690     nonzerorow += (n > 0);
691 
692     /* upper triangular part */
693     for (j = 0; j < n; j++) {
694       xb = x_ptr + bs * (*idx++);
695       for (k = 0; k < bs; k++) workt[k] = xb[k];
696       workt += bs;
697     }
698     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
699     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
700 
701     /* strict lower triangular part */
702     idx = aj + ii[0];
703     if (n && *idx == i) {
704       ncols -= bs;
705       v += bs2;
706       idx++;
707       n--;
708     }
709 
710     if (ncols > 0) {
711       workt = work;
712       PetscCall(PetscArrayzero(workt, ncols));
713       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
714       for (j = 0; j < n; j++) {
715         zb = z_ptr + bs * (*idx++);
716         for (k = 0; k < bs; k++) zb[k] += workt[k];
717         workt += bs;
718       }
719     }
720     x += bs;
721     v += n * bs2;
722     z += bs;
723     ii++;
724   }
725 
726   PetscCall(VecRestoreArrayRead(xx, &x));
727   PetscCall(VecRestoreArray(zz, &z));
728   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
729   PetscFunctionReturn(0);
730 }
731 
732 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
733 {
734   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
735   PetscScalar       *z, x1;
736   const PetscScalar *x, *xb;
737   const MatScalar   *v;
738   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
739   const PetscInt    *aj = a->j, *ai = a->i, *ib;
740   PetscInt           nonzerorow = 0;
741 #if defined(PETSC_USE_COMPLEX)
742   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
743 #else
744   const int aconj = 0;
745 #endif
746 
747   PetscFunctionBegin;
748   PetscCall(VecCopy(yy, zz));
749   if (!a->nz) PetscFunctionReturn(0);
750   PetscCall(VecGetArrayRead(xx, &x));
751   PetscCall(VecGetArray(zz, &z));
752   v  = a->a;
753   xb = x;
754 
755   for (i = 0; i < mbs; i++) {
756     n    = ai[1] - ai[0]; /* length of i_th row of A */
757     x1   = xb[0];
758     ib   = aj + *ai;
759     jmin = 0;
760     nonzerorow += (n > 0);
761     if (n && *ib == i) { /* (diag of A)*x */
762       z[i] += *v++ * x[*ib++];
763       jmin++;
764     }
765     if (aconj) {
766       for (j = jmin; j < n; j++) {
767         cval = *ib;
768         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
769         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
770       }
771     } else {
772       for (j = jmin; j < n; j++) {
773         cval = *ib;
774         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
775         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
776       }
777     }
778     xb++;
779     ai++;
780   }
781 
782   PetscCall(VecRestoreArrayRead(xx, &x));
783   PetscCall(VecRestoreArray(zz, &z));
784 
785   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
786   PetscFunctionReturn(0);
787 }
788 
789 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
790 {
791   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
792   PetscScalar       *z, x1, x2;
793   const PetscScalar *x, *xb;
794   const MatScalar   *v;
795   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
796   const PetscInt    *aj = a->j, *ai = a->i, *ib;
797   PetscInt           nonzerorow = 0;
798 
799   PetscFunctionBegin;
800   PetscCall(VecCopy(yy, zz));
801   if (!a->nz) PetscFunctionReturn(0);
802   PetscCall(VecGetArrayRead(xx, &x));
803   PetscCall(VecGetArray(zz, &z));
804 
805   v  = a->a;
806   xb = x;
807 
808   for (i = 0; i < mbs; i++) {
809     n    = ai[1] - ai[0]; /* length of i_th block row of A */
810     x1   = xb[0];
811     x2   = xb[1];
812     ib   = aj + *ai;
813     jmin = 0;
814     nonzerorow += (n > 0);
815     if (n && *ib == i) { /* (diag of A)*x */
816       z[2 * i] += v[0] * x1 + v[2] * x2;
817       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
818       v += 4;
819       jmin++;
820     }
821     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
822     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
823     for (j = jmin; j < n; j++) {
824       /* (strict lower triangular part of A)*x  */
825       cval = ib[j] * 2;
826       z[cval] += v[0] * x1 + v[1] * x2;
827       z[cval + 1] += v[2] * x1 + v[3] * x2;
828       /* (strict upper triangular part of A)*x  */
829       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
830       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
831       v += 4;
832     }
833     xb += 2;
834     ai++;
835   }
836   PetscCall(VecRestoreArrayRead(xx, &x));
837   PetscCall(VecRestoreArray(zz, &z));
838 
839   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
840   PetscFunctionReturn(0);
841 }
842 
843 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
844 {
845   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
846   PetscScalar       *z, x1, x2, x3;
847   const PetscScalar *x, *xb;
848   const MatScalar   *v;
849   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
850   const PetscInt    *aj = a->j, *ai = a->i, *ib;
851   PetscInt           nonzerorow = 0;
852 
853   PetscFunctionBegin;
854   PetscCall(VecCopy(yy, zz));
855   if (!a->nz) PetscFunctionReturn(0);
856   PetscCall(VecGetArrayRead(xx, &x));
857   PetscCall(VecGetArray(zz, &z));
858 
859   v  = a->a;
860   xb = x;
861 
862   for (i = 0; i < mbs; i++) {
863     n    = ai[1] - ai[0]; /* length of i_th block row of A */
864     x1   = xb[0];
865     x2   = xb[1];
866     x3   = xb[2];
867     ib   = aj + *ai;
868     jmin = 0;
869     nonzerorow += (n > 0);
870     if (n && *ib == i) { /* (diag of A)*x */
871       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
872       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
873       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
874       v += 9;
875       jmin++;
876     }
877     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
878     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
879     for (j = jmin; j < n; j++) {
880       /* (strict lower triangular part of A)*x  */
881       cval = ib[j] * 3;
882       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
883       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
884       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
885       /* (strict upper triangular part of A)*x  */
886       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
887       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
888       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
889       v += 9;
890     }
891     xb += 3;
892     ai++;
893   }
894 
895   PetscCall(VecRestoreArrayRead(xx, &x));
896   PetscCall(VecRestoreArray(zz, &z));
897 
898   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
899   PetscFunctionReturn(0);
900 }
901 
902 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
903 {
904   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
905   PetscScalar       *z, x1, x2, x3, x4;
906   const PetscScalar *x, *xb;
907   const MatScalar   *v;
908   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
909   const PetscInt    *aj = a->j, *ai = a->i, *ib;
910   PetscInt           nonzerorow = 0;
911 
912   PetscFunctionBegin;
913   PetscCall(VecCopy(yy, zz));
914   if (!a->nz) PetscFunctionReturn(0);
915   PetscCall(VecGetArrayRead(xx, &x));
916   PetscCall(VecGetArray(zz, &z));
917 
918   v  = a->a;
919   xb = x;
920 
921   for (i = 0; i < mbs; i++) {
922     n    = ai[1] - ai[0]; /* length of i_th block row of A */
923     x1   = xb[0];
924     x2   = xb[1];
925     x3   = xb[2];
926     x4   = xb[3];
927     ib   = aj + *ai;
928     jmin = 0;
929     nonzerorow += (n > 0);
930     if (n && *ib == i) { /* (diag of A)*x */
931       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
932       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
933       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
934       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
935       v += 16;
936       jmin++;
937     }
938     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
939     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
940     for (j = jmin; j < n; j++) {
941       /* (strict lower triangular part of A)*x  */
942       cval = ib[j] * 4;
943       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
944       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
945       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
946       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
947       /* (strict upper triangular part of A)*x  */
948       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
949       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
950       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
951       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
952       v += 16;
953     }
954     xb += 4;
955     ai++;
956   }
957 
958   PetscCall(VecRestoreArrayRead(xx, &x));
959   PetscCall(VecRestoreArray(zz, &z));
960 
961   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
962   PetscFunctionReturn(0);
963 }
964 
965 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
966 {
967   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
968   PetscScalar       *z, x1, x2, x3, x4, x5;
969   const PetscScalar *x, *xb;
970   const MatScalar   *v;
971   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
972   const PetscInt    *aj = a->j, *ai = a->i, *ib;
973   PetscInt           nonzerorow = 0;
974 
975   PetscFunctionBegin;
976   PetscCall(VecCopy(yy, zz));
977   if (!a->nz) PetscFunctionReturn(0);
978   PetscCall(VecGetArrayRead(xx, &x));
979   PetscCall(VecGetArray(zz, &z));
980 
981   v  = a->a;
982   xb = x;
983 
984   for (i = 0; i < mbs; i++) {
985     n    = ai[1] - ai[0]; /* length of i_th block row of A */
986     x1   = xb[0];
987     x2   = xb[1];
988     x3   = xb[2];
989     x4   = xb[3];
990     x5   = xb[4];
991     ib   = aj + *ai;
992     jmin = 0;
993     nonzerorow += (n > 0);
994     if (n && *ib == i) { /* (diag of A)*x */
995       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
996       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
997       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
998       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
999       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1000       v += 25;
1001       jmin++;
1002     }
1003     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1004     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1005     for (j = jmin; j < n; j++) {
1006       /* (strict lower triangular part of A)*x  */
1007       cval = ib[j] * 5;
1008       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1009       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1010       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1011       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1012       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1013       /* (strict upper triangular part of A)*x  */
1014       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1015       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1016       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1017       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1018       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1019       v += 25;
1020     }
1021     xb += 5;
1022     ai++;
1023   }
1024 
1025   PetscCall(VecRestoreArrayRead(xx, &x));
1026   PetscCall(VecRestoreArray(zz, &z));
1027 
1028   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1029   PetscFunctionReturn(0);
1030 }
1031 
1032 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1033 {
1034   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1035   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1036   const PetscScalar *x, *xb;
1037   const MatScalar   *v;
1038   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1039   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1040   PetscInt           nonzerorow = 0;
1041 
1042   PetscFunctionBegin;
1043   PetscCall(VecCopy(yy, zz));
1044   if (!a->nz) PetscFunctionReturn(0);
1045   PetscCall(VecGetArrayRead(xx, &x));
1046   PetscCall(VecGetArray(zz, &z));
1047 
1048   v  = a->a;
1049   xb = x;
1050 
1051   for (i = 0; i < mbs; i++) {
1052     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1053     x1   = xb[0];
1054     x2   = xb[1];
1055     x3   = xb[2];
1056     x4   = xb[3];
1057     x5   = xb[4];
1058     x6   = xb[5];
1059     ib   = aj + *ai;
1060     jmin = 0;
1061     nonzerorow += (n > 0);
1062     if (n && *ib == i) { /* (diag of A)*x */
1063       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1064       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1065       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1066       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1067       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1068       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1069       v += 36;
1070       jmin++;
1071     }
1072     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1073     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1074     for (j = jmin; j < n; j++) {
1075       /* (strict lower triangular part of A)*x  */
1076       cval = ib[j] * 6;
1077       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1078       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1079       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1080       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1081       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1082       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1083       /* (strict upper triangular part of A)*x  */
1084       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1085       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1086       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1087       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1088       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1089       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1090       v += 36;
1091     }
1092     xb += 6;
1093     ai++;
1094   }
1095 
1096   PetscCall(VecRestoreArrayRead(xx, &x));
1097   PetscCall(VecRestoreArray(zz, &z));
1098 
1099   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1100   PetscFunctionReturn(0);
1101 }
1102 
1103 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1104 {
1105   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1106   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1107   const PetscScalar *x, *xb;
1108   const MatScalar   *v;
1109   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1110   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1111   PetscInt           nonzerorow = 0;
1112 
1113   PetscFunctionBegin;
1114   PetscCall(VecCopy(yy, zz));
1115   if (!a->nz) PetscFunctionReturn(0);
1116   PetscCall(VecGetArrayRead(xx, &x));
1117   PetscCall(VecGetArray(zz, &z));
1118 
1119   v  = a->a;
1120   xb = x;
1121 
1122   for (i = 0; i < mbs; i++) {
1123     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1124     x1   = xb[0];
1125     x2   = xb[1];
1126     x3   = xb[2];
1127     x4   = xb[3];
1128     x5   = xb[4];
1129     x6   = xb[5];
1130     x7   = xb[6];
1131     ib   = aj + *ai;
1132     jmin = 0;
1133     nonzerorow += (n > 0);
1134     if (n && *ib == i) { /* (diag of A)*x */
1135       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1136       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1137       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1138       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1139       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1140       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1141       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1142       v += 49;
1143       jmin++;
1144     }
1145     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1146     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1147     for (j = jmin; j < n; j++) {
1148       /* (strict lower triangular part of A)*x  */
1149       cval = ib[j] * 7;
1150       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1151       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1152       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1153       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1154       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1155       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1156       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1157       /* (strict upper triangular part of A)*x  */
1158       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1159       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1160       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1161       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1162       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1163       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1164       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1165       v += 49;
1166     }
1167     xb += 7;
1168     ai++;
1169   }
1170 
1171   PetscCall(VecRestoreArrayRead(xx, &x));
1172   PetscCall(VecRestoreArray(zz, &z));
1173 
1174   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1179 {
1180   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1181   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1182   const PetscScalar *x, *x_ptr, *xb;
1183   const MatScalar   *v;
1184   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1185   const PetscInt    *idx, *aj, *ii;
1186   PetscInt           nonzerorow = 0;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(VecCopy(yy, zz));
1190   if (!a->nz) PetscFunctionReturn(0);
1191   PetscCall(VecGetArrayRead(xx, &x));
1192   x_ptr = x;
1193   PetscCall(VecGetArray(zz, &z));
1194   z_ptr = z;
1195 
1196   aj = a->j;
1197   v  = a->a;
1198   ii = a->i;
1199 
1200   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1201   work = a->mult_work;
1202 
1203   for (i = 0; i < mbs; i++) {
1204     n     = ii[1] - ii[0];
1205     ncols = n * bs;
1206     workt = work;
1207     idx   = aj + ii[0];
1208     nonzerorow += (n > 0);
1209 
1210     /* upper triangular part */
1211     for (j = 0; j < n; j++) {
1212       xb = x_ptr + bs * (*idx++);
1213       for (k = 0; k < bs; k++) workt[k] = xb[k];
1214       workt += bs;
1215     }
1216     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1217     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1218 
1219     /* strict lower triangular part */
1220     idx = aj + ii[0];
1221     if (n && *idx == i) {
1222       ncols -= bs;
1223       v += bs2;
1224       idx++;
1225       n--;
1226     }
1227     if (ncols > 0) {
1228       workt = work;
1229       PetscCall(PetscArrayzero(workt, ncols));
1230       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1231       for (j = 0; j < n; j++) {
1232         zb = z_ptr + bs * (*idx++);
1233         for (k = 0; k < bs; k++) zb[k] += workt[k];
1234         workt += bs;
1235       }
1236     }
1237 
1238     x += bs;
1239     v += n * bs2;
1240     z += bs;
1241     ii++;
1242   }
1243 
1244   PetscCall(VecRestoreArrayRead(xx, &x));
1245   PetscCall(VecRestoreArray(zz, &z));
1246 
1247   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1252 {
1253   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1254   PetscScalar   oalpha = alpha;
1255   PetscBLASInt  one    = 1, totalnz;
1256 
1257   PetscFunctionBegin;
1258   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1259   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1260   PetscCall(PetscLogFlops(totalnz));
1261   PetscFunctionReturn(0);
1262 }
1263 
1264 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1265 {
1266   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1267   const MatScalar *v        = a->a;
1268   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1269   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1270   const PetscInt  *aj = a->j, *col;
1271 
1272   PetscFunctionBegin;
1273   if (!a->nz) {
1274     *norm = 0.0;
1275     PetscFunctionReturn(0);
1276   }
1277   if (type == NORM_FROBENIUS) {
1278     for (k = 0; k < mbs; k++) {
1279       jmin = a->i[k];
1280       jmax = a->i[k + 1];
1281       col  = aj + jmin;
1282       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1283         for (i = 0; i < bs2; i++) {
1284           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1285           v++;
1286         }
1287         jmin++;
1288       }
1289       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1290         for (i = 0; i < bs2; i++) {
1291           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1292           v++;
1293         }
1294       }
1295     }
1296     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1297     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1298   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1299     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1300     for (i = 0; i < mbs; i++) jl[i] = mbs;
1301     il[0] = 0;
1302 
1303     *norm = 0.0;
1304     for (k = 0; k < mbs; k++) { /* k_th block row */
1305       for (j = 0; j < bs; j++) sum[j] = 0.0;
1306       /*-- col sum --*/
1307       i = jl[k]; /* first |A(i,k)| to be added */
1308       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1309                   at step k */
1310       while (i < mbs) {
1311         nexti = jl[i]; /* next block row to be added */
1312         ik    = il[i]; /* block index of A(i,k) in the array a */
1313         for (j = 0; j < bs; j++) {
1314           v = a->a + ik * bs2 + j * bs;
1315           for (k1 = 0; k1 < bs; k1++) {
1316             sum[j] += PetscAbsScalar(*v);
1317             v++;
1318           }
1319         }
1320         /* update il, jl */
1321         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1322         jmax = a->i[i + 1];
1323         if (jmin < jmax) {
1324           il[i] = jmin;
1325           j     = a->j[jmin];
1326           jl[i] = jl[j];
1327           jl[j] = i;
1328         }
1329         i = nexti;
1330       }
1331       /*-- row sum --*/
1332       jmin = a->i[k];
1333       jmax = a->i[k + 1];
1334       for (i = jmin; i < jmax; i++) {
1335         for (j = 0; j < bs; j++) {
1336           v = a->a + i * bs2 + j;
1337           for (k1 = 0; k1 < bs; k1++) {
1338             sum[j] += PetscAbsScalar(*v);
1339             v += bs;
1340           }
1341         }
1342       }
1343       /* add k_th block row to il, jl */
1344       col = aj + jmin;
1345       if (jmax - jmin > 0 && *col == k) jmin++;
1346       if (jmin < jmax) {
1347         il[k] = jmin;
1348         j     = a->j[jmin];
1349         jl[k] = jl[j];
1350         jl[j] = k;
1351       }
1352       for (j = 0; j < bs; j++) {
1353         if (sum[j] > *norm) *norm = sum[j];
1354       }
1355     }
1356     PetscCall(PetscFree3(sum, il, jl));
1357     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1358   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1363 {
1364   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1365 
1366   PetscFunctionBegin;
1367   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1368   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1369     *flg = PETSC_FALSE;
1370     PetscFunctionReturn(0);
1371   }
1372 
1373   /* if the a->i are the same */
1374   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1375   if (!*flg) PetscFunctionReturn(0);
1376 
1377   /* if a->j are the same */
1378   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1379   if (!*flg) PetscFunctionReturn(0);
1380 
1381   /* if a->a are the same */
1382   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1383   PetscFunctionReturn(0);
1384 }
1385 
1386 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1387 {
1388   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1389   PetscInt         i, j, k, row, bs, ambs, bs2;
1390   const PetscInt  *ai, *aj;
1391   PetscScalar     *x, zero = 0.0;
1392   const MatScalar *aa, *aa_j;
1393 
1394   PetscFunctionBegin;
1395   bs = A->rmap->bs;
1396   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1397 
1398   aa   = a->a;
1399   ambs = a->mbs;
1400 
1401   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1402     PetscInt *diag = a->diag;
1403     aa             = a->a;
1404     ambs           = a->mbs;
1405     PetscCall(VecGetArray(v, &x));
1406     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1407     PetscCall(VecRestoreArray(v, &x));
1408     PetscFunctionReturn(0);
1409   }
1410 
1411   ai  = a->i;
1412   aj  = a->j;
1413   bs2 = a->bs2;
1414   PetscCall(VecSet(v, zero));
1415   if (!a->nz) PetscFunctionReturn(0);
1416   PetscCall(VecGetArray(v, &x));
1417   for (i = 0; i < ambs; i++) {
1418     j = ai[i];
1419     if (aj[j] == i) { /* if this is a diagonal element */
1420       row  = i * bs;
1421       aa_j = aa + j * bs2;
1422       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1423     }
1424   }
1425   PetscCall(VecRestoreArray(v, &x));
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1430 {
1431   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1432   PetscScalar        x;
1433   const PetscScalar *l, *li, *ri;
1434   MatScalar         *aa, *v;
1435   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1436   const PetscInt    *ai, *aj;
1437   PetscBool          flg;
1438 
1439   PetscFunctionBegin;
1440   if (ll != rr) {
1441     PetscCall(VecEqual(ll, rr, &flg));
1442     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1443   }
1444   if (!ll) PetscFunctionReturn(0);
1445   ai  = a->i;
1446   aj  = a->j;
1447   aa  = a->a;
1448   m   = A->rmap->N;
1449   bs  = A->rmap->bs;
1450   mbs = a->mbs;
1451   bs2 = a->bs2;
1452 
1453   PetscCall(VecGetArrayRead(ll, &l));
1454   PetscCall(VecGetLocalSize(ll, &lm));
1455   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1456   for (i = 0; i < mbs; i++) { /* for each block row */
1457     M  = ai[i + 1] - ai[i];
1458     li = l + i * bs;
1459     v  = aa + bs2 * ai[i];
1460     for (j = 0; j < M; j++) { /* for each block */
1461       ri = l + bs * aj[ai[i] + j];
1462       for (k = 0; k < bs; k++) {
1463         x = ri[k];
1464         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1465       }
1466     }
1467   }
1468   PetscCall(VecRestoreArrayRead(ll, &l));
1469   PetscCall(PetscLogFlops(2.0 * a->nz));
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1474 {
1475   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1476 
1477   PetscFunctionBegin;
1478   info->block_size   = a->bs2;
1479   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1480   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1481   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1482   info->assemblies   = A->num_ass;
1483   info->mallocs      = A->info.mallocs;
1484   info->memory       = 0; /* REVIEW ME */
1485   if (A->factortype) {
1486     info->fill_ratio_given  = A->info.fill_ratio_given;
1487     info->fill_ratio_needed = A->info.fill_ratio_needed;
1488     info->factor_mallocs    = A->info.factor_mallocs;
1489   } else {
1490     info->fill_ratio_given  = 0;
1491     info->fill_ratio_needed = 0;
1492     info->factor_mallocs    = 0;
1493   }
1494   PetscFunctionReturn(0);
1495 }
1496 
1497 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1498 {
1499   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1500 
1501   PetscFunctionBegin;
1502   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1503   PetscFunctionReturn(0);
1504 }
1505 
1506 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1507 {
1508   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1509   PetscInt         i, j, n, row, col, bs, mbs;
1510   const PetscInt  *ai, *aj;
1511   PetscReal        atmp;
1512   const MatScalar *aa;
1513   PetscScalar     *x;
1514   PetscInt         ncols, brow, bcol, krow, kcol;
1515 
1516   PetscFunctionBegin;
1517   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1518   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1519   bs  = A->rmap->bs;
1520   aa  = a->a;
1521   ai  = a->i;
1522   aj  = a->j;
1523   mbs = a->mbs;
1524 
1525   PetscCall(VecSet(v, 0.0));
1526   PetscCall(VecGetArray(v, &x));
1527   PetscCall(VecGetLocalSize(v, &n));
1528   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1529   for (i = 0; i < mbs; i++) {
1530     ncols = ai[1] - ai[0];
1531     ai++;
1532     brow = bs * i;
1533     for (j = 0; j < ncols; j++) {
1534       bcol = bs * (*aj);
1535       for (kcol = 0; kcol < bs; kcol++) {
1536         col = bcol + kcol; /* col index */
1537         for (krow = 0; krow < bs; krow++) {
1538           atmp = PetscAbsScalar(*aa);
1539           aa++;
1540           row = brow + krow; /* row index */
1541           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1542           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1543         }
1544       }
1545       aj++;
1546     }
1547   }
1548   PetscCall(VecRestoreArray(v, &x));
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1553 {
1554   PetscFunctionBegin;
1555   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1556   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1557   PetscFunctionReturn(0);
1558 }
1559 
1560 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1561 {
1562   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1563   PetscScalar       *z = c;
1564   const PetscScalar *xb;
1565   PetscScalar        x1;
1566   const MatScalar   *v   = a->a, *vv;
1567   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1568 #if defined(PETSC_USE_COMPLEX)
1569   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1570 #else
1571   const int aconj = 0;
1572 #endif
1573 
1574   PetscFunctionBegin;
1575   for (i = 0; i < mbs; i++) {
1576     n = ii[1] - ii[0];
1577     ii++;
1578     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1579     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1580     jj = idx;
1581     vv = v;
1582     for (k = 0; k < cn; k++) {
1583       idx = jj;
1584       v   = vv;
1585       for (j = 0; j < n; j++) {
1586         xb = b + (*idx);
1587         x1 = xb[0 + k * bm];
1588         z[0 + k * cm] += v[0] * x1;
1589         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1590         v += 1;
1591         ++idx;
1592       }
1593     }
1594     z += 1;
1595   }
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1600 {
1601   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1602   PetscScalar       *z = c;
1603   const PetscScalar *xb;
1604   PetscScalar        x1, x2;
1605   const MatScalar   *v   = a->a, *vv;
1606   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1607 
1608   PetscFunctionBegin;
1609   for (i = 0; i < mbs; i++) {
1610     n = ii[1] - ii[0];
1611     ii++;
1612     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1613     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1614     jj = idx;
1615     vv = v;
1616     for (k = 0; k < cn; k++) {
1617       idx = jj;
1618       v   = vv;
1619       for (j = 0; j < n; j++) {
1620         xb = b + 2 * (*idx);
1621         x1 = xb[0 + k * bm];
1622         x2 = xb[1 + k * bm];
1623         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1624         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1625         if (*idx != i) {
1626           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1627           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1628         }
1629         v += 4;
1630         ++idx;
1631       }
1632     }
1633     z += 2;
1634   }
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1639 {
1640   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1641   PetscScalar       *z = c;
1642   const PetscScalar *xb;
1643   PetscScalar        x1, x2, x3;
1644   const MatScalar   *v   = a->a, *vv;
1645   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1646 
1647   PetscFunctionBegin;
1648   for (i = 0; i < mbs; i++) {
1649     n = ii[1] - ii[0];
1650     ii++;
1651     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1652     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1653     jj = idx;
1654     vv = v;
1655     for (k = 0; k < cn; k++) {
1656       idx = jj;
1657       v   = vv;
1658       for (j = 0; j < n; j++) {
1659         xb = b + 3 * (*idx);
1660         x1 = xb[0 + k * bm];
1661         x2 = xb[1 + k * bm];
1662         x3 = xb[2 + k * bm];
1663         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1664         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1665         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1666         if (*idx != i) {
1667           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1668           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1669           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1670         }
1671         v += 9;
1672         ++idx;
1673       }
1674     }
1675     z += 3;
1676   }
1677   PetscFunctionReturn(0);
1678 }
1679 
1680 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1681 {
1682   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1683   PetscScalar       *z = c;
1684   const PetscScalar *xb;
1685   PetscScalar        x1, x2, x3, x4;
1686   const MatScalar   *v   = a->a, *vv;
1687   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1688 
1689   PetscFunctionBegin;
1690   for (i = 0; i < mbs; i++) {
1691     n = ii[1] - ii[0];
1692     ii++;
1693     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1694     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1695     jj = idx;
1696     vv = v;
1697     for (k = 0; k < cn; k++) {
1698       idx = jj;
1699       v   = vv;
1700       for (j = 0; j < n; j++) {
1701         xb = b + 4 * (*idx);
1702         x1 = xb[0 + k * bm];
1703         x2 = xb[1 + k * bm];
1704         x3 = xb[2 + k * bm];
1705         x4 = xb[3 + k * bm];
1706         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1707         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1708         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1709         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1710         if (*idx != i) {
1711           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1712           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1713           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1714           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1715         }
1716         v += 16;
1717         ++idx;
1718       }
1719     }
1720     z += 4;
1721   }
1722   PetscFunctionReturn(0);
1723 }
1724 
1725 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1726 {
1727   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1728   PetscScalar       *z = c;
1729   const PetscScalar *xb;
1730   PetscScalar        x1, x2, x3, x4, x5;
1731   const MatScalar   *v   = a->a, *vv;
1732   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1733 
1734   PetscFunctionBegin;
1735   for (i = 0; i < mbs; i++) {
1736     n = ii[1] - ii[0];
1737     ii++;
1738     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1739     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1740     jj = idx;
1741     vv = v;
1742     for (k = 0; k < cn; k++) {
1743       idx = jj;
1744       v   = vv;
1745       for (j = 0; j < n; j++) {
1746         xb = b + 5 * (*idx);
1747         x1 = xb[0 + k * bm];
1748         x2 = xb[1 + k * bm];
1749         x3 = xb[2 + k * bm];
1750         x4 = xb[3 + k * bm];
1751         x5 = xb[4 + k * cm];
1752         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1753         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1754         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1755         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1756         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1757         if (*idx != i) {
1758           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1759           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1760           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1761           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1762           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1763         }
1764         v += 25;
1765         ++idx;
1766       }
1767     }
1768     z += 5;
1769   }
1770   PetscFunctionReturn(0);
1771 }
1772 
1773 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1774 {
1775   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1776   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1777   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1778   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1779   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1780   PetscBLASInt     bbs, bcn, bbm, bcm;
1781   PetscScalar     *z = NULL;
1782   PetscScalar     *c, *b;
1783   const MatScalar *v;
1784   const PetscInt  *idx, *ii;
1785   PetscScalar      _DOne = 1.0;
1786 
1787   PetscFunctionBegin;
1788   if (!cm || !cn) PetscFunctionReturn(0);
1789   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1790   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1791   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1792   b = bd->v;
1793   PetscCall(MatZeroEntries(C));
1794   PetscCall(MatDenseGetArray(C, &c));
1795   switch (bs) {
1796   case 1:
1797     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1798     break;
1799   case 2:
1800     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1801     break;
1802   case 3:
1803     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1804     break;
1805   case 4:
1806     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1807     break;
1808   case 5:
1809     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1810     break;
1811   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1812     PetscCall(PetscBLASIntCast(bs, &bbs));
1813     PetscCall(PetscBLASIntCast(cn, &bcn));
1814     PetscCall(PetscBLASIntCast(bm, &bbm));
1815     PetscCall(PetscBLASIntCast(cm, &bcm));
1816     idx = a->j;
1817     v   = a->a;
1818     mbs = a->mbs;
1819     ii  = a->i;
1820     z   = c;
1821     for (i = 0; i < mbs; i++) {
1822       n = ii[1] - ii[0];
1823       ii++;
1824       for (j = 0; j < n; j++) {
1825         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1826         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1827         v += bs2;
1828       }
1829       z += bs;
1830     }
1831   }
1832   PetscCall(MatDenseRestoreArray(C, &c));
1833   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1834   PetscFunctionReturn(0);
1835 }
1836