xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
10   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
11   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2;
12   const PetscInt *idx;
13   PetscBT         table_out, table_in;
14 
15   PetscFunctionBegin;
16   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17   mbs = a->mbs;
18   ai  = a->i;
19   aj  = a->j;
20   bs  = A->rmap->bs;
21   PetscCall(PetscBTCreate(mbs, &table_out));
22   PetscCall(PetscMalloc1(mbs + 1, &nidx));
23   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
24   PetscCall(PetscBTCreate(mbs, &table_in));
25 
26   for (i = 0; i < is_max; i++) { /* for each is */
27     isz = 0;
28     PetscCall(PetscBTMemzero(mbs, table_out));
29 
30     /* Extract the indices, assume there can be duplicate entries */
31     PetscCall(ISGetIndices(is[i], &idx));
32     PetscCall(ISGetLocalSize(is[i], &n));
33 
34     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35     bcol_max = 0;
36     for (j = 0; j < n; ++j) {
37       brow = idx[j] / bs; /* convert the indices into block indices */
38       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39       if (!PetscBTLookupSet(table_out, brow)) {
40         nidx[isz++] = brow;
41         if (bcol_max < brow) bcol_max = brow;
42       }
43     }
44     PetscCall(ISRestoreIndices(is[i], &idx));
45     PetscCall(ISDestroy(&is[i]));
46 
47     k = 0;
48     for (j = 0; j < ov; j++) { /* for each overlap */
49       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50       PetscCall(PetscBTMemzero(mbs, table_in));
51       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52 
53       n = isz; /* length of the updated is[i] */
54       for (brow = 0; brow < mbs; brow++) {
55         start = ai[brow];
56         end   = ai[brow + 1];
57         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l < end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out, bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else {             /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l < end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in, bcol)) {
72               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j = 0; j < isz; j++) {
82       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
83     }
84     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
85   }
86   PetscCall(PetscBTDestroy(&table_out));
87   PetscCall(PetscFree(nidx));
88   PetscCall(PetscFree(nidx2));
89   PetscCall(PetscBTDestroy(&table_in));
90   PetscFunctionReturn(0);
91 }
92 
93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94         Zero some ops' to avoid invalid usse */
95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) {
96   PetscFunctionBegin;
97   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
98   Bseq->ops->mult                   = NULL;
99   Bseq->ops->multadd                = NULL;
100   Bseq->ops->multtranspose          = NULL;
101   Bseq->ops->multtransposeadd       = NULL;
102   Bseq->ops->lufactor               = NULL;
103   Bseq->ops->choleskyfactor         = NULL;
104   Bseq->ops->lufactorsymbolic       = NULL;
105   Bseq->ops->choleskyfactorsymbolic = NULL;
106   Bseq->ops->getinertia             = NULL;
107   PetscFunctionReturn(0);
108 }
109 
110 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
111 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
112   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
113   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
114   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
115   const PetscInt *irow, *icol;
116   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
117   PetscInt       *aj = a->j, *ai = a->i;
118   MatScalar      *mat_a;
119   Mat             C;
120   PetscBool       flag;
121 
122   PetscFunctionBegin;
123 
124   PetscCall(ISGetIndices(isrow, &irow));
125   PetscCall(ISGetIndices(iscol, &icol));
126   PetscCall(ISGetLocalSize(isrow, &nrows));
127   PetscCall(ISGetLocalSize(iscol, &ncols));
128 
129   PetscCall(PetscCalloc1(1 + oldcols, &smap));
130   ssmap = smap;
131   PetscCall(PetscMalloc1(1 + nrows, &lens));
132   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
133   /* determine lens of each row */
134   for (i = 0; i < nrows; i++) {
135     kstart  = ai[irow[i]];
136     kend    = kstart + a->ilen[irow[i]];
137     lens[i] = 0;
138     for (k = kstart; k < kend; k++) {
139       if (ssmap[aj[k]]) lens[i]++;
140     }
141   }
142   /* Create and fill new matrix */
143   if (scall == MAT_REUSE_MATRIX) {
144     c = (Mat_SeqSBAIJ *)((*B)->data);
145 
146     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
147     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
148     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
149     PetscCall(PetscArrayzero(c->ilen, c->mbs));
150     C = *B;
151   } else {
152     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
153     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
154     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
155     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
156   }
157   c = (Mat_SeqSBAIJ *)(C->data);
158   for (i = 0; i < nrows; i++) {
159     row      = irow[i];
160     kstart   = ai[row];
161     kend     = kstart + a->ilen[row];
162     mat_i    = c->i[i];
163     mat_j    = c->j + mat_i;
164     mat_a    = c->a + mat_i * bs2;
165     mat_ilen = c->ilen + i;
166     for (k = kstart; k < kend; k++) {
167       if ((tcol = ssmap[a->j[k]])) {
168         *mat_j++ = tcol - 1;
169         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
170         mat_a += bs2;
171         (*mat_ilen)++;
172       }
173     }
174   }
175   /* sort */
176   {
177     MatScalar *work;
178 
179     PetscCall(PetscMalloc1(bs2, &work));
180     for (i = 0; i < nrows; i++) {
181       PetscInt ilen;
182       mat_i = c->i[i];
183       mat_j = c->j + mat_i;
184       mat_a = c->a + mat_i * bs2;
185       ilen  = c->ilen[i];
186       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
187     }
188     PetscCall(PetscFree(work));
189   }
190 
191   /* Free work space */
192   PetscCall(ISRestoreIndices(iscol, &icol));
193   PetscCall(PetscFree(smap));
194   PetscCall(PetscFree(lens));
195   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
196   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
197 
198   PetscCall(ISRestoreIndices(isrow, &irow));
199   *B = C;
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
204   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
205   IS              is1, is2;
206   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
207   const PetscInt *irow, *icol;
208 
209   PetscFunctionBegin;
210   PetscCall(ISGetIndices(isrow, &irow));
211   PetscCall(ISGetIndices(iscol, &icol));
212   PetscCall(ISGetLocalSize(isrow, &nrows));
213   PetscCall(ISGetLocalSize(iscol, &ncols));
214 
215   /* Verify if the indices corespond to each element in a block
216    and form the IS with compressed IS */
217   maxmnbs = PetscMax(a->mbs, a->nbs);
218   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
219   PetscCall(PetscArrayzero(vary, a->mbs));
220   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
221   for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); }
222   count = 0;
223   for (i = 0; i < nrows; i++) {
224     PetscInt j = irow[i] / bs;
225     if ((vary[j]--) == bs) iary[count++] = j;
226   }
227   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
228 
229   PetscCall(PetscArrayzero(vary, a->nbs));
230   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
231   for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); }
232   count = 0;
233   for (i = 0; i < ncols; i++) {
234     PetscInt j = icol[i] / bs;
235     if ((vary[j]--) == bs) iary[count++] = j;
236   }
237   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
238   PetscCall(ISRestoreIndices(isrow, &irow));
239   PetscCall(ISRestoreIndices(iscol, &icol));
240   PetscCall(PetscFree2(vary, iary));
241 
242   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
243   PetscCall(ISDestroy(&is1));
244   PetscCall(ISDestroy(&is2));
245 
246   if (isrow != iscol) {
247     PetscBool isequal;
248     PetscCall(ISEqual(isrow, iscol, &isequal));
249     if (!isequal) { PetscCall(MatSeqSBAIJZeroOps_Private(*B)); }
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
255   PetscInt i;
256 
257   PetscFunctionBegin;
258   if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n + 1, B)); }
259 
260   for (i = 0; i < n; i++) { PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); }
261   PetscFunctionReturn(0);
262 }
263 
264 /* -------------------------------------------------------*/
265 /* Should check that shapes of vectors and matrices match */
266 /* -------------------------------------------------------*/
267 
268 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) {
269   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
270   PetscScalar       *z, x1, x2, zero = 0.0;
271   const PetscScalar *x, *xb;
272   const MatScalar   *v;
273   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
274   const PetscInt    *aj = a->j, *ai = a->i, *ib;
275   PetscInt           nonzerorow = 0;
276 
277   PetscFunctionBegin;
278   PetscCall(VecSet(zz, zero));
279   if (!a->nz) PetscFunctionReturn(0);
280   PetscCall(VecGetArrayRead(xx, &x));
281   PetscCall(VecGetArray(zz, &z));
282 
283   v  = a->a;
284   xb = x;
285 
286   for (i = 0; i < mbs; i++) {
287     n    = ai[1] - ai[0]; /* length of i_th block row of A */
288     x1   = xb[0];
289     x2   = xb[1];
290     ib   = aj + *ai;
291     jmin = 0;
292     nonzerorow += (n > 0);
293     if (*ib == i) { /* (diag of A)*x */
294       z[2 * i] += v[0] * x1 + v[2] * x2;
295       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
296       v += 4;
297       jmin++;
298     }
299     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
300     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
301     for (j = jmin; j < n; j++) {
302       /* (strict lower triangular part of A)*x  */
303       cval = ib[j] * 2;
304       z[cval] += v[0] * x1 + v[1] * x2;
305       z[cval + 1] += v[2] * x1 + v[3] * x2;
306       /* (strict upper triangular part of A)*x  */
307       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
308       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
309       v += 4;
310     }
311     xb += 2;
312     ai++;
313   }
314 
315   PetscCall(VecRestoreArrayRead(xx, &x));
316   PetscCall(VecRestoreArray(zz, &z));
317   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
318   PetscFunctionReturn(0);
319 }
320 
321 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) {
322   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
323   PetscScalar       *z, x1, x2, x3, zero = 0.0;
324   const PetscScalar *x, *xb;
325   const MatScalar   *v;
326   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
327   const PetscInt    *aj = a->j, *ai = a->i, *ib;
328   PetscInt           nonzerorow = 0;
329 
330   PetscFunctionBegin;
331   PetscCall(VecSet(zz, zero));
332   if (!a->nz) PetscFunctionReturn(0);
333   PetscCall(VecGetArrayRead(xx, &x));
334   PetscCall(VecGetArray(zz, &z));
335 
336   v  = a->a;
337   xb = x;
338 
339   for (i = 0; i < mbs; i++) {
340     n    = ai[1] - ai[0]; /* length of i_th block row of A */
341     x1   = xb[0];
342     x2   = xb[1];
343     x3   = xb[2];
344     ib   = aj + *ai;
345     jmin = 0;
346     nonzerorow += (n > 0);
347     if (*ib == i) { /* (diag of A)*x */
348       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
349       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
350       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
351       v += 9;
352       jmin++;
353     }
354     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356     for (j = jmin; j < n; j++) {
357       /* (strict lower triangular part of A)*x  */
358       cval = ib[j] * 3;
359       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
360       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
361       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
362       /* (strict upper triangular part of A)*x  */
363       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
364       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
365       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
366       v += 9;
367     }
368     xb += 3;
369     ai++;
370   }
371 
372   PetscCall(VecRestoreArrayRead(xx, &x));
373   PetscCall(VecRestoreArray(zz, &z));
374   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
375   PetscFunctionReturn(0);
376 }
377 
378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) {
379   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
380   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
381   const PetscScalar *x, *xb;
382   const MatScalar   *v;
383   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
384   const PetscInt    *aj = a->j, *ai = a->i, *ib;
385   PetscInt           nonzerorow = 0;
386 
387   PetscFunctionBegin;
388   PetscCall(VecSet(zz, zero));
389   if (!a->nz) PetscFunctionReturn(0);
390   PetscCall(VecGetArrayRead(xx, &x));
391   PetscCall(VecGetArray(zz, &z));
392 
393   v  = a->a;
394   xb = x;
395 
396   for (i = 0; i < mbs; i++) {
397     n    = ai[1] - ai[0]; /* length of i_th block row of A */
398     x1   = xb[0];
399     x2   = xb[1];
400     x3   = xb[2];
401     x4   = xb[3];
402     ib   = aj + *ai;
403     jmin = 0;
404     nonzerorow += (n > 0);
405     if (*ib == i) { /* (diag of A)*x */
406       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
407       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
408       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
409       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
410       v += 16;
411       jmin++;
412     }
413     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
414     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415     for (j = jmin; j < n; j++) {
416       /* (strict lower triangular part of A)*x  */
417       cval = ib[j] * 4;
418       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
419       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
420       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
421       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
422       /* (strict upper triangular part of A)*x  */
423       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
424       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
425       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
426       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
427       v += 16;
428     }
429     xb += 4;
430     ai++;
431   }
432 
433   PetscCall(VecRestoreArrayRead(xx, &x));
434   PetscCall(VecRestoreArray(zz, &z));
435   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
436   PetscFunctionReturn(0);
437 }
438 
439 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) {
440   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
441   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
442   const PetscScalar *x, *xb;
443   const MatScalar   *v;
444   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
445   const PetscInt    *aj = a->j, *ai = a->i, *ib;
446   PetscInt           nonzerorow = 0;
447 
448   PetscFunctionBegin;
449   PetscCall(VecSet(zz, zero));
450   if (!a->nz) PetscFunctionReturn(0);
451   PetscCall(VecGetArrayRead(xx, &x));
452   PetscCall(VecGetArray(zz, &z));
453 
454   v  = a->a;
455   xb = x;
456 
457   for (i = 0; i < mbs; i++) {
458     n    = ai[1] - ai[0]; /* length of i_th block row of A */
459     x1   = xb[0];
460     x2   = xb[1];
461     x3   = xb[2];
462     x4   = xb[3];
463     x5   = xb[4];
464     ib   = aj + *ai;
465     jmin = 0;
466     nonzerorow += (n > 0);
467     if (*ib == i) { /* (diag of A)*x */
468       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
469       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
470       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
471       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
472       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
473       v += 25;
474       jmin++;
475     }
476     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
477     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
478     for (j = jmin; j < n; j++) {
479       /* (strict lower triangular part of A)*x  */
480       cval = ib[j] * 5;
481       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
482       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
483       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
484       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
485       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
486       /* (strict upper triangular part of A)*x  */
487       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
488       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
489       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
490       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
491       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
492       v += 25;
493     }
494     xb += 5;
495     ai++;
496   }
497 
498   PetscCall(VecRestoreArrayRead(xx, &x));
499   PetscCall(VecRestoreArray(zz, &z));
500   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
501   PetscFunctionReturn(0);
502 }
503 
504 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) {
505   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
506   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
507   const PetscScalar *x, *xb;
508   const MatScalar   *v;
509   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
510   const PetscInt    *aj = a->j, *ai = a->i, *ib;
511   PetscInt           nonzerorow = 0;
512 
513   PetscFunctionBegin;
514   PetscCall(VecSet(zz, zero));
515   if (!a->nz) PetscFunctionReturn(0);
516   PetscCall(VecGetArrayRead(xx, &x));
517   PetscCall(VecGetArray(zz, &z));
518 
519   v  = a->a;
520   xb = x;
521 
522   for (i = 0; i < mbs; i++) {
523     n    = ai[1] - ai[0]; /* length of i_th block row of A */
524     x1   = xb[0];
525     x2   = xb[1];
526     x3   = xb[2];
527     x4   = xb[3];
528     x5   = xb[4];
529     x6   = xb[5];
530     ib   = aj + *ai;
531     jmin = 0;
532     nonzerorow += (n > 0);
533     if (*ib == i) { /* (diag of A)*x */
534       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540       v += 36;
541       jmin++;
542     }
543     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545     for (j = jmin; j < n; j++) {
546       /* (strict lower triangular part of A)*x  */
547       cval = ib[j] * 6;
548       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554       /* (strict upper triangular part of A)*x  */
555       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561       v += 36;
562     }
563     xb += 6;
564     ai++;
565   }
566 
567   PetscCall(VecRestoreArrayRead(xx, &x));
568   PetscCall(VecRestoreArray(zz, &z));
569   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
570   PetscFunctionReturn(0);
571 }
572 
573 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) {
574   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
575   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
576   const PetscScalar *x, *xb;
577   const MatScalar   *v;
578   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
579   const PetscInt    *aj = a->j, *ai = a->i, *ib;
580   PetscInt           nonzerorow = 0;
581 
582   PetscFunctionBegin;
583   PetscCall(VecSet(zz, zero));
584   if (!a->nz) PetscFunctionReturn(0);
585   PetscCall(VecGetArrayRead(xx, &x));
586   PetscCall(VecGetArray(zz, &z));
587 
588   v  = a->a;
589   xb = x;
590 
591   for (i = 0; i < mbs; i++) {
592     n    = ai[1] - ai[0]; /* length of i_th block row of A */
593     x1   = xb[0];
594     x2   = xb[1];
595     x3   = xb[2];
596     x4   = xb[3];
597     x5   = xb[4];
598     x6   = xb[5];
599     x7   = xb[6];
600     ib   = aj + *ai;
601     jmin = 0;
602     nonzerorow += (n > 0);
603     if (*ib == i) { /* (diag of A)*x */
604       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
605       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
606       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
607       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
608       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
609       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
610       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
611       v += 49;
612       jmin++;
613     }
614     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
615     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616     for (j = jmin; j < n; j++) {
617       /* (strict lower triangular part of A)*x  */
618       cval = ib[j] * 7;
619       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
620       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
621       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
622       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
623       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
624       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
625       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
626       /* (strict upper triangular part of A)*x  */
627       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
628       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
629       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
630       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
631       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
632       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
633       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
634       v += 49;
635     }
636     xb += 7;
637     ai++;
638   }
639   PetscCall(VecRestoreArrayRead(xx, &x));
640   PetscCall(VecRestoreArray(zz, &z));
641   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
642   PetscFunctionReturn(0);
643 }
644 
645 /*
646     This will not work with MatScalar == float because it calls the BLAS
647 */
648 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) {
649   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
650   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
651   const PetscScalar *x, *x_ptr, *xb;
652   const MatScalar   *v;
653   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
654   const PetscInt    *idx, *aj, *ii;
655   PetscInt           nonzerorow = 0;
656 
657   PetscFunctionBegin;
658   PetscCall(VecSet(zz, zero));
659   if (!a->nz) PetscFunctionReturn(0);
660   PetscCall(VecGetArrayRead(xx, &x));
661   PetscCall(VecGetArray(zz, &z));
662 
663   x_ptr = x;
664   z_ptr = z;
665 
666   aj = a->j;
667   v  = a->a;
668   ii = a->i;
669 
670   if (!a->mult_work) { PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); }
671   work = a->mult_work;
672 
673   for (i = 0; i < mbs; i++) {
674     n     = ii[1] - ii[0];
675     ncols = n * bs;
676     workt = work;
677     idx   = aj + ii[0];
678     nonzerorow += (n > 0);
679 
680     /* upper triangular part */
681     for (j = 0; j < n; j++) {
682       xb = x_ptr + bs * (*idx++);
683       for (k = 0; k < bs; k++) workt[k] = xb[k];
684       workt += bs;
685     }
686     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
687     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
688 
689     /* strict lower triangular part */
690     idx = aj + ii[0];
691     if (n && *idx == i) {
692       ncols -= bs;
693       v += bs2;
694       idx++;
695       n--;
696     }
697 
698     if (ncols > 0) {
699       workt = work;
700       PetscCall(PetscArrayzero(workt, ncols));
701       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
702       for (j = 0; j < n; j++) {
703         zb = z_ptr + bs * (*idx++);
704         for (k = 0; k < bs; k++) zb[k] += workt[k];
705         workt += bs;
706       }
707     }
708     x += bs;
709     v += n * bs2;
710     z += bs;
711     ii++;
712   }
713 
714   PetscCall(VecRestoreArrayRead(xx, &x));
715   PetscCall(VecRestoreArray(zz, &z));
716   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
717   PetscFunctionReturn(0);
718 }
719 
720 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
721   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
722   PetscScalar       *z, x1;
723   const PetscScalar *x, *xb;
724   const MatScalar   *v;
725   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
726   const PetscInt    *aj = a->j, *ai = a->i, *ib;
727   PetscInt           nonzerorow = 0;
728 #if defined(PETSC_USE_COMPLEX)
729   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
730 #else
731   const int aconj = 0;
732 #endif
733 
734   PetscFunctionBegin;
735   PetscCall(VecCopy(yy, zz));
736   if (!a->nz) PetscFunctionReturn(0);
737   PetscCall(VecGetArrayRead(xx, &x));
738   PetscCall(VecGetArray(zz, &z));
739   v  = a->a;
740   xb = x;
741 
742   for (i = 0; i < mbs; i++) {
743     n    = ai[1] - ai[0]; /* length of i_th row of A */
744     x1   = xb[0];
745     ib   = aj + *ai;
746     jmin = 0;
747     nonzerorow += (n > 0);
748     if (n && *ib == i) { /* (diag of A)*x */
749       z[i] += *v++ * x[*ib++];
750       jmin++;
751     }
752     if (aconj) {
753       for (j = jmin; j < n; j++) {
754         cval = *ib;
755         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
756         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
757       }
758     } else {
759       for (j = jmin; j < n; j++) {
760         cval = *ib;
761         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
762         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
763       }
764     }
765     xb++;
766     ai++;
767   }
768 
769   PetscCall(VecRestoreArrayRead(xx, &x));
770   PetscCall(VecRestoreArray(zz, &z));
771 
772   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
773   PetscFunctionReturn(0);
774 }
775 
776 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
777   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
778   PetscScalar       *z, x1, x2;
779   const PetscScalar *x, *xb;
780   const MatScalar   *v;
781   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
782   const PetscInt    *aj = a->j, *ai = a->i, *ib;
783   PetscInt           nonzerorow = 0;
784 
785   PetscFunctionBegin;
786   PetscCall(VecCopy(yy, zz));
787   if (!a->nz) PetscFunctionReturn(0);
788   PetscCall(VecGetArrayRead(xx, &x));
789   PetscCall(VecGetArray(zz, &z));
790 
791   v  = a->a;
792   xb = x;
793 
794   for (i = 0; i < mbs; i++) {
795     n    = ai[1] - ai[0]; /* length of i_th block row of A */
796     x1   = xb[0];
797     x2   = xb[1];
798     ib   = aj + *ai;
799     jmin = 0;
800     nonzerorow += (n > 0);
801     if (n && *ib == i) { /* (diag of A)*x */
802       z[2 * i] += v[0] * x1 + v[2] * x2;
803       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
804       v += 4;
805       jmin++;
806     }
807     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
808     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
809     for (j = jmin; j < n; j++) {
810       /* (strict lower triangular part of A)*x  */
811       cval = ib[j] * 2;
812       z[cval] += v[0] * x1 + v[1] * x2;
813       z[cval + 1] += v[2] * x1 + v[3] * x2;
814       /* (strict upper triangular part of A)*x  */
815       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
816       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
817       v += 4;
818     }
819     xb += 2;
820     ai++;
821   }
822   PetscCall(VecRestoreArrayRead(xx, &x));
823   PetscCall(VecRestoreArray(zz, &z));
824 
825   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
830   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
831   PetscScalar       *z, x1, x2, x3;
832   const PetscScalar *x, *xb;
833   const MatScalar   *v;
834   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
835   const PetscInt    *aj = a->j, *ai = a->i, *ib;
836   PetscInt           nonzerorow = 0;
837 
838   PetscFunctionBegin;
839   PetscCall(VecCopy(yy, zz));
840   if (!a->nz) PetscFunctionReturn(0);
841   PetscCall(VecGetArrayRead(xx, &x));
842   PetscCall(VecGetArray(zz, &z));
843 
844   v  = a->a;
845   xb = x;
846 
847   for (i = 0; i < mbs; i++) {
848     n    = ai[1] - ai[0]; /* length of i_th block row of A */
849     x1   = xb[0];
850     x2   = xb[1];
851     x3   = xb[2];
852     ib   = aj + *ai;
853     jmin = 0;
854     nonzerorow += (n > 0);
855     if (n && *ib == i) { /* (diag of A)*x */
856       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
857       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
858       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
859       v += 9;
860       jmin++;
861     }
862     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
863     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864     for (j = jmin; j < n; j++) {
865       /* (strict lower triangular part of A)*x  */
866       cval = ib[j] * 3;
867       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
868       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
869       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
870       /* (strict upper triangular part of A)*x  */
871       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
872       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
873       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
874       v += 9;
875     }
876     xb += 3;
877     ai++;
878   }
879 
880   PetscCall(VecRestoreArrayRead(xx, &x));
881   PetscCall(VecRestoreArray(zz, &z));
882 
883   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
884   PetscFunctionReturn(0);
885 }
886 
887 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
888   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
889   PetscScalar       *z, x1, x2, x3, x4;
890   const PetscScalar *x, *xb;
891   const MatScalar   *v;
892   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
893   const PetscInt    *aj = a->j, *ai = a->i, *ib;
894   PetscInt           nonzerorow = 0;
895 
896   PetscFunctionBegin;
897   PetscCall(VecCopy(yy, zz));
898   if (!a->nz) PetscFunctionReturn(0);
899   PetscCall(VecGetArrayRead(xx, &x));
900   PetscCall(VecGetArray(zz, &z));
901 
902   v  = a->a;
903   xb = x;
904 
905   for (i = 0; i < mbs; i++) {
906     n    = ai[1] - ai[0]; /* length of i_th block row of A */
907     x1   = xb[0];
908     x2   = xb[1];
909     x3   = xb[2];
910     x4   = xb[3];
911     ib   = aj + *ai;
912     jmin = 0;
913     nonzerorow += (n > 0);
914     if (n && *ib == i) { /* (diag of A)*x */
915       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
916       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
917       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
918       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
919       v += 16;
920       jmin++;
921     }
922     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
923     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
924     for (j = jmin; j < n; j++) {
925       /* (strict lower triangular part of A)*x  */
926       cval = ib[j] * 4;
927       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
928       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
929       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
930       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
931       /* (strict upper triangular part of A)*x  */
932       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
933       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
934       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
935       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
936       v += 16;
937     }
938     xb += 4;
939     ai++;
940   }
941 
942   PetscCall(VecRestoreArrayRead(xx, &x));
943   PetscCall(VecRestoreArray(zz, &z));
944 
945   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
946   PetscFunctionReturn(0);
947 }
948 
949 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
950   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
951   PetscScalar       *z, x1, x2, x3, x4, x5;
952   const PetscScalar *x, *xb;
953   const MatScalar   *v;
954   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
955   const PetscInt    *aj = a->j, *ai = a->i, *ib;
956   PetscInt           nonzerorow = 0;
957 
958   PetscFunctionBegin;
959   PetscCall(VecCopy(yy, zz));
960   if (!a->nz) PetscFunctionReturn(0);
961   PetscCall(VecGetArrayRead(xx, &x));
962   PetscCall(VecGetArray(zz, &z));
963 
964   v  = a->a;
965   xb = x;
966 
967   for (i = 0; i < mbs; i++) {
968     n    = ai[1] - ai[0]; /* length of i_th block row of A */
969     x1   = xb[0];
970     x2   = xb[1];
971     x3   = xb[2];
972     x4   = xb[3];
973     x5   = xb[4];
974     ib   = aj + *ai;
975     jmin = 0;
976     nonzerorow += (n > 0);
977     if (n && *ib == i) { /* (diag of A)*x */
978       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
979       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
980       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
981       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
982       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
983       v += 25;
984       jmin++;
985     }
986     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
987     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
988     for (j = jmin; j < n; j++) {
989       /* (strict lower triangular part of A)*x  */
990       cval = ib[j] * 5;
991       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
992       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
993       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
994       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
995       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
996       /* (strict upper triangular part of A)*x  */
997       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
998       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
999       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1000       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1001       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1002       v += 25;
1003     }
1004     xb += 5;
1005     ai++;
1006   }
1007 
1008   PetscCall(VecRestoreArrayRead(xx, &x));
1009   PetscCall(VecRestoreArray(zz, &z));
1010 
1011   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
1016   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1017   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1018   const PetscScalar *x, *xb;
1019   const MatScalar   *v;
1020   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1021   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1022   PetscInt           nonzerorow = 0;
1023 
1024   PetscFunctionBegin;
1025   PetscCall(VecCopy(yy, zz));
1026   if (!a->nz) PetscFunctionReturn(0);
1027   PetscCall(VecGetArrayRead(xx, &x));
1028   PetscCall(VecGetArray(zz, &z));
1029 
1030   v  = a->a;
1031   xb = x;
1032 
1033   for (i = 0; i < mbs; i++) {
1034     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1035     x1   = xb[0];
1036     x2   = xb[1];
1037     x3   = xb[2];
1038     x4   = xb[3];
1039     x5   = xb[4];
1040     x6   = xb[5];
1041     ib   = aj + *ai;
1042     jmin = 0;
1043     nonzerorow += (n > 0);
1044     if (n && *ib == i) { /* (diag of A)*x */
1045       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1046       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1047       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1048       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1049       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1050       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1051       v += 36;
1052       jmin++;
1053     }
1054     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1055     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1056     for (j = jmin; j < n; j++) {
1057       /* (strict lower triangular part of A)*x  */
1058       cval = ib[j] * 6;
1059       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1060       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1061       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1062       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1063       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1064       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1065       /* (strict upper triangular part of A)*x  */
1066       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1067       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1068       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1069       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1070       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1071       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1072       v += 36;
1073     }
1074     xb += 6;
1075     ai++;
1076   }
1077 
1078   PetscCall(VecRestoreArrayRead(xx, &x));
1079   PetscCall(VecRestoreArray(zz, &z));
1080 
1081   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
1086   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1087   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1088   const PetscScalar *x, *xb;
1089   const MatScalar   *v;
1090   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1091   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1092   PetscInt           nonzerorow = 0;
1093 
1094   PetscFunctionBegin;
1095   PetscCall(VecCopy(yy, zz));
1096   if (!a->nz) PetscFunctionReturn(0);
1097   PetscCall(VecGetArrayRead(xx, &x));
1098   PetscCall(VecGetArray(zz, &z));
1099 
1100   v  = a->a;
1101   xb = x;
1102 
1103   for (i = 0; i < mbs; i++) {
1104     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1105     x1   = xb[0];
1106     x2   = xb[1];
1107     x3   = xb[2];
1108     x4   = xb[3];
1109     x5   = xb[4];
1110     x6   = xb[5];
1111     x7   = xb[6];
1112     ib   = aj + *ai;
1113     jmin = 0;
1114     nonzerorow += (n > 0);
1115     if (n && *ib == i) { /* (diag of A)*x */
1116       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1117       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1118       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1119       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1120       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1121       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1122       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1123       v += 49;
1124       jmin++;
1125     }
1126     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1127     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1128     for (j = jmin; j < n; j++) {
1129       /* (strict lower triangular part of A)*x  */
1130       cval = ib[j] * 7;
1131       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1132       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1133       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1134       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1135       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1136       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1137       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1138       /* (strict upper triangular part of A)*x  */
1139       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1140       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1141       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1142       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1143       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1144       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1145       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1146       v += 49;
1147     }
1148     xb += 7;
1149     ai++;
1150   }
1151 
1152   PetscCall(VecRestoreArrayRead(xx, &x));
1153   PetscCall(VecRestoreArray(zz, &z));
1154 
1155   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1156   PetscFunctionReturn(0);
1157 }
1158 
1159 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
1160   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1161   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1162   const PetscScalar *x, *x_ptr, *xb;
1163   const MatScalar   *v;
1164   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1165   const PetscInt    *idx, *aj, *ii;
1166   PetscInt           nonzerorow = 0;
1167 
1168   PetscFunctionBegin;
1169   PetscCall(VecCopy(yy, zz));
1170   if (!a->nz) PetscFunctionReturn(0);
1171   PetscCall(VecGetArrayRead(xx, &x));
1172   x_ptr = x;
1173   PetscCall(VecGetArray(zz, &z));
1174   z_ptr = z;
1175 
1176   aj = a->j;
1177   v  = a->a;
1178   ii = a->i;
1179 
1180   if (!a->mult_work) { PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); }
1181   work = a->mult_work;
1182 
1183   for (i = 0; i < mbs; i++) {
1184     n     = ii[1] - ii[0];
1185     ncols = n * bs;
1186     workt = work;
1187     idx   = aj + ii[0];
1188     nonzerorow += (n > 0);
1189 
1190     /* upper triangular part */
1191     for (j = 0; j < n; j++) {
1192       xb = x_ptr + bs * (*idx++);
1193       for (k = 0; k < bs; k++) workt[k] = xb[k];
1194       workt += bs;
1195     }
1196     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1197     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1198 
1199     /* strict lower triangular part */
1200     idx = aj + ii[0];
1201     if (n && *idx == i) {
1202       ncols -= bs;
1203       v += bs2;
1204       idx++;
1205       n--;
1206     }
1207     if (ncols > 0) {
1208       workt = work;
1209       PetscCall(PetscArrayzero(workt, ncols));
1210       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1211       for (j = 0; j < n; j++) {
1212         zb = z_ptr + bs * (*idx++);
1213         for (k = 0; k < bs; k++) zb[k] += workt[k];
1214         workt += bs;
1215       }
1216     }
1217 
1218     x += bs;
1219     v += n * bs2;
1220     z += bs;
1221     ii++;
1222   }
1223 
1224   PetscCall(VecRestoreArrayRead(xx, &x));
1225   PetscCall(VecRestoreArray(zz, &z));
1226 
1227   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1228   PetscFunctionReturn(0);
1229 }
1230 
1231 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) {
1232   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1233   PetscScalar   oalpha = alpha;
1234   PetscBLASInt  one    = 1, totalnz;
1235 
1236   PetscFunctionBegin;
1237   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1238   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1239   PetscCall(PetscLogFlops(totalnz));
1240   PetscFunctionReturn(0);
1241 }
1242 
1243 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) {
1244   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1245   const MatScalar *v        = a->a;
1246   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1247   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1248   const PetscInt  *aj = a->j, *col;
1249 
1250   PetscFunctionBegin;
1251   if (!a->nz) {
1252     *norm = 0.0;
1253     PetscFunctionReturn(0);
1254   }
1255   if (type == NORM_FROBENIUS) {
1256     for (k = 0; k < mbs; k++) {
1257       jmin = a->i[k];
1258       jmax = a->i[k + 1];
1259       col  = aj + jmin;
1260       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1261         for (i = 0; i < bs2; i++) {
1262           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1263           v++;
1264         }
1265         jmin++;
1266       }
1267       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1268         for (i = 0; i < bs2; i++) {
1269           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1270           v++;
1271         }
1272       }
1273     }
1274     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1275     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1276   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1277     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1278     for (i = 0; i < mbs; i++) jl[i] = mbs;
1279     il[0] = 0;
1280 
1281     *norm = 0.0;
1282     for (k = 0; k < mbs; k++) { /* k_th block row */
1283       for (j = 0; j < bs; j++) sum[j] = 0.0;
1284       /*-- col sum --*/
1285       i = jl[k]; /* first |A(i,k)| to be added */
1286       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1287                   at step k */
1288       while (i < mbs) {
1289         nexti = jl[i]; /* next block row to be added */
1290         ik    = il[i]; /* block index of A(i,k) in the array a */
1291         for (j = 0; j < bs; j++) {
1292           v = a->a + ik * bs2 + j * bs;
1293           for (k1 = 0; k1 < bs; k1++) {
1294             sum[j] += PetscAbsScalar(*v);
1295             v++;
1296           }
1297         }
1298         /* update il, jl */
1299         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1300         jmax = a->i[i + 1];
1301         if (jmin < jmax) {
1302           il[i] = jmin;
1303           j     = a->j[jmin];
1304           jl[i] = jl[j];
1305           jl[j] = i;
1306         }
1307         i = nexti;
1308       }
1309       /*-- row sum --*/
1310       jmin = a->i[k];
1311       jmax = a->i[k + 1];
1312       for (i = jmin; i < jmax; i++) {
1313         for (j = 0; j < bs; j++) {
1314           v = a->a + i * bs2 + j;
1315           for (k1 = 0; k1 < bs; k1++) {
1316             sum[j] += PetscAbsScalar(*v);
1317             v += bs;
1318           }
1319         }
1320       }
1321       /* add k_th block row to il, jl */
1322       col = aj + jmin;
1323       if (jmax - jmin > 0 && *col == k) jmin++;
1324       if (jmin < jmax) {
1325         il[k] = jmin;
1326         j     = a->j[jmin];
1327         jl[k] = jl[j];
1328         jl[j] = k;
1329       }
1330       for (j = 0; j < bs; j++) {
1331         if (sum[j] > *norm) *norm = sum[j];
1332       }
1333     }
1334     PetscCall(PetscFree3(sum, il, jl));
1335     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1336   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) {
1341   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1342 
1343   PetscFunctionBegin;
1344   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1345   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1346     *flg = PETSC_FALSE;
1347     PetscFunctionReturn(0);
1348   }
1349 
1350   /* if the a->i are the same */
1351   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1352   if (!*flg) PetscFunctionReturn(0);
1353 
1354   /* if a->j are the same */
1355   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1356   if (!*flg) PetscFunctionReturn(0);
1357 
1358   /* if a->a are the same */
1359   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) {
1364   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1365   PetscInt         i, j, k, row, bs, ambs, bs2;
1366   const PetscInt  *ai, *aj;
1367   PetscScalar     *x, zero = 0.0;
1368   const MatScalar *aa, *aa_j;
1369 
1370   PetscFunctionBegin;
1371   bs = A->rmap->bs;
1372   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1373 
1374   aa   = a->a;
1375   ambs = a->mbs;
1376 
1377   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1378     PetscInt *diag = a->diag;
1379     aa             = a->a;
1380     ambs           = a->mbs;
1381     PetscCall(VecGetArray(v, &x));
1382     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1383     PetscCall(VecRestoreArray(v, &x));
1384     PetscFunctionReturn(0);
1385   }
1386 
1387   ai  = a->i;
1388   aj  = a->j;
1389   bs2 = a->bs2;
1390   PetscCall(VecSet(v, zero));
1391   if (!a->nz) PetscFunctionReturn(0);
1392   PetscCall(VecGetArray(v, &x));
1393   for (i = 0; i < ambs; i++) {
1394     j = ai[i];
1395     if (aj[j] == i) { /* if this is a diagonal element */
1396       row  = i * bs;
1397       aa_j = aa + j * bs2;
1398       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1399     }
1400   }
1401   PetscCall(VecRestoreArray(v, &x));
1402   PetscFunctionReturn(0);
1403 }
1404 
1405 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) {
1406   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1407   PetscScalar        x;
1408   const PetscScalar *l, *li, *ri;
1409   MatScalar         *aa, *v;
1410   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1411   const PetscInt    *ai, *aj;
1412   PetscBool          flg;
1413 
1414   PetscFunctionBegin;
1415   if (ll != rr) {
1416     PetscCall(VecEqual(ll, rr, &flg));
1417     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1418   }
1419   if (!ll) PetscFunctionReturn(0);
1420   ai  = a->i;
1421   aj  = a->j;
1422   aa  = a->a;
1423   m   = A->rmap->N;
1424   bs  = A->rmap->bs;
1425   mbs = a->mbs;
1426   bs2 = a->bs2;
1427 
1428   PetscCall(VecGetArrayRead(ll, &l));
1429   PetscCall(VecGetLocalSize(ll, &lm));
1430   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1431   for (i = 0; i < mbs; i++) { /* for each block row */
1432     M  = ai[i + 1] - ai[i];
1433     li = l + i * bs;
1434     v  = aa + bs2 * ai[i];
1435     for (j = 0; j < M; j++) { /* for each block */
1436       ri = l + bs * aj[ai[i] + j];
1437       for (k = 0; k < bs; k++) {
1438         x = ri[k];
1439         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1440       }
1441     }
1442   }
1443   PetscCall(VecRestoreArrayRead(ll, &l));
1444   PetscCall(PetscLogFlops(2.0 * a->nz));
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
1449   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1450 
1451   PetscFunctionBegin;
1452   info->block_size   = a->bs2;
1453   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1454   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1455   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1456   info->assemblies   = A->num_ass;
1457   info->mallocs      = A->info.mallocs;
1458   info->memory       = ((PetscObject)A)->mem;
1459   if (A->factortype) {
1460     info->fill_ratio_given  = A->info.fill_ratio_given;
1461     info->fill_ratio_needed = A->info.fill_ratio_needed;
1462     info->factor_mallocs    = A->info.factor_mallocs;
1463   } else {
1464     info->fill_ratio_given  = 0;
1465     info->fill_ratio_needed = 0;
1466     info->factor_mallocs    = 0;
1467   }
1468   PetscFunctionReturn(0);
1469 }
1470 
1471 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) {
1472   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1473 
1474   PetscFunctionBegin;
1475   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) {
1480   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1481   PetscInt         i, j, n, row, col, bs, mbs;
1482   const PetscInt  *ai, *aj;
1483   PetscReal        atmp;
1484   const MatScalar *aa;
1485   PetscScalar     *x;
1486   PetscInt         ncols, brow, bcol, krow, kcol;
1487 
1488   PetscFunctionBegin;
1489   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1490   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1491   bs  = A->rmap->bs;
1492   aa  = a->a;
1493   ai  = a->i;
1494   aj  = a->j;
1495   mbs = a->mbs;
1496 
1497   PetscCall(VecSet(v, 0.0));
1498   PetscCall(VecGetArray(v, &x));
1499   PetscCall(VecGetLocalSize(v, &n));
1500   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1501   for (i = 0; i < mbs; i++) {
1502     ncols = ai[1] - ai[0];
1503     ai++;
1504     brow = bs * i;
1505     for (j = 0; j < ncols; j++) {
1506       bcol = bs * (*aj);
1507       for (kcol = 0; kcol < bs; kcol++) {
1508         col = bcol + kcol; /* col index */
1509         for (krow = 0; krow < bs; krow++) {
1510           atmp = PetscAbsScalar(*aa);
1511           aa++;
1512           row = brow + krow; /* row index */
1513           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1514           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1515         }
1516       }
1517       aj++;
1518     }
1519   }
1520   PetscCall(VecRestoreArray(v, &x));
1521   PetscFunctionReturn(0);
1522 }
1523 
1524 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
1525   PetscFunctionBegin;
1526   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1527   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
1532   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1533   PetscScalar       *z = c;
1534   const PetscScalar *xb;
1535   PetscScalar        x1;
1536   const MatScalar   *v   = a->a, *vv;
1537   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1538 #if defined(PETSC_USE_COMPLEX)
1539   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1540 #else
1541   const int aconj = 0;
1542 #endif
1543 
1544   PetscFunctionBegin;
1545   for (i = 0; i < mbs; i++) {
1546     n = ii[1] - ii[0];
1547     ii++;
1548     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1549     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1550     jj = idx;
1551     vv = v;
1552     for (k = 0; k < cn; k++) {
1553       idx = jj;
1554       v   = vv;
1555       for (j = 0; j < n; j++) {
1556         xb = b + (*idx);
1557         x1 = xb[0 + k * bm];
1558         z[0 + k * cm] += v[0] * x1;
1559         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1560         v += 1;
1561         ++idx;
1562       }
1563     }
1564     z += 1;
1565   }
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
1570   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1571   PetscScalar       *z = c;
1572   const PetscScalar *xb;
1573   PetscScalar        x1, x2;
1574   const MatScalar   *v   = a->a, *vv;
1575   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1576 
1577   PetscFunctionBegin;
1578   for (i = 0; i < mbs; i++) {
1579     n = ii[1] - ii[0];
1580     ii++;
1581     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1582     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583     jj = idx;
1584     vv = v;
1585     for (k = 0; k < cn; k++) {
1586       idx = jj;
1587       v   = vv;
1588       for (j = 0; j < n; j++) {
1589         xb = b + 2 * (*idx);
1590         x1 = xb[0 + k * bm];
1591         x2 = xb[1 + k * bm];
1592         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1593         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1594         if (*idx != i) {
1595           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1596           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1597         }
1598         v += 4;
1599         ++idx;
1600       }
1601     }
1602     z += 2;
1603   }
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
1608   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1609   PetscScalar       *z = c;
1610   const PetscScalar *xb;
1611   PetscScalar        x1, x2, x3;
1612   const MatScalar   *v   = a->a, *vv;
1613   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1614 
1615   PetscFunctionBegin;
1616   for (i = 0; i < mbs; i++) {
1617     n = ii[1] - ii[0];
1618     ii++;
1619     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1620     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1621     jj = idx;
1622     vv = v;
1623     for (k = 0; k < cn; k++) {
1624       idx = jj;
1625       v   = vv;
1626       for (j = 0; j < n; j++) {
1627         xb = b + 3 * (*idx);
1628         x1 = xb[0 + k * bm];
1629         x2 = xb[1 + k * bm];
1630         x3 = xb[2 + k * bm];
1631         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1632         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1633         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1634         if (*idx != i) {
1635           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1636           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1637           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1638         }
1639         v += 9;
1640         ++idx;
1641       }
1642     }
1643     z += 3;
1644   }
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
1649   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1650   PetscScalar       *z = c;
1651   const PetscScalar *xb;
1652   PetscScalar        x1, x2, x3, x4;
1653   const MatScalar   *v   = a->a, *vv;
1654   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1655 
1656   PetscFunctionBegin;
1657   for (i = 0; i < mbs; i++) {
1658     n = ii[1] - ii[0];
1659     ii++;
1660     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1661     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1662     jj = idx;
1663     vv = v;
1664     for (k = 0; k < cn; k++) {
1665       idx = jj;
1666       v   = vv;
1667       for (j = 0; j < n; j++) {
1668         xb = b + 4 * (*idx);
1669         x1 = xb[0 + k * bm];
1670         x2 = xb[1 + k * bm];
1671         x3 = xb[2 + k * bm];
1672         x4 = xb[3 + k * bm];
1673         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1674         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1675         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1676         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1677         if (*idx != i) {
1678           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1679           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1680           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1681           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1682         }
1683         v += 16;
1684         ++idx;
1685       }
1686     }
1687     z += 4;
1688   }
1689   PetscFunctionReturn(0);
1690 }
1691 
1692 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
1693   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1694   PetscScalar       *z = c;
1695   const PetscScalar *xb;
1696   PetscScalar        x1, x2, x3, x4, x5;
1697   const MatScalar   *v   = a->a, *vv;
1698   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1699 
1700   PetscFunctionBegin;
1701   for (i = 0; i < mbs; i++) {
1702     n = ii[1] - ii[0];
1703     ii++;
1704     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1705     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1706     jj = idx;
1707     vv = v;
1708     for (k = 0; k < cn; k++) {
1709       idx = jj;
1710       v   = vv;
1711       for (j = 0; j < n; j++) {
1712         xb = b + 5 * (*idx);
1713         x1 = xb[0 + k * bm];
1714         x2 = xb[1 + k * bm];
1715         x3 = xb[2 + k * bm];
1716         x4 = xb[3 + k * bm];
1717         x5 = xb[4 + k * cm];
1718         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1719         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1720         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1721         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1722         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1723         if (*idx != i) {
1724           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1725           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1726           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1727           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1728           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1729         }
1730         v += 25;
1731         ++idx;
1732       }
1733     }
1734     z += 5;
1735   }
1736   PetscFunctionReturn(0);
1737 }
1738 
1739 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) {
1740   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1741   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1742   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1743   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1744   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1745   PetscBLASInt     bbs, bcn, bbm, bcm;
1746   PetscScalar     *z = NULL;
1747   PetscScalar     *c, *b;
1748   const MatScalar *v;
1749   const PetscInt  *idx, *ii;
1750   PetscScalar      _DOne = 1.0;
1751 
1752   PetscFunctionBegin;
1753   if (!cm || !cn) PetscFunctionReturn(0);
1754   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1755   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1756   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1757   b = bd->v;
1758   PetscCall(MatZeroEntries(C));
1759   PetscCall(MatDenseGetArray(C, &c));
1760   switch (bs) {
1761   case 1: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
1762   case 2: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
1763   case 3: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
1764   case 4: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
1765   case 5: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
1766   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1767     PetscCall(PetscBLASIntCast(bs, &bbs));
1768     PetscCall(PetscBLASIntCast(cn, &bcn));
1769     PetscCall(PetscBLASIntCast(bm, &bbm));
1770     PetscCall(PetscBLASIntCast(cm, &bcm));
1771     idx = a->j;
1772     v   = a->a;
1773     mbs = a->mbs;
1774     ii  = a->i;
1775     z   = c;
1776     for (i = 0; i < mbs; i++) {
1777       n = ii[1] - ii[0];
1778       ii++;
1779       for (j = 0; j < n; j++) {
1780         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1781         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1782         v += bs2;
1783       }
1784       z += bs;
1785     }
1786   }
1787   PetscCall(MatDenseRestoreArray(C, &c));
1788   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1789   PetscFunctionReturn(0);
1790 }
1791