xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision f40ac2af7ea630559cb57a5f8cf29bc3ca5e6983)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
12   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13   const PetscInt *idx;
14   PetscBT         table_out, table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs, &table_out));
23   PetscCall(PetscMalloc1(mbs + 1, &nidx));
24   PetscCall(PetscBTCreate(mbs, &table_in));
25 
26   for (i = 0; i < is_max; i++) { /* for each is */
27     isz = 0;
28     PetscCall(PetscBTMemzero(mbs, table_out));
29 
30     /* Extract the indices, assume there can be duplicate entries */
31     PetscCall(ISGetIndices(is[i], &idx));
32     PetscCall(ISGetLocalSize(is[i], &n));
33 
34     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35     bcol_max = 0;
36     for (j = 0; j < n; ++j) {
37       brow = idx[j] / bs; /* convert the indices into block indices */
38       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39       if (!PetscBTLookupSet(table_out, brow)) {
40         nidx[isz++] = brow;
41         if (bcol_max < brow) bcol_max = brow;
42       }
43     }
44     PetscCall(ISRestoreIndices(is[i], &idx));
45     PetscCall(ISDestroy(&is[i]));
46 
47     k = 0;
48     for (j = 0; j < ov; j++) { /* for each overlap */
49       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50       PetscCall(PetscBTMemzero(mbs, table_in));
51       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52 
53       n = isz; /* length of the updated is[i] */
54       for (brow = 0; brow < mbs; brow++) {
55         start = ai[brow];
56         end   = ai[brow + 1];
57         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l < end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out, bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l < end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in, bcol)) {
72               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80   } /* for each is */
81   PetscCall(PetscBTDestroy(&table_out));
82   PetscCall(PetscFree(nidx));
83   PetscCall(PetscBTDestroy(&table_in));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88         Zero some ops' to avoid invalid usse */
89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90 {
91   PetscFunctionBegin;
92   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93   Bseq->ops->mult                   = NULL;
94   Bseq->ops->multadd                = NULL;
95   Bseq->ops->multtranspose          = NULL;
96   Bseq->ops->multtransposeadd       = NULL;
97   Bseq->ops->lufactor               = NULL;
98   Bseq->ops->choleskyfactor         = NULL;
99   Bseq->ops->lufactorsymbolic       = NULL;
100   Bseq->ops->choleskyfactorsymbolic = NULL;
101   Bseq->ops->getinertia             = NULL;
102   PetscFunctionReturn(PETSC_SUCCESS);
103 }
104 
105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107 {
108   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111   const PetscInt *irow, *icol;
112   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113   PetscInt       *aj = a->j, *ai = a->i;
114   MatScalar      *mat_a;
115   Mat             C;
116   PetscBool       flag;
117 
118   PetscFunctionBegin;
119 
120   PetscCall(ISGetIndices(isrow, &irow));
121   PetscCall(ISGetIndices(iscol, &icol));
122   PetscCall(ISGetLocalSize(isrow, &nrows));
123   PetscCall(ISGetLocalSize(iscol, &ncols));
124 
125   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126   ssmap = smap;
127   PetscCall(PetscMalloc1(1 + nrows, &lens));
128   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129   /* determine lens of each row */
130   for (i = 0; i < nrows; i++) {
131     kstart  = ai[irow[i]];
132     kend    = kstart + a->ilen[irow[i]];
133     lens[i] = 0;
134     for (k = kstart; k < kend; k++) {
135       if (ssmap[aj[k]]) lens[i]++;
136     }
137   }
138   /* Create and fill new matrix */
139   if (scall == MAT_REUSE_MATRIX) {
140     c = (Mat_SeqSBAIJ *)((*B)->data);
141 
142     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146     C = *B;
147   } else {
148     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152   }
153   c = (Mat_SeqSBAIJ *)(C->data);
154   for (i = 0; i < nrows; i++) {
155     row      = irow[i];
156     kstart   = ai[row];
157     kend     = kstart + a->ilen[row];
158     mat_i    = c->i[i];
159     mat_j    = c->j + mat_i;
160     mat_a    = c->a + mat_i * bs2;
161     mat_ilen = c->ilen + i;
162     for (k = kstart; k < kend; k++) {
163       if ((tcol = ssmap[a->j[k]])) {
164         *mat_j++ = tcol - 1;
165         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166         mat_a += bs2;
167         (*mat_ilen)++;
168       }
169     }
170   }
171   /* sort */
172   {
173     MatScalar *work;
174 
175     PetscCall(PetscMalloc1(bs2, &work));
176     for (i = 0; i < nrows; i++) {
177       PetscInt ilen;
178       mat_i = c->i[i];
179       mat_j = c->j + mat_i;
180       mat_a = c->a + mat_i * bs2;
181       ilen  = c->ilen[i];
182       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183     }
184     PetscCall(PetscFree(work));
185   }
186 
187   /* Free work space */
188   PetscCall(ISRestoreIndices(iscol, &icol));
189   PetscCall(PetscFree(smap));
190   PetscCall(PetscFree(lens));
191   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
193 
194   PetscCall(ISRestoreIndices(isrow, &irow));
195   *B = C;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200 {
201   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
202   IS              is1, is2;
203   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
204   const PetscInt *irow, *icol;
205 
206   PetscFunctionBegin;
207   PetscCall(ISGetIndices(isrow, &irow));
208   PetscCall(ISGetIndices(iscol, &icol));
209   PetscCall(ISGetLocalSize(isrow, &nrows));
210   PetscCall(ISGetLocalSize(iscol, &ncols));
211 
212   /* Verify if the indices correspond to each element in a block
213    and form the IS with compressed IS */
214   maxmnbs = PetscMax(a->mbs, a->nbs);
215   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
216   PetscCall(PetscArrayzero(vary, a->mbs));
217   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
218   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
219   count = 0;
220   for (i = 0; i < nrows; i++) {
221     PetscInt j = irow[i] / bs;
222     if ((vary[j]--) == bs) iary[count++] = j;
223   }
224   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
225 
226   PetscCall(PetscArrayzero(vary, a->nbs));
227   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
228   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
229   count = 0;
230   for (i = 0; i < ncols; i++) {
231     PetscInt j = icol[i] / bs;
232     if ((vary[j]--) == bs) iary[count++] = j;
233   }
234   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
235   PetscCall(ISRestoreIndices(isrow, &irow));
236   PetscCall(ISRestoreIndices(iscol, &icol));
237   PetscCall(PetscFree2(vary, iary));
238 
239   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
240   PetscCall(ISDestroy(&is1));
241   PetscCall(ISDestroy(&is2));
242 
243   if (isrow != iscol) {
244     PetscBool isequal;
245     PetscCall(ISEqual(isrow, iscol, &isequal));
246     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
247   }
248   PetscFunctionReturn(PETSC_SUCCESS);
249 }
250 
251 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
252 {
253   PetscInt i;
254 
255   PetscFunctionBegin;
256   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
257 
258   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
259   PetscFunctionReturn(PETSC_SUCCESS);
260 }
261 
262 /* Should check that shapes of vectors and matrices match */
263 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
264 {
265   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
266   PetscScalar       *z, x1, x2, zero = 0.0;
267   const PetscScalar *x, *xb;
268   const MatScalar   *v;
269   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
270   const PetscInt    *aj = a->j, *ai = a->i, *ib;
271   PetscInt           nonzerorow = 0;
272 
273   PetscFunctionBegin;
274   PetscCall(VecSet(zz, zero));
275   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
276   PetscCall(VecGetArrayRead(xx, &x));
277   PetscCall(VecGetArray(zz, &z));
278 
279   v  = a->a;
280   xb = x;
281 
282   for (i = 0; i < mbs; i++) {
283     n    = ai[1] - ai[0]; /* length of i_th block row of A */
284     x1   = xb[0];
285     x2   = xb[1];
286     ib   = aj + *ai;
287     jmin = 0;
288     nonzerorow += (n > 0);
289     if (*ib == i) { /* (diag of A)*x */
290       z[2 * i] += v[0] * x1 + v[2] * x2;
291       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
292       v += 4;
293       jmin++;
294     }
295     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
296     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297     for (j = jmin; j < n; j++) {
298       /* (strict lower triangular part of A)*x  */
299       cval = ib[j] * 2;
300       z[cval] += v[0] * x1 + v[1] * x2;
301       z[cval + 1] += v[2] * x1 + v[3] * x2;
302       /* (strict upper triangular part of A)*x  */
303       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
304       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
305       v += 4;
306     }
307     xb += 2;
308     ai++;
309   }
310 
311   PetscCall(VecRestoreArrayRead(xx, &x));
312   PetscCall(VecRestoreArray(zz, &z));
313   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
314   PetscFunctionReturn(PETSC_SUCCESS);
315 }
316 
317 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
318 {
319   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
320   PetscScalar       *z, x1, x2, x3, zero = 0.0;
321   const PetscScalar *x, *xb;
322   const MatScalar   *v;
323   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
324   const PetscInt    *aj = a->j, *ai = a->i, *ib;
325   PetscInt           nonzerorow = 0;
326 
327   PetscFunctionBegin;
328   PetscCall(VecSet(zz, zero));
329   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
330   PetscCall(VecGetArrayRead(xx, &x));
331   PetscCall(VecGetArray(zz, &z));
332 
333   v  = a->a;
334   xb = x;
335 
336   for (i = 0; i < mbs; i++) {
337     n    = ai[1] - ai[0]; /* length of i_th block row of A */
338     x1   = xb[0];
339     x2   = xb[1];
340     x3   = xb[2];
341     ib   = aj + *ai;
342     jmin = 0;
343     nonzerorow += (n > 0);
344     if (*ib == i) { /* (diag of A)*x */
345       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
346       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
347       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
348       v += 9;
349       jmin++;
350     }
351     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
352     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353     for (j = jmin; j < n; j++) {
354       /* (strict lower triangular part of A)*x  */
355       cval = ib[j] * 3;
356       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
357       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
358       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
359       /* (strict upper triangular part of A)*x  */
360       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
361       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
362       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
363       v += 9;
364     }
365     xb += 3;
366     ai++;
367   }
368 
369   PetscCall(VecRestoreArrayRead(xx, &x));
370   PetscCall(VecRestoreArray(zz, &z));
371   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
372   PetscFunctionReturn(PETSC_SUCCESS);
373 }
374 
375 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
376 {
377   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
378   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
379   const PetscScalar *x, *xb;
380   const MatScalar   *v;
381   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
382   const PetscInt    *aj = a->j, *ai = a->i, *ib;
383   PetscInt           nonzerorow = 0;
384 
385   PetscFunctionBegin;
386   PetscCall(VecSet(zz, zero));
387   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
388   PetscCall(VecGetArrayRead(xx, &x));
389   PetscCall(VecGetArray(zz, &z));
390 
391   v  = a->a;
392   xb = x;
393 
394   for (i = 0; i < mbs; i++) {
395     n    = ai[1] - ai[0]; /* length of i_th block row of A */
396     x1   = xb[0];
397     x2   = xb[1];
398     x3   = xb[2];
399     x4   = xb[3];
400     ib   = aj + *ai;
401     jmin = 0;
402     nonzerorow += (n > 0);
403     if (*ib == i) { /* (diag of A)*x */
404       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
405       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
406       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
407       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
408       v += 16;
409       jmin++;
410     }
411     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
412     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
413     for (j = jmin; j < n; j++) {
414       /* (strict lower triangular part of A)*x  */
415       cval = ib[j] * 4;
416       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
417       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
418       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
419       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
420       /* (strict upper triangular part of A)*x  */
421       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
422       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
423       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
424       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
425       v += 16;
426     }
427     xb += 4;
428     ai++;
429   }
430 
431   PetscCall(VecRestoreArrayRead(xx, &x));
432   PetscCall(VecRestoreArray(zz, &z));
433   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
438 {
439   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
440   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
441   const PetscScalar *x, *xb;
442   const MatScalar   *v;
443   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
444   const PetscInt    *aj = a->j, *ai = a->i, *ib;
445   PetscInt           nonzerorow = 0;
446 
447   PetscFunctionBegin;
448   PetscCall(VecSet(zz, zero));
449   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
450   PetscCall(VecGetArrayRead(xx, &x));
451   PetscCall(VecGetArray(zz, &z));
452 
453   v  = a->a;
454   xb = x;
455 
456   for (i = 0; i < mbs; i++) {
457     n    = ai[1] - ai[0]; /* length of i_th block row of A */
458     x1   = xb[0];
459     x2   = xb[1];
460     x3   = xb[2];
461     x4   = xb[3];
462     x5   = xb[4];
463     ib   = aj + *ai;
464     jmin = 0;
465     nonzerorow += (n > 0);
466     if (*ib == i) { /* (diag of A)*x */
467       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
468       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
469       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
470       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
471       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
472       v += 25;
473       jmin++;
474     }
475     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
476     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
477     for (j = jmin; j < n; j++) {
478       /* (strict lower triangular part of A)*x  */
479       cval = ib[j] * 5;
480       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
481       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
482       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
483       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
484       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
485       /* (strict upper triangular part of A)*x  */
486       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
487       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
488       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
489       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
490       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
491       v += 25;
492     }
493     xb += 5;
494     ai++;
495   }
496 
497   PetscCall(VecRestoreArrayRead(xx, &x));
498   PetscCall(VecRestoreArray(zz, &z));
499   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
500   PetscFunctionReturn(PETSC_SUCCESS);
501 }
502 
503 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
504 {
505   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
506   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
507   const PetscScalar *x, *xb;
508   const MatScalar   *v;
509   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
510   const PetscInt    *aj = a->j, *ai = a->i, *ib;
511   PetscInt           nonzerorow = 0;
512 
513   PetscFunctionBegin;
514   PetscCall(VecSet(zz, zero));
515   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
516   PetscCall(VecGetArrayRead(xx, &x));
517   PetscCall(VecGetArray(zz, &z));
518 
519   v  = a->a;
520   xb = x;
521 
522   for (i = 0; i < mbs; i++) {
523     n    = ai[1] - ai[0]; /* length of i_th block row of A */
524     x1   = xb[0];
525     x2   = xb[1];
526     x3   = xb[2];
527     x4   = xb[3];
528     x5   = xb[4];
529     x6   = xb[5];
530     ib   = aj + *ai;
531     jmin = 0;
532     nonzerorow += (n > 0);
533     if (*ib == i) { /* (diag of A)*x */
534       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540       v += 36;
541       jmin++;
542     }
543     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545     for (j = jmin; j < n; j++) {
546       /* (strict lower triangular part of A)*x  */
547       cval = ib[j] * 6;
548       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554       /* (strict upper triangular part of A)*x  */
555       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561       v += 36;
562     }
563     xb += 6;
564     ai++;
565   }
566 
567   PetscCall(VecRestoreArrayRead(xx, &x));
568   PetscCall(VecRestoreArray(zz, &z));
569   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
570   PetscFunctionReturn(PETSC_SUCCESS);
571 }
572 
573 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
574 {
575   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
576   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
577   const PetscScalar *x, *xb;
578   const MatScalar   *v;
579   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
580   const PetscInt    *aj = a->j, *ai = a->i, *ib;
581   PetscInt           nonzerorow = 0;
582 
583   PetscFunctionBegin;
584   PetscCall(VecSet(zz, zero));
585   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
586   PetscCall(VecGetArrayRead(xx, &x));
587   PetscCall(VecGetArray(zz, &z));
588 
589   v  = a->a;
590   xb = x;
591 
592   for (i = 0; i < mbs; i++) {
593     n    = ai[1] - ai[0]; /* length of i_th block row of A */
594     x1   = xb[0];
595     x2   = xb[1];
596     x3   = xb[2];
597     x4   = xb[3];
598     x5   = xb[4];
599     x6   = xb[5];
600     x7   = xb[6];
601     ib   = aj + *ai;
602     jmin = 0;
603     nonzerorow += (n > 0);
604     if (*ib == i) { /* (diag of A)*x */
605       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
606       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
607       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
608       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
609       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
610       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
611       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
612       v += 49;
613       jmin++;
614     }
615     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
616     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
617     for (j = jmin; j < n; j++) {
618       /* (strict lower triangular part of A)*x  */
619       cval = ib[j] * 7;
620       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
621       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
622       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
623       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
624       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
625       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
626       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
627       /* (strict upper triangular part of A)*x  */
628       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
629       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
630       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
631       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
632       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
633       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
634       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
635       v += 49;
636     }
637     xb += 7;
638     ai++;
639   }
640   PetscCall(VecRestoreArrayRead(xx, &x));
641   PetscCall(VecRestoreArray(zz, &z));
642   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
643   PetscFunctionReturn(PETSC_SUCCESS);
644 }
645 
646 /*
647     This will not work with MatScalar == float because it calls the BLAS
648 */
649 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
650 {
651   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
652   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
653   const PetscScalar *x, *x_ptr, *xb;
654   const MatScalar   *v;
655   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
656   const PetscInt    *idx, *aj, *ii;
657   PetscInt           nonzerorow = 0;
658 
659   PetscFunctionBegin;
660   PetscCall(VecSet(zz, zero));
661   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
662   PetscCall(VecGetArrayRead(xx, &x));
663   PetscCall(VecGetArray(zz, &z));
664 
665   x_ptr = x;
666   z_ptr = z;
667 
668   aj = a->j;
669   v  = a->a;
670   ii = a->i;
671 
672   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
673   work = a->mult_work;
674 
675   for (i = 0; i < mbs; i++) {
676     n     = ii[1] - ii[0];
677     ncols = n * bs;
678     workt = work;
679     idx   = aj + ii[0];
680     nonzerorow += (n > 0);
681 
682     /* upper triangular part */
683     for (j = 0; j < n; j++) {
684       xb = x_ptr + bs * (*idx++);
685       for (k = 0; k < bs; k++) workt[k] = xb[k];
686       workt += bs;
687     }
688     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
689     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
690 
691     /* strict lower triangular part */
692     idx = aj + ii[0];
693     if (n && *idx == i) {
694       ncols -= bs;
695       v += bs2;
696       idx++;
697       n--;
698     }
699 
700     if (ncols > 0) {
701       workt = work;
702       PetscCall(PetscArrayzero(workt, ncols));
703       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
704       for (j = 0; j < n; j++) {
705         zb = z_ptr + bs * (*idx++);
706         for (k = 0; k < bs; k++) zb[k] += workt[k];
707         workt += bs;
708       }
709     }
710     x += bs;
711     v += n * bs2;
712     z += bs;
713     ii++;
714   }
715 
716   PetscCall(VecRestoreArrayRead(xx, &x));
717   PetscCall(VecRestoreArray(zz, &z));
718   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
722 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
723 {
724   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
725   PetscScalar       *z, x1;
726   const PetscScalar *x, *xb;
727   const MatScalar   *v;
728   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
729   const PetscInt    *aj = a->j, *ai = a->i, *ib;
730   PetscInt           nonzerorow = 0;
731 #if defined(PETSC_USE_COMPLEX)
732   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
733 #else
734   const int aconj = 0;
735 #endif
736 
737   PetscFunctionBegin;
738   PetscCall(VecCopy(yy, zz));
739   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
740   PetscCall(VecGetArrayRead(xx, &x));
741   PetscCall(VecGetArray(zz, &z));
742   v  = a->a;
743   xb = x;
744 
745   for (i = 0; i < mbs; i++) {
746     n    = ai[1] - ai[0]; /* length of i_th row of A */
747     x1   = xb[0];
748     ib   = aj + *ai;
749     jmin = 0;
750     nonzerorow += (n > 0);
751     if (n && *ib == i) { /* (diag of A)*x */
752       z[i] += *v++ * x[*ib++];
753       jmin++;
754     }
755     if (aconj) {
756       for (j = jmin; j < n; j++) {
757         cval = *ib;
758         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
759         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
760       }
761     } else {
762       for (j = jmin; j < n; j++) {
763         cval = *ib;
764         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
765         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
766       }
767     }
768     xb++;
769     ai++;
770   }
771 
772   PetscCall(VecRestoreArrayRead(xx, &x));
773   PetscCall(VecRestoreArray(zz, &z));
774 
775   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
776   PetscFunctionReturn(PETSC_SUCCESS);
777 }
778 
779 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
780 {
781   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
782   PetscScalar       *z, x1, x2;
783   const PetscScalar *x, *xb;
784   const MatScalar   *v;
785   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
786   const PetscInt    *aj = a->j, *ai = a->i, *ib;
787   PetscInt           nonzerorow = 0;
788 
789   PetscFunctionBegin;
790   PetscCall(VecCopy(yy, zz));
791   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
792   PetscCall(VecGetArrayRead(xx, &x));
793   PetscCall(VecGetArray(zz, &z));
794 
795   v  = a->a;
796   xb = x;
797 
798   for (i = 0; i < mbs; i++) {
799     n    = ai[1] - ai[0]; /* length of i_th block row of A */
800     x1   = xb[0];
801     x2   = xb[1];
802     ib   = aj + *ai;
803     jmin = 0;
804     nonzerorow += (n > 0);
805     if (n && *ib == i) { /* (diag of A)*x */
806       z[2 * i] += v[0] * x1 + v[2] * x2;
807       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
808       v += 4;
809       jmin++;
810     }
811     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
812     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
813     for (j = jmin; j < n; j++) {
814       /* (strict lower triangular part of A)*x  */
815       cval = ib[j] * 2;
816       z[cval] += v[0] * x1 + v[1] * x2;
817       z[cval + 1] += v[2] * x1 + v[3] * x2;
818       /* (strict upper triangular part of A)*x  */
819       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
820       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
821       v += 4;
822     }
823     xb += 2;
824     ai++;
825   }
826   PetscCall(VecRestoreArrayRead(xx, &x));
827   PetscCall(VecRestoreArray(zz, &z));
828 
829   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
830   PetscFunctionReturn(PETSC_SUCCESS);
831 }
832 
833 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
834 {
835   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
836   PetscScalar       *z, x1, x2, x3;
837   const PetscScalar *x, *xb;
838   const MatScalar   *v;
839   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
840   const PetscInt    *aj = a->j, *ai = a->i, *ib;
841   PetscInt           nonzerorow = 0;
842 
843   PetscFunctionBegin;
844   PetscCall(VecCopy(yy, zz));
845   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
846   PetscCall(VecGetArrayRead(xx, &x));
847   PetscCall(VecGetArray(zz, &z));
848 
849   v  = a->a;
850   xb = x;
851 
852   for (i = 0; i < mbs; i++) {
853     n    = ai[1] - ai[0]; /* length of i_th block row of A */
854     x1   = xb[0];
855     x2   = xb[1];
856     x3   = xb[2];
857     ib   = aj + *ai;
858     jmin = 0;
859     nonzerorow += (n > 0);
860     if (n && *ib == i) { /* (diag of A)*x */
861       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
862       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
863       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
864       v += 9;
865       jmin++;
866     }
867     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
868     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
869     for (j = jmin; j < n; j++) {
870       /* (strict lower triangular part of A)*x  */
871       cval = ib[j] * 3;
872       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
873       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
874       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
875       /* (strict upper triangular part of A)*x  */
876       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
877       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
878       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
879       v += 9;
880     }
881     xb += 3;
882     ai++;
883   }
884 
885   PetscCall(VecRestoreArrayRead(xx, &x));
886   PetscCall(VecRestoreArray(zz, &z));
887 
888   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
889   PetscFunctionReturn(PETSC_SUCCESS);
890 }
891 
892 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
893 {
894   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
895   PetscScalar       *z, x1, x2, x3, x4;
896   const PetscScalar *x, *xb;
897   const MatScalar   *v;
898   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
899   const PetscInt    *aj = a->j, *ai = a->i, *ib;
900   PetscInt           nonzerorow = 0;
901 
902   PetscFunctionBegin;
903   PetscCall(VecCopy(yy, zz));
904   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
905   PetscCall(VecGetArrayRead(xx, &x));
906   PetscCall(VecGetArray(zz, &z));
907 
908   v  = a->a;
909   xb = x;
910 
911   for (i = 0; i < mbs; i++) {
912     n    = ai[1] - ai[0]; /* length of i_th block row of A */
913     x1   = xb[0];
914     x2   = xb[1];
915     x3   = xb[2];
916     x4   = xb[3];
917     ib   = aj + *ai;
918     jmin = 0;
919     nonzerorow += (n > 0);
920     if (n && *ib == i) { /* (diag of A)*x */
921       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
922       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
923       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
924       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
925       v += 16;
926       jmin++;
927     }
928     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
929     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930     for (j = jmin; j < n; j++) {
931       /* (strict lower triangular part of A)*x  */
932       cval = ib[j] * 4;
933       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
934       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
935       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
936       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
937       /* (strict upper triangular part of A)*x  */
938       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
939       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
940       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
941       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
942       v += 16;
943     }
944     xb += 4;
945     ai++;
946   }
947 
948   PetscCall(VecRestoreArrayRead(xx, &x));
949   PetscCall(VecRestoreArray(zz, &z));
950 
951   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
952   PetscFunctionReturn(PETSC_SUCCESS);
953 }
954 
955 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
956 {
957   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
958   PetscScalar       *z, x1, x2, x3, x4, x5;
959   const PetscScalar *x, *xb;
960   const MatScalar   *v;
961   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
962   const PetscInt    *aj = a->j, *ai = a->i, *ib;
963   PetscInt           nonzerorow = 0;
964 
965   PetscFunctionBegin;
966   PetscCall(VecCopy(yy, zz));
967   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
968   PetscCall(VecGetArrayRead(xx, &x));
969   PetscCall(VecGetArray(zz, &z));
970 
971   v  = a->a;
972   xb = x;
973 
974   for (i = 0; i < mbs; i++) {
975     n    = ai[1] - ai[0]; /* length of i_th block row of A */
976     x1   = xb[0];
977     x2   = xb[1];
978     x3   = xb[2];
979     x4   = xb[3];
980     x5   = xb[4];
981     ib   = aj + *ai;
982     jmin = 0;
983     nonzerorow += (n > 0);
984     if (n && *ib == i) { /* (diag of A)*x */
985       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
986       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
987       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
988       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
989       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
990       v += 25;
991       jmin++;
992     }
993     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
994     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
995     for (j = jmin; j < n; j++) {
996       /* (strict lower triangular part of A)*x  */
997       cval = ib[j] * 5;
998       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
999       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1000       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1001       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1002       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1003       /* (strict upper triangular part of A)*x  */
1004       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1005       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1006       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1007       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1008       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1009       v += 25;
1010     }
1011     xb += 5;
1012     ai++;
1013   }
1014 
1015   PetscCall(VecRestoreArrayRead(xx, &x));
1016   PetscCall(VecRestoreArray(zz, &z));
1017 
1018   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1019   PetscFunctionReturn(PETSC_SUCCESS);
1020 }
1021 
1022 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1023 {
1024   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1025   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1026   const PetscScalar *x, *xb;
1027   const MatScalar   *v;
1028   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1029   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1030   PetscInt           nonzerorow = 0;
1031 
1032   PetscFunctionBegin;
1033   PetscCall(VecCopy(yy, zz));
1034   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1035   PetscCall(VecGetArrayRead(xx, &x));
1036   PetscCall(VecGetArray(zz, &z));
1037 
1038   v  = a->a;
1039   xb = x;
1040 
1041   for (i = 0; i < mbs; i++) {
1042     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1043     x1   = xb[0];
1044     x2   = xb[1];
1045     x3   = xb[2];
1046     x4   = xb[3];
1047     x5   = xb[4];
1048     x6   = xb[5];
1049     ib   = aj + *ai;
1050     jmin = 0;
1051     nonzerorow += (n > 0);
1052     if (n && *ib == i) { /* (diag of A)*x */
1053       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1054       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1055       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1056       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1057       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1058       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1059       v += 36;
1060       jmin++;
1061     }
1062     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1063     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1064     for (j = jmin; j < n; j++) {
1065       /* (strict lower triangular part of A)*x  */
1066       cval = ib[j] * 6;
1067       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1068       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1069       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1070       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1071       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1072       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1073       /* (strict upper triangular part of A)*x  */
1074       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1075       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1076       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1077       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1078       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1079       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1080       v += 36;
1081     }
1082     xb += 6;
1083     ai++;
1084   }
1085 
1086   PetscCall(VecRestoreArrayRead(xx, &x));
1087   PetscCall(VecRestoreArray(zz, &z));
1088 
1089   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1090   PetscFunctionReturn(PETSC_SUCCESS);
1091 }
1092 
1093 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1094 {
1095   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1096   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1097   const PetscScalar *x, *xb;
1098   const MatScalar   *v;
1099   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1100   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1101   PetscInt           nonzerorow = 0;
1102 
1103   PetscFunctionBegin;
1104   PetscCall(VecCopy(yy, zz));
1105   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1106   PetscCall(VecGetArrayRead(xx, &x));
1107   PetscCall(VecGetArray(zz, &z));
1108 
1109   v  = a->a;
1110   xb = x;
1111 
1112   for (i = 0; i < mbs; i++) {
1113     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1114     x1   = xb[0];
1115     x2   = xb[1];
1116     x3   = xb[2];
1117     x4   = xb[3];
1118     x5   = xb[4];
1119     x6   = xb[5];
1120     x7   = xb[6];
1121     ib   = aj + *ai;
1122     jmin = 0;
1123     nonzerorow += (n > 0);
1124     if (n && *ib == i) { /* (diag of A)*x */
1125       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1126       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1127       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1128       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1129       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1130       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1131       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1132       v += 49;
1133       jmin++;
1134     }
1135     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1136     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1137     for (j = jmin; j < n; j++) {
1138       /* (strict lower triangular part of A)*x  */
1139       cval = ib[j] * 7;
1140       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1141       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1142       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1143       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1144       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1145       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1146       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1147       /* (strict upper triangular part of A)*x  */
1148       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1149       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1150       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1151       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1152       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1153       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1154       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1155       v += 49;
1156     }
1157     xb += 7;
1158     ai++;
1159   }
1160 
1161   PetscCall(VecRestoreArrayRead(xx, &x));
1162   PetscCall(VecRestoreArray(zz, &z));
1163 
1164   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1165   PetscFunctionReturn(PETSC_SUCCESS);
1166 }
1167 
1168 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1169 {
1170   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1171   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1172   const PetscScalar *x, *x_ptr, *xb;
1173   const MatScalar   *v;
1174   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1175   const PetscInt    *idx, *aj, *ii;
1176   PetscInt           nonzerorow = 0;
1177 
1178   PetscFunctionBegin;
1179   PetscCall(VecCopy(yy, zz));
1180   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1181   PetscCall(VecGetArrayRead(xx, &x));
1182   x_ptr = x;
1183   PetscCall(VecGetArray(zz, &z));
1184   z_ptr = z;
1185 
1186   aj = a->j;
1187   v  = a->a;
1188   ii = a->i;
1189 
1190   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1191   work = a->mult_work;
1192 
1193   for (i = 0; i < mbs; i++) {
1194     n     = ii[1] - ii[0];
1195     ncols = n * bs;
1196     workt = work;
1197     idx   = aj + ii[0];
1198     nonzerorow += (n > 0);
1199 
1200     /* upper triangular part */
1201     for (j = 0; j < n; j++) {
1202       xb = x_ptr + bs * (*idx++);
1203       for (k = 0; k < bs; k++) workt[k] = xb[k];
1204       workt += bs;
1205     }
1206     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1207     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1208 
1209     /* strict lower triangular part */
1210     idx = aj + ii[0];
1211     if (n && *idx == i) {
1212       ncols -= bs;
1213       v += bs2;
1214       idx++;
1215       n--;
1216     }
1217     if (ncols > 0) {
1218       workt = work;
1219       PetscCall(PetscArrayzero(workt, ncols));
1220       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1221       for (j = 0; j < n; j++) {
1222         zb = z_ptr + bs * (*idx++);
1223         for (k = 0; k < bs; k++) zb[k] += workt[k];
1224         workt += bs;
1225       }
1226     }
1227 
1228     x += bs;
1229     v += n * bs2;
1230     z += bs;
1231     ii++;
1232   }
1233 
1234   PetscCall(VecRestoreArrayRead(xx, &x));
1235   PetscCall(VecRestoreArray(zz, &z));
1236 
1237   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1238   PetscFunctionReturn(PETSC_SUCCESS);
1239 }
1240 
1241 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1242 {
1243   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1244   PetscScalar   oalpha = alpha;
1245   PetscBLASInt  one    = 1, totalnz;
1246 
1247   PetscFunctionBegin;
1248   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1249   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1250   PetscCall(PetscLogFlops(totalnz));
1251   PetscFunctionReturn(PETSC_SUCCESS);
1252 }
1253 
1254 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1255 {
1256   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1257   const MatScalar *v        = a->a;
1258   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1259   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1260   const PetscInt  *aj = a->j, *col;
1261 
1262   PetscFunctionBegin;
1263   if (!a->nz) {
1264     *norm = 0.0;
1265     PetscFunctionReturn(PETSC_SUCCESS);
1266   }
1267   if (type == NORM_FROBENIUS) {
1268     for (k = 0; k < mbs; k++) {
1269       jmin = a->i[k];
1270       jmax = a->i[k + 1];
1271       col  = aj + jmin;
1272       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1273         for (i = 0; i < bs2; i++) {
1274           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1275           v++;
1276         }
1277         jmin++;
1278       }
1279       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1280         for (i = 0; i < bs2; i++) {
1281           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1282           v++;
1283         }
1284       }
1285     }
1286     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1287     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1288   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1289     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1290     for (i = 0; i < mbs; i++) jl[i] = mbs;
1291     il[0] = 0;
1292 
1293     *norm = 0.0;
1294     for (k = 0; k < mbs; k++) { /* k_th block row */
1295       for (j = 0; j < bs; j++) sum[j] = 0.0;
1296       /*-- col sum --*/
1297       i = jl[k]; /* first |A(i,k)| to be added */
1298       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1299                   at step k */
1300       while (i < mbs) {
1301         nexti = jl[i]; /* next block row to be added */
1302         ik    = il[i]; /* block index of A(i,k) in the array a */
1303         for (j = 0; j < bs; j++) {
1304           v = a->a + ik * bs2 + j * bs;
1305           for (k1 = 0; k1 < bs; k1++) {
1306             sum[j] += PetscAbsScalar(*v);
1307             v++;
1308           }
1309         }
1310         /* update il, jl */
1311         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1312         jmax = a->i[i + 1];
1313         if (jmin < jmax) {
1314           il[i] = jmin;
1315           j     = a->j[jmin];
1316           jl[i] = jl[j];
1317           jl[j] = i;
1318         }
1319         i = nexti;
1320       }
1321       /*-- row sum --*/
1322       jmin = a->i[k];
1323       jmax = a->i[k + 1];
1324       for (i = jmin; i < jmax; i++) {
1325         for (j = 0; j < bs; j++) {
1326           v = a->a + i * bs2 + j;
1327           for (k1 = 0; k1 < bs; k1++) {
1328             sum[j] += PetscAbsScalar(*v);
1329             v += bs;
1330           }
1331         }
1332       }
1333       /* add k_th block row to il, jl */
1334       col = aj + jmin;
1335       if (jmax - jmin > 0 && *col == k) jmin++;
1336       if (jmin < jmax) {
1337         il[k] = jmin;
1338         j     = a->j[jmin];
1339         jl[k] = jl[j];
1340         jl[j] = k;
1341       }
1342       for (j = 0; j < bs; j++) {
1343         if (sum[j] > *norm) *norm = sum[j];
1344       }
1345     }
1346     PetscCall(PetscFree3(sum, il, jl));
1347     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1348   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1349   PetscFunctionReturn(PETSC_SUCCESS);
1350 }
1351 
1352 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1353 {
1354   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1355 
1356   PetscFunctionBegin;
1357   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1358   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1359     *flg = PETSC_FALSE;
1360     PetscFunctionReturn(PETSC_SUCCESS);
1361   }
1362 
1363   /* if the a->i are the same */
1364   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1365   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1366 
1367   /* if a->j are the same */
1368   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1369   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1370 
1371   /* if a->a are the same */
1372   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1373   PetscFunctionReturn(PETSC_SUCCESS);
1374 }
1375 
1376 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1377 {
1378   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1379   PetscInt         i, j, k, row, bs, ambs, bs2;
1380   const PetscInt  *ai, *aj;
1381   PetscScalar     *x, zero = 0.0;
1382   const MatScalar *aa, *aa_j;
1383 
1384   PetscFunctionBegin;
1385   bs = A->rmap->bs;
1386   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1387 
1388   aa   = a->a;
1389   ambs = a->mbs;
1390 
1391   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1392     PetscInt *diag = a->diag;
1393     aa             = a->a;
1394     ambs           = a->mbs;
1395     PetscCall(VecGetArray(v, &x));
1396     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1397     PetscCall(VecRestoreArray(v, &x));
1398     PetscFunctionReturn(PETSC_SUCCESS);
1399   }
1400 
1401   ai  = a->i;
1402   aj  = a->j;
1403   bs2 = a->bs2;
1404   PetscCall(VecSet(v, zero));
1405   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1406   PetscCall(VecGetArray(v, &x));
1407   for (i = 0; i < ambs; i++) {
1408     j = ai[i];
1409     if (aj[j] == i) { /* if this is a diagonal element */
1410       row  = i * bs;
1411       aa_j = aa + j * bs2;
1412       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1413     }
1414   }
1415   PetscCall(VecRestoreArray(v, &x));
1416   PetscFunctionReturn(PETSC_SUCCESS);
1417 }
1418 
1419 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1420 {
1421   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1422   PetscScalar        x;
1423   const PetscScalar *l, *li, *ri;
1424   MatScalar         *aa, *v;
1425   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1426   const PetscInt    *ai, *aj;
1427   PetscBool          flg;
1428 
1429   PetscFunctionBegin;
1430   if (ll != rr) {
1431     PetscCall(VecEqual(ll, rr, &flg));
1432     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1433   }
1434   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1435   ai  = a->i;
1436   aj  = a->j;
1437   aa  = a->a;
1438   m   = A->rmap->N;
1439   bs  = A->rmap->bs;
1440   mbs = a->mbs;
1441   bs2 = a->bs2;
1442 
1443   PetscCall(VecGetArrayRead(ll, &l));
1444   PetscCall(VecGetLocalSize(ll, &lm));
1445   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1446   for (i = 0; i < mbs; i++) { /* for each block row */
1447     M  = ai[i + 1] - ai[i];
1448     li = l + i * bs;
1449     v  = aa + bs2 * ai[i];
1450     for (j = 0; j < M; j++) { /* for each block */
1451       ri = l + bs * aj[ai[i] + j];
1452       for (k = 0; k < bs; k++) {
1453         x = ri[k];
1454         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1455       }
1456     }
1457   }
1458   PetscCall(VecRestoreArrayRead(ll, &l));
1459   PetscCall(PetscLogFlops(2.0 * a->nz));
1460   PetscFunctionReturn(PETSC_SUCCESS);
1461 }
1462 
1463 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1464 {
1465   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1466 
1467   PetscFunctionBegin;
1468   info->block_size   = a->bs2;
1469   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1470   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1471   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1472   info->assemblies   = A->num_ass;
1473   info->mallocs      = A->info.mallocs;
1474   info->memory       = 0; /* REVIEW ME */
1475   if (A->factortype) {
1476     info->fill_ratio_given  = A->info.fill_ratio_given;
1477     info->fill_ratio_needed = A->info.fill_ratio_needed;
1478     info->factor_mallocs    = A->info.factor_mallocs;
1479   } else {
1480     info->fill_ratio_given  = 0;
1481     info->fill_ratio_needed = 0;
1482     info->factor_mallocs    = 0;
1483   }
1484   PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486 
1487 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1488 {
1489   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1490 
1491   PetscFunctionBegin;
1492   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
1495 
1496 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1497 {
1498   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1499   PetscInt         i, j, n, row, col, bs, mbs;
1500   const PetscInt  *ai, *aj;
1501   PetscReal        atmp;
1502   const MatScalar *aa;
1503   PetscScalar     *x;
1504   PetscInt         ncols, brow, bcol, krow, kcol;
1505 
1506   PetscFunctionBegin;
1507   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1508   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1509   bs  = A->rmap->bs;
1510   aa  = a->a;
1511   ai  = a->i;
1512   aj  = a->j;
1513   mbs = a->mbs;
1514 
1515   PetscCall(VecSet(v, 0.0));
1516   PetscCall(VecGetArray(v, &x));
1517   PetscCall(VecGetLocalSize(v, &n));
1518   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1519   for (i = 0; i < mbs; i++) {
1520     ncols = ai[1] - ai[0];
1521     ai++;
1522     brow = bs * i;
1523     for (j = 0; j < ncols; j++) {
1524       bcol = bs * (*aj);
1525       for (kcol = 0; kcol < bs; kcol++) {
1526         col = bcol + kcol; /* col index */
1527         for (krow = 0; krow < bs; krow++) {
1528           atmp = PetscAbsScalar(*aa);
1529           aa++;
1530           row = brow + krow; /* row index */
1531           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1532           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1533         }
1534       }
1535       aj++;
1536     }
1537   }
1538   PetscCall(VecRestoreArray(v, &x));
1539   PetscFunctionReturn(PETSC_SUCCESS);
1540 }
1541 
1542 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1543 {
1544   PetscFunctionBegin;
1545   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1546   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1551 {
1552   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1553   PetscScalar       *z = c;
1554   const PetscScalar *xb;
1555   PetscScalar        x1;
1556   const MatScalar   *v   = a->a, *vv;
1557   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1558 #if defined(PETSC_USE_COMPLEX)
1559   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1560 #else
1561   const int aconj = 0;
1562 #endif
1563 
1564   PetscFunctionBegin;
1565   for (i = 0; i < mbs; i++) {
1566     n = ii[1] - ii[0];
1567     ii++;
1568     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1569     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1570     jj = idx;
1571     vv = v;
1572     for (k = 0; k < cn; k++) {
1573       idx = jj;
1574       v   = vv;
1575       for (j = 0; j < n; j++) {
1576         xb = b + (*idx);
1577         x1 = xb[0 + k * bm];
1578         z[0 + k * cm] += v[0] * x1;
1579         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1580         v += 1;
1581         ++idx;
1582       }
1583     }
1584     z += 1;
1585   }
1586   PetscFunctionReturn(PETSC_SUCCESS);
1587 }
1588 
1589 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1590 {
1591   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1592   PetscScalar       *z = c;
1593   const PetscScalar *xb;
1594   PetscScalar        x1, x2;
1595   const MatScalar   *v   = a->a, *vv;
1596   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1597 
1598   PetscFunctionBegin;
1599   for (i = 0; i < mbs; i++) {
1600     n = ii[1] - ii[0];
1601     ii++;
1602     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1603     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1604     jj = idx;
1605     vv = v;
1606     for (k = 0; k < cn; k++) {
1607       idx = jj;
1608       v   = vv;
1609       for (j = 0; j < n; j++) {
1610         xb = b + 2 * (*idx);
1611         x1 = xb[0 + k * bm];
1612         x2 = xb[1 + k * bm];
1613         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1614         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1615         if (*idx != i) {
1616           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1617           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1618         }
1619         v += 4;
1620         ++idx;
1621       }
1622     }
1623     z += 2;
1624   }
1625   PetscFunctionReturn(PETSC_SUCCESS);
1626 }
1627 
1628 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1629 {
1630   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1631   PetscScalar       *z = c;
1632   const PetscScalar *xb;
1633   PetscScalar        x1, x2, x3;
1634   const MatScalar   *v   = a->a, *vv;
1635   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1636 
1637   PetscFunctionBegin;
1638   for (i = 0; i < mbs; i++) {
1639     n = ii[1] - ii[0];
1640     ii++;
1641     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1642     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1643     jj = idx;
1644     vv = v;
1645     for (k = 0; k < cn; k++) {
1646       idx = jj;
1647       v   = vv;
1648       for (j = 0; j < n; j++) {
1649         xb = b + 3 * (*idx);
1650         x1 = xb[0 + k * bm];
1651         x2 = xb[1 + k * bm];
1652         x3 = xb[2 + k * bm];
1653         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1654         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1655         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1656         if (*idx != i) {
1657           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1658           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1659           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1660         }
1661         v += 9;
1662         ++idx;
1663       }
1664     }
1665     z += 3;
1666   }
1667   PetscFunctionReturn(PETSC_SUCCESS);
1668 }
1669 
1670 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1671 {
1672   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1673   PetscScalar       *z = c;
1674   const PetscScalar *xb;
1675   PetscScalar        x1, x2, x3, x4;
1676   const MatScalar   *v   = a->a, *vv;
1677   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1678 
1679   PetscFunctionBegin;
1680   for (i = 0; i < mbs; i++) {
1681     n = ii[1] - ii[0];
1682     ii++;
1683     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1684     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1685     jj = idx;
1686     vv = v;
1687     for (k = 0; k < cn; k++) {
1688       idx = jj;
1689       v   = vv;
1690       for (j = 0; j < n; j++) {
1691         xb = b + 4 * (*idx);
1692         x1 = xb[0 + k * bm];
1693         x2 = xb[1 + k * bm];
1694         x3 = xb[2 + k * bm];
1695         x4 = xb[3 + k * bm];
1696         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1697         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1698         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1699         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1700         if (*idx != i) {
1701           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1702           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1703           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1704           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1705         }
1706         v += 16;
1707         ++idx;
1708       }
1709     }
1710     z += 4;
1711   }
1712   PetscFunctionReturn(PETSC_SUCCESS);
1713 }
1714 
1715 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1716 {
1717   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1718   PetscScalar       *z = c;
1719   const PetscScalar *xb;
1720   PetscScalar        x1, x2, x3, x4, x5;
1721   const MatScalar   *v   = a->a, *vv;
1722   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1723 
1724   PetscFunctionBegin;
1725   for (i = 0; i < mbs; i++) {
1726     n = ii[1] - ii[0];
1727     ii++;
1728     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1729     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1730     jj = idx;
1731     vv = v;
1732     for (k = 0; k < cn; k++) {
1733       idx = jj;
1734       v   = vv;
1735       for (j = 0; j < n; j++) {
1736         xb = b + 5 * (*idx);
1737         x1 = xb[0 + k * bm];
1738         x2 = xb[1 + k * bm];
1739         x3 = xb[2 + k * bm];
1740         x4 = xb[3 + k * bm];
1741         x5 = xb[4 + k * cm];
1742         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1743         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1744         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1745         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1746         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1747         if (*idx != i) {
1748           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1749           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1750           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1751           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1752           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1753         }
1754         v += 25;
1755         ++idx;
1756       }
1757     }
1758     z += 5;
1759   }
1760   PetscFunctionReturn(PETSC_SUCCESS);
1761 }
1762 
1763 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1764 {
1765   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1766   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1767   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1768   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1769   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1770   PetscBLASInt     bbs, bcn, bbm, bcm;
1771   PetscScalar     *z = NULL;
1772   PetscScalar     *c, *b;
1773   const MatScalar *v;
1774   const PetscInt  *idx, *ii;
1775   PetscScalar      _DOne = 1.0;
1776 
1777   PetscFunctionBegin;
1778   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1779   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1780   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1781   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1782   b = bd->v;
1783   PetscCall(MatZeroEntries(C));
1784   PetscCall(MatDenseGetArray(C, &c));
1785   switch (bs) {
1786   case 1:
1787     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1788     break;
1789   case 2:
1790     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1791     break;
1792   case 3:
1793     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1794     break;
1795   case 4:
1796     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1797     break;
1798   case 5:
1799     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1800     break;
1801   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1802     PetscCall(PetscBLASIntCast(bs, &bbs));
1803     PetscCall(PetscBLASIntCast(cn, &bcn));
1804     PetscCall(PetscBLASIntCast(bm, &bbm));
1805     PetscCall(PetscBLASIntCast(cm, &bcm));
1806     idx = a->j;
1807     v   = a->a;
1808     mbs = a->mbs;
1809     ii  = a->i;
1810     z   = c;
1811     for (i = 0; i < mbs; i++) {
1812       n = ii[1] - ii[0];
1813       ii++;
1814       for (j = 0; j < n; j++) {
1815         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1816         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1817         v += bs2;
1818       }
1819       z += bs;
1820     }
1821   }
1822   PetscCall(MatDenseRestoreArray(C, &c));
1823   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1824   PetscFunctionReturn(PETSC_SUCCESS);
1825 }
1826