xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision fdfbdca60871d1230bda7d5d096e2bd2d2a23f7a)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
12   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13   const PetscInt *idx;
14   PetscBT         table_out, table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs, &table_out));
23   PetscCall(PetscMalloc1(mbs + 1, &nidx));
24   PetscCall(PetscBTCreate(mbs, &table_in));
25 
26   for (i = 0; i < is_max; i++) { /* for each is */
27     isz = 0;
28     PetscCall(PetscBTMemzero(mbs, table_out));
29 
30     /* Extract the indices, assume there can be duplicate entries */
31     PetscCall(ISGetIndices(is[i], &idx));
32     PetscCall(ISGetLocalSize(is[i], &n));
33 
34     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35     bcol_max = 0;
36     for (j = 0; j < n; ++j) {
37       brow = idx[j] / bs; /* convert the indices into block indices */
38       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39       if (!PetscBTLookupSet(table_out, brow)) {
40         nidx[isz++] = brow;
41         if (bcol_max < brow) bcol_max = brow;
42       }
43     }
44     PetscCall(ISRestoreIndices(is[i], &idx));
45     PetscCall(ISDestroy(&is[i]));
46 
47     k = 0;
48     for (j = 0; j < ov; j++) { /* for each overlap */
49       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50       PetscCall(PetscBTMemzero(mbs, table_in));
51       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52 
53       n = isz; /* length of the updated is[i] */
54       for (brow = 0; brow < mbs; brow++) {
55         start = ai[brow];
56         end   = ai[brow + 1];
57         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l < end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out, bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l < end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in, bcol)) {
72               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80   } /* for each is */
81   PetscCall(PetscBTDestroy(&table_out));
82   PetscCall(PetscFree(nidx));
83   PetscCall(PetscBTDestroy(&table_in));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88         Zero some ops' to avoid invalid use */
89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90 {
91   PetscFunctionBegin;
92   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93   Bseq->ops->mult                   = NULL;
94   Bseq->ops->multadd                = NULL;
95   Bseq->ops->multtranspose          = NULL;
96   Bseq->ops->multtransposeadd       = NULL;
97   Bseq->ops->lufactor               = NULL;
98   Bseq->ops->choleskyfactor         = NULL;
99   Bseq->ops->lufactorsymbolic       = NULL;
100   Bseq->ops->choleskyfactorsymbolic = NULL;
101   Bseq->ops->getinertia             = NULL;
102   PetscFunctionReturn(PETSC_SUCCESS);
103 }
104 
105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
107 {
108   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
109   Mat_SeqBAIJ    *d = NULL;
110   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
111   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
112   const PetscInt *irow, *icol;
113   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
114   PetscInt       *aj = a->j, *ai = a->i;
115   MatScalar      *mat_a;
116   Mat             C;
117   PetscBool       flag;
118 
119   PetscFunctionBegin;
120 
121   PetscCall(ISGetIndices(isrow, &irow));
122   PetscCall(ISGetIndices(iscol, &icol));
123   PetscCall(ISGetLocalSize(isrow, &nrows));
124   PetscCall(ISGetLocalSize(iscol, &ncols));
125 
126   PetscCall(PetscCalloc1(1 + oldcols, &smap));
127   ssmap = smap;
128   PetscCall(PetscMalloc1(1 + nrows, &lens));
129   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
130   /* determine lens of each row */
131   for (i = 0; i < nrows; i++) {
132     kstart  = ai[irow[i]];
133     kend    = kstart + a->ilen[irow[i]];
134     lens[i] = 0;
135     for (k = kstart; k < kend; k++) {
136       if (ssmap[aj[k]]) lens[i]++;
137     }
138   }
139   /* Create and fill new matrix */
140   if (scall == MAT_REUSE_MATRIX) {
141     if (sym) {
142       c = (Mat_SeqSBAIJ *)((*B)->data);
143 
144       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
145       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
146       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
147       PetscCall(PetscArrayzero(c->ilen, c->mbs));
148     } else {
149       d = (Mat_SeqBAIJ *)((*B)->data);
150 
151       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
152       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
153       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
154       PetscCall(PetscArrayzero(d->ilen, d->mbs));
155     }
156     C = *B;
157   } else {
158     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
159     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
160     if (sym) {
161       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
162       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
163     } else {
164       PetscCall(MatSetType(C, MATSEQBAIJ));
165       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
166     }
167   }
168   if (sym) c = (Mat_SeqSBAIJ *)(C->data);
169   else d = (Mat_SeqBAIJ *)(C->data);
170   for (i = 0; i < nrows; i++) {
171     row    = irow[i];
172     kstart = ai[row];
173     kend   = kstart + a->ilen[row];
174     if (sym) {
175       mat_i    = c->i[i];
176       mat_j    = c->j + mat_i;
177       mat_a    = c->a + mat_i * bs2;
178       mat_ilen = c->ilen + i;
179     } else {
180       mat_i    = d->i[i];
181       mat_j    = d->j + mat_i;
182       mat_a    = d->a + mat_i * bs2;
183       mat_ilen = d->ilen + i;
184     }
185     for (k = kstart; k < kend; k++) {
186       if ((tcol = ssmap[a->j[k]])) {
187         *mat_j++ = tcol - 1;
188         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
189         mat_a += bs2;
190         (*mat_ilen)++;
191       }
192     }
193   }
194   /* sort */
195   {
196     MatScalar *work;
197 
198     PetscCall(PetscMalloc1(bs2, &work));
199     for (i = 0; i < nrows; i++) {
200       PetscInt ilen;
201       if (sym) {
202         mat_i = c->i[i];
203         mat_j = c->j + mat_i;
204         mat_a = c->a + mat_i * bs2;
205         ilen  = c->ilen[i];
206       } else {
207         mat_i = d->i[i];
208         mat_j = d->j + mat_i;
209         mat_a = d->a + mat_i * bs2;
210         ilen  = d->ilen[i];
211       }
212       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
213     }
214     PetscCall(PetscFree(work));
215   }
216 
217   /* Free work space */
218   PetscCall(ISRestoreIndices(iscol, &icol));
219   PetscCall(PetscFree(smap));
220   PetscCall(PetscFree(lens));
221   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
222   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
223 
224   PetscCall(ISRestoreIndices(isrow, &irow));
225   *B = C;
226   PetscFunctionReturn(PETSC_SUCCESS);
227 }
228 
229 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
230 {
231   Mat       C[2];
232   IS        is1, is2, intersect = NULL;
233   PetscInt  n1, n2, ni;
234   PetscBool sym = PETSC_TRUE;
235 
236   PetscFunctionBegin;
237   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
238   if (isrow == iscol) {
239     is2 = is1;
240     PetscCall(PetscObjectReference((PetscObject)is2));
241   } else {
242     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
243     PetscCall(ISIntersect(is1, is2, &intersect));
244     PetscCall(ISGetLocalSize(intersect, &ni));
245     PetscCall(ISDestroy(&intersect));
246     if (ni == 0) sym = PETSC_FALSE;
247     else if (PetscDefined(USE_DEBUG)) {
248       PetscCall(ISGetLocalSize(is1, &n1));
249       PetscCall(ISGetLocalSize(is2, &n2));
250       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
251     }
252   }
253   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
254   else {
255     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
256     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
257     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
258     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
259     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
260     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
262     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
263     PetscCall(MatDestroy(C));
264     PetscCall(MatDestroy(C + 1));
265   }
266   PetscCall(ISDestroy(&is1));
267   PetscCall(ISDestroy(&is2));
268 
269   if (sym && isrow != iscol) {
270     PetscBool isequal;
271     PetscCall(ISEqual(isrow, iscol, &isequal));
272     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
273   }
274   PetscFunctionReturn(PETSC_SUCCESS);
275 }
276 
277 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
278 {
279   PetscInt i;
280 
281   PetscFunctionBegin;
282   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
283 
284   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
285   PetscFunctionReturn(PETSC_SUCCESS);
286 }
287 
288 /* Should check that shapes of vectors and matrices match */
289 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
290 {
291   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
292   PetscScalar       *z, x1, x2, zero = 0.0;
293   const PetscScalar *x, *xb;
294   const MatScalar   *v;
295   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
296   const PetscInt    *aj = a->j, *ai = a->i, *ib;
297   PetscInt           nonzerorow = 0;
298 
299   PetscFunctionBegin;
300   PetscCall(VecSet(zz, zero));
301   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
302   PetscCall(VecGetArrayRead(xx, &x));
303   PetscCall(VecGetArray(zz, &z));
304 
305   v  = a->a;
306   xb = x;
307 
308   for (i = 0; i < mbs; i++) {
309     n    = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1   = xb[0];
311     x2   = xb[1];
312     ib   = aj + *ai;
313     jmin = 0;
314     nonzerorow += (n > 0);
315     if (*ib == i) { /* (diag of A)*x */
316       z[2 * i] += v[0] * x1 + v[2] * x2;
317       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
318       v += 4;
319       jmin++;
320     }
321     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323     for (j = jmin; j < n; j++) {
324       /* (strict lower triangular part of A)*x  */
325       cval = ib[j] * 2;
326       z[cval] += v[0] * x1 + v[1] * x2;
327       z[cval + 1] += v[2] * x1 + v[3] * x2;
328       /* (strict upper triangular part of A)*x  */
329       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
330       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
331       v += 4;
332     }
333     xb += 2;
334     ai++;
335   }
336 
337   PetscCall(VecRestoreArrayRead(xx, &x));
338   PetscCall(VecRestoreArray(zz, &z));
339   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
344 {
345   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
346   PetscScalar       *z, x1, x2, x3, zero = 0.0;
347   const PetscScalar *x, *xb;
348   const MatScalar   *v;
349   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
350   const PetscInt    *aj = a->j, *ai = a->i, *ib;
351   PetscInt           nonzerorow = 0;
352 
353   PetscFunctionBegin;
354   PetscCall(VecSet(zz, zero));
355   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
356   PetscCall(VecGetArrayRead(xx, &x));
357   PetscCall(VecGetArray(zz, &z));
358 
359   v  = a->a;
360   xb = x;
361 
362   for (i = 0; i < mbs; i++) {
363     n    = ai[1] - ai[0]; /* length of i_th block row of A */
364     x1   = xb[0];
365     x2   = xb[1];
366     x3   = xb[2];
367     ib   = aj + *ai;
368     jmin = 0;
369     nonzerorow += (n > 0);
370     if (*ib == i) { /* (diag of A)*x */
371       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
372       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
373       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
374       v += 9;
375       jmin++;
376     }
377     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
378     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
379     for (j = jmin; j < n; j++) {
380       /* (strict lower triangular part of A)*x  */
381       cval = ib[j] * 3;
382       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
383       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
384       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
385       /* (strict upper triangular part of A)*x  */
386       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
387       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
388       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
389       v += 9;
390     }
391     xb += 3;
392     ai++;
393   }
394 
395   PetscCall(VecRestoreArrayRead(xx, &x));
396   PetscCall(VecRestoreArray(zz, &z));
397   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
398   PetscFunctionReturn(PETSC_SUCCESS);
399 }
400 
401 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
402 {
403   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
404   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
405   const PetscScalar *x, *xb;
406   const MatScalar   *v;
407   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
408   const PetscInt    *aj = a->j, *ai = a->i, *ib;
409   PetscInt           nonzerorow = 0;
410 
411   PetscFunctionBegin;
412   PetscCall(VecSet(zz, zero));
413   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
414   PetscCall(VecGetArrayRead(xx, &x));
415   PetscCall(VecGetArray(zz, &z));
416 
417   v  = a->a;
418   xb = x;
419 
420   for (i = 0; i < mbs; i++) {
421     n    = ai[1] - ai[0]; /* length of i_th block row of A */
422     x1   = xb[0];
423     x2   = xb[1];
424     x3   = xb[2];
425     x4   = xb[3];
426     ib   = aj + *ai;
427     jmin = 0;
428     nonzerorow += (n > 0);
429     if (*ib == i) { /* (diag of A)*x */
430       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
431       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
432       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
433       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
434       v += 16;
435       jmin++;
436     }
437     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
438     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439     for (j = jmin; j < n; j++) {
440       /* (strict lower triangular part of A)*x  */
441       cval = ib[j] * 4;
442       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
443       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
444       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
445       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
446       /* (strict upper triangular part of A)*x  */
447       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
448       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
449       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
450       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
451       v += 16;
452     }
453     xb += 4;
454     ai++;
455   }
456 
457   PetscCall(VecRestoreArrayRead(xx, &x));
458   PetscCall(VecRestoreArray(zz, &z));
459   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
460   PetscFunctionReturn(PETSC_SUCCESS);
461 }
462 
463 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
464 {
465   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
466   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
467   const PetscScalar *x, *xb;
468   const MatScalar   *v;
469   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
470   const PetscInt    *aj = a->j, *ai = a->i, *ib;
471   PetscInt           nonzerorow = 0;
472 
473   PetscFunctionBegin;
474   PetscCall(VecSet(zz, zero));
475   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
476   PetscCall(VecGetArrayRead(xx, &x));
477   PetscCall(VecGetArray(zz, &z));
478 
479   v  = a->a;
480   xb = x;
481 
482   for (i = 0; i < mbs; i++) {
483     n    = ai[1] - ai[0]; /* length of i_th block row of A */
484     x1   = xb[0];
485     x2   = xb[1];
486     x3   = xb[2];
487     x4   = xb[3];
488     x5   = xb[4];
489     ib   = aj + *ai;
490     jmin = 0;
491     nonzerorow += (n > 0);
492     if (*ib == i) { /* (diag of A)*x */
493       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
494       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
495       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
496       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
497       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
498       v += 25;
499       jmin++;
500     }
501     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
502     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503     for (j = jmin; j < n; j++) {
504       /* (strict lower triangular part of A)*x  */
505       cval = ib[j] * 5;
506       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
507       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
508       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
509       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
510       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
511       /* (strict upper triangular part of A)*x  */
512       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
513       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
514       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
515       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
516       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
517       v += 25;
518     }
519     xb += 5;
520     ai++;
521   }
522 
523   PetscCall(VecRestoreArrayRead(xx, &x));
524   PetscCall(VecRestoreArray(zz, &z));
525   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
529 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
530 {
531   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
532   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
533   const PetscScalar *x, *xb;
534   const MatScalar   *v;
535   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
536   const PetscInt    *aj = a->j, *ai = a->i, *ib;
537   PetscInt           nonzerorow = 0;
538 
539   PetscFunctionBegin;
540   PetscCall(VecSet(zz, zero));
541   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
542   PetscCall(VecGetArrayRead(xx, &x));
543   PetscCall(VecGetArray(zz, &z));
544 
545   v  = a->a;
546   xb = x;
547 
548   for (i = 0; i < mbs; i++) {
549     n    = ai[1] - ai[0]; /* length of i_th block row of A */
550     x1   = xb[0];
551     x2   = xb[1];
552     x3   = xb[2];
553     x4   = xb[3];
554     x5   = xb[4];
555     x6   = xb[5];
556     ib   = aj + *ai;
557     jmin = 0;
558     nonzerorow += (n > 0);
559     if (*ib == i) { /* (diag of A)*x */
560       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
561       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
562       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
563       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
564       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
565       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
566       v += 36;
567       jmin++;
568     }
569     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
570     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
571     for (j = jmin; j < n; j++) {
572       /* (strict lower triangular part of A)*x  */
573       cval = ib[j] * 6;
574       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
575       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
576       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
577       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
578       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
579       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
580       /* (strict upper triangular part of A)*x  */
581       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
582       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
583       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
584       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
585       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
586       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
587       v += 36;
588     }
589     xb += 6;
590     ai++;
591   }
592 
593   PetscCall(VecRestoreArrayRead(xx, &x));
594   PetscCall(VecRestoreArray(zz, &z));
595   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
596   PetscFunctionReturn(PETSC_SUCCESS);
597 }
598 
599 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
600 {
601   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
602   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
603   const PetscScalar *x, *xb;
604   const MatScalar   *v;
605   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
606   const PetscInt    *aj = a->j, *ai = a->i, *ib;
607   PetscInt           nonzerorow = 0;
608 
609   PetscFunctionBegin;
610   PetscCall(VecSet(zz, zero));
611   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
612   PetscCall(VecGetArrayRead(xx, &x));
613   PetscCall(VecGetArray(zz, &z));
614 
615   v  = a->a;
616   xb = x;
617 
618   for (i = 0; i < mbs; i++) {
619     n    = ai[1] - ai[0]; /* length of i_th block row of A */
620     x1   = xb[0];
621     x2   = xb[1];
622     x3   = xb[2];
623     x4   = xb[3];
624     x5   = xb[4];
625     x6   = xb[5];
626     x7   = xb[6];
627     ib   = aj + *ai;
628     jmin = 0;
629     nonzerorow += (n > 0);
630     if (*ib == i) { /* (diag of A)*x */
631       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
632       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
633       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
634       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
635       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
636       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
637       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
638       v += 49;
639       jmin++;
640     }
641     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
642     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643     for (j = jmin; j < n; j++) {
644       /* (strict lower triangular part of A)*x  */
645       cval = ib[j] * 7;
646       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
647       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
648       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
649       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
650       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
651       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
652       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
653       /* (strict upper triangular part of A)*x  */
654       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
655       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
656       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
657       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
658       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
659       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
660       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
661       v += 49;
662     }
663     xb += 7;
664     ai++;
665   }
666   PetscCall(VecRestoreArrayRead(xx, &x));
667   PetscCall(VecRestoreArray(zz, &z));
668   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
669   PetscFunctionReturn(PETSC_SUCCESS);
670 }
671 
672 /*
673     This will not work with MatScalar == float because it calls the BLAS
674 */
675 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
676 {
677   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
678   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
679   const PetscScalar *x, *x_ptr, *xb;
680   const MatScalar   *v;
681   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
682   const PetscInt    *idx, *aj, *ii;
683   PetscInt           nonzerorow = 0;
684 
685   PetscFunctionBegin;
686   PetscCall(VecSet(zz, zero));
687   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
688   PetscCall(VecGetArrayRead(xx, &x));
689   PetscCall(VecGetArray(zz, &z));
690 
691   x_ptr = x;
692   z_ptr = z;
693 
694   aj = a->j;
695   v  = a->a;
696   ii = a->i;
697 
698   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
699   work = a->mult_work;
700 
701   for (i = 0; i < mbs; i++) {
702     n     = ii[1] - ii[0];
703     ncols = n * bs;
704     workt = work;
705     idx   = aj + ii[0];
706     nonzerorow += (n > 0);
707 
708     /* upper triangular part */
709     for (j = 0; j < n; j++) {
710       xb = x_ptr + bs * (*idx++);
711       for (k = 0; k < bs; k++) workt[k] = xb[k];
712       workt += bs;
713     }
714     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
715     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
716 
717     /* strict lower triangular part */
718     idx = aj + ii[0];
719     if (n && *idx == i) {
720       ncols -= bs;
721       v += bs2;
722       idx++;
723       n--;
724     }
725 
726     if (ncols > 0) {
727       workt = work;
728       PetscCall(PetscArrayzero(workt, ncols));
729       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
730       for (j = 0; j < n; j++) {
731         zb = z_ptr + bs * (*idx++);
732         for (k = 0; k < bs; k++) zb[k] += workt[k];
733         workt += bs;
734       }
735     }
736     x += bs;
737     v += n * bs2;
738     z += bs;
739     ii++;
740   }
741 
742   PetscCall(VecRestoreArrayRead(xx, &x));
743   PetscCall(VecRestoreArray(zz, &z));
744   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
745   PetscFunctionReturn(PETSC_SUCCESS);
746 }
747 
748 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
749 {
750   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
751   PetscScalar       *z, x1;
752   const PetscScalar *x, *xb;
753   const MatScalar   *v;
754   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
755   const PetscInt    *aj = a->j, *ai = a->i, *ib;
756   PetscInt           nonzerorow = 0;
757 #if defined(PETSC_USE_COMPLEX)
758   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
759 #else
760   const int aconj = 0;
761 #endif
762 
763   PetscFunctionBegin;
764   PetscCall(VecCopy(yy, zz));
765   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
766   PetscCall(VecGetArrayRead(xx, &x));
767   PetscCall(VecGetArray(zz, &z));
768   v  = a->a;
769   xb = x;
770 
771   for (i = 0; i < mbs; i++) {
772     n    = ai[1] - ai[0]; /* length of i_th row of A */
773     x1   = xb[0];
774     ib   = aj + *ai;
775     jmin = 0;
776     nonzerorow += (n > 0);
777     if (n && *ib == i) { /* (diag of A)*x */
778       z[i] += *v++ * x[*ib++];
779       jmin++;
780     }
781     if (aconj) {
782       for (j = jmin; j < n; j++) {
783         cval = *ib;
784         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
785         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
786       }
787     } else {
788       for (j = jmin; j < n; j++) {
789         cval = *ib;
790         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
791         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
792       }
793     }
794     xb++;
795     ai++;
796   }
797 
798   PetscCall(VecRestoreArrayRead(xx, &x));
799   PetscCall(VecRestoreArray(zz, &z));
800 
801   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
802   PetscFunctionReturn(PETSC_SUCCESS);
803 }
804 
805 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
806 {
807   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
808   PetscScalar       *z, x1, x2;
809   const PetscScalar *x, *xb;
810   const MatScalar   *v;
811   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
812   const PetscInt    *aj = a->j, *ai = a->i, *ib;
813   PetscInt           nonzerorow = 0;
814 
815   PetscFunctionBegin;
816   PetscCall(VecCopy(yy, zz));
817   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
818   PetscCall(VecGetArrayRead(xx, &x));
819   PetscCall(VecGetArray(zz, &z));
820 
821   v  = a->a;
822   xb = x;
823 
824   for (i = 0; i < mbs; i++) {
825     n    = ai[1] - ai[0]; /* length of i_th block row of A */
826     x1   = xb[0];
827     x2   = xb[1];
828     ib   = aj + *ai;
829     jmin = 0;
830     nonzerorow += (n > 0);
831     if (n && *ib == i) { /* (diag of A)*x */
832       z[2 * i] += v[0] * x1 + v[2] * x2;
833       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
834       v += 4;
835       jmin++;
836     }
837     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839     for (j = jmin; j < n; j++) {
840       /* (strict lower triangular part of A)*x  */
841       cval = ib[j] * 2;
842       z[cval] += v[0] * x1 + v[1] * x2;
843       z[cval + 1] += v[2] * x1 + v[3] * x2;
844       /* (strict upper triangular part of A)*x  */
845       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
846       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
847       v += 4;
848     }
849     xb += 2;
850     ai++;
851   }
852   PetscCall(VecRestoreArrayRead(xx, &x));
853   PetscCall(VecRestoreArray(zz, &z));
854 
855   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
856   PetscFunctionReturn(PETSC_SUCCESS);
857 }
858 
859 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
860 {
861   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
862   PetscScalar       *z, x1, x2, x3;
863   const PetscScalar *x, *xb;
864   const MatScalar   *v;
865   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
866   const PetscInt    *aj = a->j, *ai = a->i, *ib;
867   PetscInt           nonzerorow = 0;
868 
869   PetscFunctionBegin;
870   PetscCall(VecCopy(yy, zz));
871   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
872   PetscCall(VecGetArrayRead(xx, &x));
873   PetscCall(VecGetArray(zz, &z));
874 
875   v  = a->a;
876   xb = x;
877 
878   for (i = 0; i < mbs; i++) {
879     n    = ai[1] - ai[0]; /* length of i_th block row of A */
880     x1   = xb[0];
881     x2   = xb[1];
882     x3   = xb[2];
883     ib   = aj + *ai;
884     jmin = 0;
885     nonzerorow += (n > 0);
886     if (n && *ib == i) { /* (diag of A)*x */
887       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
888       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
889       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
890       v += 9;
891       jmin++;
892     }
893     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
894     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
895     for (j = jmin; j < n; j++) {
896       /* (strict lower triangular part of A)*x  */
897       cval = ib[j] * 3;
898       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
899       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
900       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
901       /* (strict upper triangular part of A)*x  */
902       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
903       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
904       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
905       v += 9;
906     }
907     xb += 3;
908     ai++;
909   }
910 
911   PetscCall(VecRestoreArrayRead(xx, &x));
912   PetscCall(VecRestoreArray(zz, &z));
913 
914   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
915   PetscFunctionReturn(PETSC_SUCCESS);
916 }
917 
918 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
919 {
920   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
921   PetscScalar       *z, x1, x2, x3, x4;
922   const PetscScalar *x, *xb;
923   const MatScalar   *v;
924   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
925   const PetscInt    *aj = a->j, *ai = a->i, *ib;
926   PetscInt           nonzerorow = 0;
927 
928   PetscFunctionBegin;
929   PetscCall(VecCopy(yy, zz));
930   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
931   PetscCall(VecGetArrayRead(xx, &x));
932   PetscCall(VecGetArray(zz, &z));
933 
934   v  = a->a;
935   xb = x;
936 
937   for (i = 0; i < mbs; i++) {
938     n    = ai[1] - ai[0]; /* length of i_th block row of A */
939     x1   = xb[0];
940     x2   = xb[1];
941     x3   = xb[2];
942     x4   = xb[3];
943     ib   = aj + *ai;
944     jmin = 0;
945     nonzerorow += (n > 0);
946     if (n && *ib == i) { /* (diag of A)*x */
947       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
948       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
949       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
950       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
951       v += 16;
952       jmin++;
953     }
954     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
955     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956     for (j = jmin; j < n; j++) {
957       /* (strict lower triangular part of A)*x  */
958       cval = ib[j] * 4;
959       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
960       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
961       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
962       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
963       /* (strict upper triangular part of A)*x  */
964       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
965       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
966       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
967       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
968       v += 16;
969     }
970     xb += 4;
971     ai++;
972   }
973 
974   PetscCall(VecRestoreArrayRead(xx, &x));
975   PetscCall(VecRestoreArray(zz, &z));
976 
977   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
978   PetscFunctionReturn(PETSC_SUCCESS);
979 }
980 
981 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
982 {
983   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
984   PetscScalar       *z, x1, x2, x3, x4, x5;
985   const PetscScalar *x, *xb;
986   const MatScalar   *v;
987   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
988   const PetscInt    *aj = a->j, *ai = a->i, *ib;
989   PetscInt           nonzerorow = 0;
990 
991   PetscFunctionBegin;
992   PetscCall(VecCopy(yy, zz));
993   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
994   PetscCall(VecGetArrayRead(xx, &x));
995   PetscCall(VecGetArray(zz, &z));
996 
997   v  = a->a;
998   xb = x;
999 
1000   for (i = 0; i < mbs; i++) {
1001     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1002     x1   = xb[0];
1003     x2   = xb[1];
1004     x3   = xb[2];
1005     x4   = xb[3];
1006     x5   = xb[4];
1007     ib   = aj + *ai;
1008     jmin = 0;
1009     nonzerorow += (n > 0);
1010     if (n && *ib == i) { /* (diag of A)*x */
1011       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1012       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1013       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1014       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1015       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1016       v += 25;
1017       jmin++;
1018     }
1019     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1020     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021     for (j = jmin; j < n; j++) {
1022       /* (strict lower triangular part of A)*x  */
1023       cval = ib[j] * 5;
1024       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1025       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1026       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1027       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1028       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1029       /* (strict upper triangular part of A)*x  */
1030       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1031       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1032       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1033       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1034       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1035       v += 25;
1036     }
1037     xb += 5;
1038     ai++;
1039   }
1040 
1041   PetscCall(VecRestoreArrayRead(xx, &x));
1042   PetscCall(VecRestoreArray(zz, &z));
1043 
1044   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1045   PetscFunctionReturn(PETSC_SUCCESS);
1046 }
1047 
1048 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1049 {
1050   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1051   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1052   const PetscScalar *x, *xb;
1053   const MatScalar   *v;
1054   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1055   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1056   PetscInt           nonzerorow = 0;
1057 
1058   PetscFunctionBegin;
1059   PetscCall(VecCopy(yy, zz));
1060   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1061   PetscCall(VecGetArrayRead(xx, &x));
1062   PetscCall(VecGetArray(zz, &z));
1063 
1064   v  = a->a;
1065   xb = x;
1066 
1067   for (i = 0; i < mbs; i++) {
1068     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1069     x1   = xb[0];
1070     x2   = xb[1];
1071     x3   = xb[2];
1072     x4   = xb[3];
1073     x5   = xb[4];
1074     x6   = xb[5];
1075     ib   = aj + *ai;
1076     jmin = 0;
1077     nonzerorow += (n > 0);
1078     if (n && *ib == i) { /* (diag of A)*x */
1079       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1080       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1081       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1082       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1083       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1084       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1085       v += 36;
1086       jmin++;
1087     }
1088     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1089     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1090     for (j = jmin; j < n; j++) {
1091       /* (strict lower triangular part of A)*x  */
1092       cval = ib[j] * 6;
1093       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1094       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1095       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1096       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1097       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1098       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1099       /* (strict upper triangular part of A)*x  */
1100       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1101       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1102       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1103       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1104       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1105       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1106       v += 36;
1107     }
1108     xb += 6;
1109     ai++;
1110   }
1111 
1112   PetscCall(VecRestoreArrayRead(xx, &x));
1113   PetscCall(VecRestoreArray(zz, &z));
1114 
1115   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1120 {
1121   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1122   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1123   const PetscScalar *x, *xb;
1124   const MatScalar   *v;
1125   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1126   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1127   PetscInt           nonzerorow = 0;
1128 
1129   PetscFunctionBegin;
1130   PetscCall(VecCopy(yy, zz));
1131   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1132   PetscCall(VecGetArrayRead(xx, &x));
1133   PetscCall(VecGetArray(zz, &z));
1134 
1135   v  = a->a;
1136   xb = x;
1137 
1138   for (i = 0; i < mbs; i++) {
1139     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1140     x1   = xb[0];
1141     x2   = xb[1];
1142     x3   = xb[2];
1143     x4   = xb[3];
1144     x5   = xb[4];
1145     x6   = xb[5];
1146     x7   = xb[6];
1147     ib   = aj + *ai;
1148     jmin = 0;
1149     nonzerorow += (n > 0);
1150     if (n && *ib == i) { /* (diag of A)*x */
1151       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1152       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1153       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1154       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1155       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1156       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1157       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1158       v += 49;
1159       jmin++;
1160     }
1161     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1162     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1163     for (j = jmin; j < n; j++) {
1164       /* (strict lower triangular part of A)*x  */
1165       cval = ib[j] * 7;
1166       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1167       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1168       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1169       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1170       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1171       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1172       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1173       /* (strict upper triangular part of A)*x  */
1174       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1175       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1176       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1177       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1178       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1179       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1180       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1181       v += 49;
1182     }
1183     xb += 7;
1184     ai++;
1185   }
1186 
1187   PetscCall(VecRestoreArrayRead(xx, &x));
1188   PetscCall(VecRestoreArray(zz, &z));
1189 
1190   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1191   PetscFunctionReturn(PETSC_SUCCESS);
1192 }
1193 
1194 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1195 {
1196   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1197   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1198   const PetscScalar *x, *x_ptr, *xb;
1199   const MatScalar   *v;
1200   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1201   const PetscInt    *idx, *aj, *ii;
1202   PetscInt           nonzerorow = 0;
1203 
1204   PetscFunctionBegin;
1205   PetscCall(VecCopy(yy, zz));
1206   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1207   PetscCall(VecGetArrayRead(xx, &x));
1208   x_ptr = x;
1209   PetscCall(VecGetArray(zz, &z));
1210   z_ptr = z;
1211 
1212   aj = a->j;
1213   v  = a->a;
1214   ii = a->i;
1215 
1216   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1217   work = a->mult_work;
1218 
1219   for (i = 0; i < mbs; i++) {
1220     n     = ii[1] - ii[0];
1221     ncols = n * bs;
1222     workt = work;
1223     idx   = aj + ii[0];
1224     nonzerorow += (n > 0);
1225 
1226     /* upper triangular part */
1227     for (j = 0; j < n; j++) {
1228       xb = x_ptr + bs * (*idx++);
1229       for (k = 0; k < bs; k++) workt[k] = xb[k];
1230       workt += bs;
1231     }
1232     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1233     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1234 
1235     /* strict lower triangular part */
1236     idx = aj + ii[0];
1237     if (n && *idx == i) {
1238       ncols -= bs;
1239       v += bs2;
1240       idx++;
1241       n--;
1242     }
1243     if (ncols > 0) {
1244       workt = work;
1245       PetscCall(PetscArrayzero(workt, ncols));
1246       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1247       for (j = 0; j < n; j++) {
1248         zb = z_ptr + bs * (*idx++);
1249         for (k = 0; k < bs; k++) zb[k] += workt[k];
1250         workt += bs;
1251       }
1252     }
1253 
1254     x += bs;
1255     v += n * bs2;
1256     z += bs;
1257     ii++;
1258   }
1259 
1260   PetscCall(VecRestoreArrayRead(xx, &x));
1261   PetscCall(VecRestoreArray(zz, &z));
1262 
1263   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1264   PetscFunctionReturn(PETSC_SUCCESS);
1265 }
1266 
1267 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1268 {
1269   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1270   PetscScalar   oalpha = alpha;
1271   PetscBLASInt  one    = 1, totalnz;
1272 
1273   PetscFunctionBegin;
1274   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1275   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1276   PetscCall(PetscLogFlops(totalnz));
1277   PetscFunctionReturn(PETSC_SUCCESS);
1278 }
1279 
1280 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1281 {
1282   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1283   const MatScalar *v        = a->a;
1284   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1285   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1286   const PetscInt  *aj = a->j, *col;
1287 
1288   PetscFunctionBegin;
1289   if (!a->nz) {
1290     *norm = 0.0;
1291     PetscFunctionReturn(PETSC_SUCCESS);
1292   }
1293   if (type == NORM_FROBENIUS) {
1294     for (k = 0; k < mbs; k++) {
1295       jmin = a->i[k];
1296       jmax = a->i[k + 1];
1297       col  = aj + jmin;
1298       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1299         for (i = 0; i < bs2; i++) {
1300           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1301           v++;
1302         }
1303         jmin++;
1304       }
1305       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1306         for (i = 0; i < bs2; i++) {
1307           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1308           v++;
1309         }
1310       }
1311     }
1312     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1313     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1314   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1315     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1316     for (i = 0; i < mbs; i++) jl[i] = mbs;
1317     il[0] = 0;
1318 
1319     *norm = 0.0;
1320     for (k = 0; k < mbs; k++) { /* k_th block row */
1321       for (j = 0; j < bs; j++) sum[j] = 0.0;
1322       /*-- col sum --*/
1323       i = jl[k]; /* first |A(i,k)| to be added */
1324       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1325                   at step k */
1326       while (i < mbs) {
1327         nexti = jl[i]; /* next block row to be added */
1328         ik    = il[i]; /* block index of A(i,k) in the array a */
1329         for (j = 0; j < bs; j++) {
1330           v = a->a + ik * bs2 + j * bs;
1331           for (k1 = 0; k1 < bs; k1++) {
1332             sum[j] += PetscAbsScalar(*v);
1333             v++;
1334           }
1335         }
1336         /* update il, jl */
1337         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1338         jmax = a->i[i + 1];
1339         if (jmin < jmax) {
1340           il[i] = jmin;
1341           j     = a->j[jmin];
1342           jl[i] = jl[j];
1343           jl[j] = i;
1344         }
1345         i = nexti;
1346       }
1347       /*-- row sum --*/
1348       jmin = a->i[k];
1349       jmax = a->i[k + 1];
1350       for (i = jmin; i < jmax; i++) {
1351         for (j = 0; j < bs; j++) {
1352           v = a->a + i * bs2 + j;
1353           for (k1 = 0; k1 < bs; k1++) {
1354             sum[j] += PetscAbsScalar(*v);
1355             v += bs;
1356           }
1357         }
1358       }
1359       /* add k_th block row to il, jl */
1360       col = aj + jmin;
1361       if (jmax - jmin > 0 && *col == k) jmin++;
1362       if (jmin < jmax) {
1363         il[k] = jmin;
1364         j     = a->j[jmin];
1365         jl[k] = jl[j];
1366         jl[j] = k;
1367       }
1368       for (j = 0; j < bs; j++) {
1369         if (sum[j] > *norm) *norm = sum[j];
1370       }
1371     }
1372     PetscCall(PetscFree3(sum, il, jl));
1373     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1374   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1375   PetscFunctionReturn(PETSC_SUCCESS);
1376 }
1377 
1378 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1379 {
1380   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1381 
1382   PetscFunctionBegin;
1383   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1384   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1385     *flg = PETSC_FALSE;
1386     PetscFunctionReturn(PETSC_SUCCESS);
1387   }
1388 
1389   /* if the a->i are the same */
1390   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1391   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1392 
1393   /* if a->j are the same */
1394   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1395   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1396 
1397   /* if a->a are the same */
1398   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1399   PetscFunctionReturn(PETSC_SUCCESS);
1400 }
1401 
1402 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1403 {
1404   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1405   PetscInt         i, j, k, row, bs, ambs, bs2;
1406   const PetscInt  *ai, *aj;
1407   PetscScalar     *x, zero = 0.0;
1408   const MatScalar *aa, *aa_j;
1409 
1410   PetscFunctionBegin;
1411   bs = A->rmap->bs;
1412   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1413 
1414   aa   = a->a;
1415   ambs = a->mbs;
1416 
1417   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1418     PetscInt *diag = a->diag;
1419     aa             = a->a;
1420     ambs           = a->mbs;
1421     PetscCall(VecGetArray(v, &x));
1422     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1423     PetscCall(VecRestoreArray(v, &x));
1424     PetscFunctionReturn(PETSC_SUCCESS);
1425   }
1426 
1427   ai  = a->i;
1428   aj  = a->j;
1429   bs2 = a->bs2;
1430   PetscCall(VecSet(v, zero));
1431   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1432   PetscCall(VecGetArray(v, &x));
1433   for (i = 0; i < ambs; i++) {
1434     j = ai[i];
1435     if (aj[j] == i) { /* if this is a diagonal element */
1436       row  = i * bs;
1437       aa_j = aa + j * bs2;
1438       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1439     }
1440   }
1441   PetscCall(VecRestoreArray(v, &x));
1442   PetscFunctionReturn(PETSC_SUCCESS);
1443 }
1444 
1445 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1446 {
1447   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1448   PetscScalar        x;
1449   const PetscScalar *l, *li, *ri;
1450   MatScalar         *aa, *v;
1451   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1452   const PetscInt    *ai, *aj;
1453   PetscBool          flg;
1454 
1455   PetscFunctionBegin;
1456   if (ll != rr) {
1457     PetscCall(VecEqual(ll, rr, &flg));
1458     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1459   }
1460   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1461   ai  = a->i;
1462   aj  = a->j;
1463   aa  = a->a;
1464   m   = A->rmap->N;
1465   bs  = A->rmap->bs;
1466   mbs = a->mbs;
1467   bs2 = a->bs2;
1468 
1469   PetscCall(VecGetArrayRead(ll, &l));
1470   PetscCall(VecGetLocalSize(ll, &lm));
1471   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1472   for (i = 0; i < mbs; i++) { /* for each block row */
1473     M  = ai[i + 1] - ai[i];
1474     li = l + i * bs;
1475     v  = aa + bs2 * ai[i];
1476     for (j = 0; j < M; j++) { /* for each block */
1477       ri = l + bs * aj[ai[i] + j];
1478       for (k = 0; k < bs; k++) {
1479         x = ri[k];
1480         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1481       }
1482     }
1483   }
1484   PetscCall(VecRestoreArrayRead(ll, &l));
1485   PetscCall(PetscLogFlops(2.0 * a->nz));
1486   PetscFunctionReturn(PETSC_SUCCESS);
1487 }
1488 
1489 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1490 {
1491   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1492 
1493   PetscFunctionBegin;
1494   info->block_size   = a->bs2;
1495   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1496   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1497   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1498   info->assemblies   = A->num_ass;
1499   info->mallocs      = A->info.mallocs;
1500   info->memory       = 0; /* REVIEW ME */
1501   if (A->factortype) {
1502     info->fill_ratio_given  = A->info.fill_ratio_given;
1503     info->fill_ratio_needed = A->info.fill_ratio_needed;
1504     info->factor_mallocs    = A->info.factor_mallocs;
1505   } else {
1506     info->fill_ratio_given  = 0;
1507     info->fill_ratio_needed = 0;
1508     info->factor_mallocs    = 0;
1509   }
1510   PetscFunctionReturn(PETSC_SUCCESS);
1511 }
1512 
1513 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1514 {
1515   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1516 
1517   PetscFunctionBegin;
1518   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1519   PetscFunctionReturn(PETSC_SUCCESS);
1520 }
1521 
1522 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1523 {
1524   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1525   PetscInt         i, j, n, row, col, bs, mbs;
1526   const PetscInt  *ai, *aj;
1527   PetscReal        atmp;
1528   const MatScalar *aa;
1529   PetscScalar     *x;
1530   PetscInt         ncols, brow, bcol, krow, kcol;
1531 
1532   PetscFunctionBegin;
1533   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1534   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1535   bs  = A->rmap->bs;
1536   aa  = a->a;
1537   ai  = a->i;
1538   aj  = a->j;
1539   mbs = a->mbs;
1540 
1541   PetscCall(VecSet(v, 0.0));
1542   PetscCall(VecGetArray(v, &x));
1543   PetscCall(VecGetLocalSize(v, &n));
1544   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1545   for (i = 0; i < mbs; i++) {
1546     ncols = ai[1] - ai[0];
1547     ai++;
1548     brow = bs * i;
1549     for (j = 0; j < ncols; j++) {
1550       bcol = bs * (*aj);
1551       for (kcol = 0; kcol < bs; kcol++) {
1552         col = bcol + kcol; /* col index */
1553         for (krow = 0; krow < bs; krow++) {
1554           atmp = PetscAbsScalar(*aa);
1555           aa++;
1556           row = brow + krow; /* row index */
1557           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1558           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1559         }
1560       }
1561       aj++;
1562     }
1563   }
1564   PetscCall(VecRestoreArray(v, &x));
1565   PetscFunctionReturn(PETSC_SUCCESS);
1566 }
1567 
1568 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1569 {
1570   PetscFunctionBegin;
1571   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1572   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1573   PetscFunctionReturn(PETSC_SUCCESS);
1574 }
1575 
1576 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1577 {
1578   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1579   PetscScalar       *z = c;
1580   const PetscScalar *xb;
1581   PetscScalar        x1;
1582   const MatScalar   *v   = a->a, *vv;
1583   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1584 #if defined(PETSC_USE_COMPLEX)
1585   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1586 #else
1587   const int aconj = 0;
1588 #endif
1589 
1590   PetscFunctionBegin;
1591   for (i = 0; i < mbs; i++) {
1592     n = ii[1] - ii[0];
1593     ii++;
1594     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1595     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1596     jj = idx;
1597     vv = v;
1598     for (k = 0; k < cn; k++) {
1599       idx = jj;
1600       v   = vv;
1601       for (j = 0; j < n; j++) {
1602         xb = b + (*idx);
1603         x1 = xb[0 + k * bm];
1604         z[0 + k * cm] += v[0] * x1;
1605         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1606         v += 1;
1607         ++idx;
1608       }
1609     }
1610     z += 1;
1611   }
1612   PetscFunctionReturn(PETSC_SUCCESS);
1613 }
1614 
1615 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1616 {
1617   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1618   PetscScalar       *z = c;
1619   const PetscScalar *xb;
1620   PetscScalar        x1, x2;
1621   const MatScalar   *v   = a->a, *vv;
1622   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1623 
1624   PetscFunctionBegin;
1625   for (i = 0; i < mbs; i++) {
1626     n = ii[1] - ii[0];
1627     ii++;
1628     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1629     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1630     jj = idx;
1631     vv = v;
1632     for (k = 0; k < cn; k++) {
1633       idx = jj;
1634       v   = vv;
1635       for (j = 0; j < n; j++) {
1636         xb = b + 2 * (*idx);
1637         x1 = xb[0 + k * bm];
1638         x2 = xb[1 + k * bm];
1639         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1640         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1641         if (*idx != i) {
1642           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1643           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1644         }
1645         v += 4;
1646         ++idx;
1647       }
1648     }
1649     z += 2;
1650   }
1651   PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653 
1654 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1655 {
1656   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1657   PetscScalar       *z = c;
1658   const PetscScalar *xb;
1659   PetscScalar        x1, x2, x3;
1660   const MatScalar   *v   = a->a, *vv;
1661   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1662 
1663   PetscFunctionBegin;
1664   for (i = 0; i < mbs; i++) {
1665     n = ii[1] - ii[0];
1666     ii++;
1667     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1668     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1669     jj = idx;
1670     vv = v;
1671     for (k = 0; k < cn; k++) {
1672       idx = jj;
1673       v   = vv;
1674       for (j = 0; j < n; j++) {
1675         xb = b + 3 * (*idx);
1676         x1 = xb[0 + k * bm];
1677         x2 = xb[1 + k * bm];
1678         x3 = xb[2 + k * bm];
1679         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1680         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1681         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1682         if (*idx != i) {
1683           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1684           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1685           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1686         }
1687         v += 9;
1688         ++idx;
1689       }
1690     }
1691     z += 3;
1692   }
1693   PetscFunctionReturn(PETSC_SUCCESS);
1694 }
1695 
1696 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1697 {
1698   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1699   PetscScalar       *z = c;
1700   const PetscScalar *xb;
1701   PetscScalar        x1, x2, x3, x4;
1702   const MatScalar   *v   = a->a, *vv;
1703   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1704 
1705   PetscFunctionBegin;
1706   for (i = 0; i < mbs; i++) {
1707     n = ii[1] - ii[0];
1708     ii++;
1709     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1710     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1711     jj = idx;
1712     vv = v;
1713     for (k = 0; k < cn; k++) {
1714       idx = jj;
1715       v   = vv;
1716       for (j = 0; j < n; j++) {
1717         xb = b + 4 * (*idx);
1718         x1 = xb[0 + k * bm];
1719         x2 = xb[1 + k * bm];
1720         x3 = xb[2 + k * bm];
1721         x4 = xb[3 + k * bm];
1722         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1723         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1724         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1725         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1726         if (*idx != i) {
1727           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1728           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1729           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1730           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1731         }
1732         v += 16;
1733         ++idx;
1734       }
1735     }
1736     z += 4;
1737   }
1738   PetscFunctionReturn(PETSC_SUCCESS);
1739 }
1740 
1741 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1742 {
1743   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1744   PetscScalar       *z = c;
1745   const PetscScalar *xb;
1746   PetscScalar        x1, x2, x3, x4, x5;
1747   const MatScalar   *v   = a->a, *vv;
1748   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1749 
1750   PetscFunctionBegin;
1751   for (i = 0; i < mbs; i++) {
1752     n = ii[1] - ii[0];
1753     ii++;
1754     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1755     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1756     jj = idx;
1757     vv = v;
1758     for (k = 0; k < cn; k++) {
1759       idx = jj;
1760       v   = vv;
1761       for (j = 0; j < n; j++) {
1762         xb = b + 5 * (*idx);
1763         x1 = xb[0 + k * bm];
1764         x2 = xb[1 + k * bm];
1765         x3 = xb[2 + k * bm];
1766         x4 = xb[3 + k * bm];
1767         x5 = xb[4 + k * cm];
1768         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1769         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1770         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1771         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1772         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1773         if (*idx != i) {
1774           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1775           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1776           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1777           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1778           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1779         }
1780         v += 25;
1781         ++idx;
1782       }
1783     }
1784     z += 5;
1785   }
1786   PetscFunctionReturn(PETSC_SUCCESS);
1787 }
1788 
1789 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1790 {
1791   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1792   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1793   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1794   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1795   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1796   PetscBLASInt     bbs, bcn, bbm, bcm;
1797   PetscScalar     *z = NULL;
1798   PetscScalar     *c, *b;
1799   const MatScalar *v;
1800   const PetscInt  *idx, *ii;
1801   PetscScalar      _DOne = 1.0;
1802 
1803   PetscFunctionBegin;
1804   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1805   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1806   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1807   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1808   b = bd->v;
1809   PetscCall(MatZeroEntries(C));
1810   PetscCall(MatDenseGetArray(C, &c));
1811   switch (bs) {
1812   case 1:
1813     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1814     break;
1815   case 2:
1816     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1817     break;
1818   case 3:
1819     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1820     break;
1821   case 4:
1822     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1823     break;
1824   case 5:
1825     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1826     break;
1827   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1828     PetscCall(PetscBLASIntCast(bs, &bbs));
1829     PetscCall(PetscBLASIntCast(cn, &bcn));
1830     PetscCall(PetscBLASIntCast(bm, &bbm));
1831     PetscCall(PetscBLASIntCast(cm, &bcm));
1832     idx = a->j;
1833     v   = a->a;
1834     mbs = a->mbs;
1835     ii  = a->i;
1836     z   = c;
1837     for (i = 0; i < mbs; i++) {
1838       n = ii[1] - ii[0];
1839       ii++;
1840       for (j = 0; j < n; j++) {
1841         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1842         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1843         v += bs2;
1844       }
1845       z += bs;
1846     }
1847   }
1848   PetscCall(MatDenseRestoreArray(C, &c));
1849   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1850   PetscFunctionReturn(PETSC_SUCCESS);
1851 }
1852