xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
9*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10*d71ae5a4SJacob Faibussowitsch {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
125d0c19d7SBarry Smith   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT         table_out, table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
1708401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18d94109b8SHong Zhang   mbs = a->mbs;
19d94109b8SHong Zhang   ai  = a->i;
20d94109b8SHong Zhang   aj  = a->j;
21d0f46423SBarry Smith   bs  = A->rmap->bs;
229566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_out));
239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs + 1, &nidx));
249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
259566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_in));
26d94109b8SHong Zhang 
27d94109b8SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
28d94109b8SHong Zhang     isz = 0;
299566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs, table_out));
30d94109b8SHong Zhang 
31d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
329566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
339566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
34d94109b8SHong Zhang 
35db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36dbe03f88SHong Zhang     bcol_max = 0;
37d94109b8SHong Zhang     for (j = 0; j < n; ++j) {
38d94109b8SHong Zhang       brow = idx[j] / bs; /* convert the indices into block indices */
3908401ef6SPierre Jolivet       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
40db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out, brow)) {
41dbe03f88SHong Zhang         nidx[isz++] = brow;
42dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
43dbe03f88SHong Zhang       }
44d94109b8SHong Zhang     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47d94109b8SHong Zhang 
48d94109b8SHong Zhang     k = 0;
49d94109b8SHong Zhang     for (j = 0; j < ov; j++) { /* for each overlap */
50db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
519566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs, table_in));
529566063dSJacob Faibussowitsch       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
53d94109b8SHong Zhang 
54d94109b8SHong Zhang       n = isz; /* length of the updated is[i] */
55d94109b8SHong Zhang       for (brow = 0; brow < mbs; brow++) {
569371c9d4SSatish Balay         start = ai[brow];
579371c9d4SSatish Balay         end   = ai[brow + 1];
58db41cccfSHong Zhang         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59d94109b8SHong Zhang           for (l = start; l < end; l++) {
60d94109b8SHong Zhang             bcol = aj[l];
61d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out, bcol)) {
62d7b97159SHong Zhang               nidx[isz++] = bcol;
63d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
64d7b97159SHong Zhang             }
65d94109b8SHong Zhang           }
66d94109b8SHong Zhang           k++;
67d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68d94109b8SHong Zhang         } else {             /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69d94109b8SHong Zhang           for (l = start; l < end; l++) {
70d94109b8SHong Zhang             bcol = aj[l];
71dbe03f88SHong Zhang             if (bcol > bcol_max) break;
72db41cccfSHong Zhang             if (PetscBTLookup(table_in, bcol)) {
7326fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
74d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
75d94109b8SHong Zhang             }
76d94109b8SHong Zhang           }
77d94109b8SHong Zhang         }
78d94109b8SHong Zhang       }
79d94109b8SHong Zhang     } /* for each overlap */
80d94109b8SHong Zhang 
81d94109b8SHong Zhang     /* expand the Index Set */
82d94109b8SHong Zhang     for (j = 0; j < isz; j++) {
8326fbe8dcSKarl Rupp       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
84d94109b8SHong Zhang     }
859566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
86d94109b8SHong Zhang   }
879566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
889566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
899566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
909566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
915eee224dSHong Zhang   PetscFunctionReturn(0);
9249b5e25fSSatish Balay }
9349b5e25fSSatish Balay 
94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
96*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97*d71ae5a4SJacob Faibussowitsch {
9849b5e25fSSatish Balay   PetscFunctionBegin;
999566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
100f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
101f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
102f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
103f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
104f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
105f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
106f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
107f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
108f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
10949b5e25fSSatish Balay   PetscFunctionReturn(0);
11049b5e25fSSatish Balay }
11149b5e25fSSatish Balay 
1127dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
113*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
114*d71ae5a4SJacob Faibussowitsch {
115e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
116e50a8114SHong Zhang   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
117e50a8114SHong Zhang   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
118e50a8114SHong Zhang   const PetscInt *irow, *icol;
119e50a8114SHong Zhang   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
120e50a8114SHong Zhang   PetscInt       *aj = a->j, *ai = a->i;
121e50a8114SHong Zhang   MatScalar      *mat_a;
122e50a8114SHong Zhang   Mat             C;
123e50a8114SHong Zhang   PetscBool       flag;
124e50a8114SHong Zhang 
125e50a8114SHong Zhang   PetscFunctionBegin;
126e50a8114SHong Zhang 
1279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1299566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1309566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
131e50a8114SHong Zhang 
1329566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
133e50a8114SHong Zhang   ssmap = smap;
1349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
135e50a8114SHong Zhang   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
136e50a8114SHong Zhang   /* determine lens of each row */
137e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
138e50a8114SHong Zhang     kstart  = ai[irow[i]];
139e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
140e50a8114SHong Zhang     lens[i] = 0;
141e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
142e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
143e50a8114SHong Zhang     }
144e50a8114SHong Zhang   }
145e50a8114SHong Zhang   /* Create and fill new matrix */
146e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
147e50a8114SHong Zhang     c = (Mat_SeqSBAIJ *)((*B)->data);
148e50a8114SHong Zhang 
149aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1509566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
15128b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1529566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
153e50a8114SHong Zhang     C = *B;
154e50a8114SHong Zhang   } else {
1559566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1569566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1579566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1589566063dSJacob Faibussowitsch     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
159e50a8114SHong Zhang   }
160e50a8114SHong Zhang   c = (Mat_SeqSBAIJ *)(C->data);
161e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
162e50a8114SHong Zhang     row      = irow[i];
163e50a8114SHong Zhang     kstart   = ai[row];
164e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
165e50a8114SHong Zhang     mat_i    = c->i[i];
166e50a8114SHong Zhang     mat_j    = c->j + mat_i;
167e50a8114SHong Zhang     mat_a    = c->a + mat_i * bs2;
168e50a8114SHong Zhang     mat_ilen = c->ilen + i;
169e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
170e50a8114SHong Zhang       if ((tcol = ssmap[a->j[k]])) {
171e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1729566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
173e50a8114SHong Zhang         mat_a += bs2;
174e50a8114SHong Zhang         (*mat_ilen)++;
175e50a8114SHong Zhang       }
176e50a8114SHong Zhang     }
177e50a8114SHong Zhang   }
178e50a8114SHong Zhang   /* sort */
179e50a8114SHong Zhang   {
180e50a8114SHong Zhang     MatScalar *work;
181e50a8114SHong Zhang 
1829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
183e50a8114SHong Zhang     for (i = 0; i < nrows; i++) {
184e50a8114SHong Zhang       PetscInt ilen;
185e50a8114SHong Zhang       mat_i = c->i[i];
186e50a8114SHong Zhang       mat_j = c->j + mat_i;
187e50a8114SHong Zhang       mat_a = c->a + mat_i * bs2;
188e50a8114SHong Zhang       ilen  = c->ilen[i];
1899566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
190e50a8114SHong Zhang     }
1919566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
192e50a8114SHong Zhang   }
193e50a8114SHong Zhang 
194e50a8114SHong Zhang   /* Free work space */
1959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1969566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1979566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1989566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1999566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
200e50a8114SHong Zhang 
2019566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
202e50a8114SHong Zhang   *B = C;
203e50a8114SHong Zhang   PetscFunctionReturn(0);
204e50a8114SHong Zhang }
205e50a8114SHong Zhang 
206*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
207*d71ae5a4SJacob Faibussowitsch {
208e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
209e50a8114SHong Zhang   IS              is1, is2;
210e50a8114SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
211e50a8114SHong Zhang   const PetscInt *irow, *icol;
21249b5e25fSSatish Balay 
21349b5e25fSSatish Balay   PetscFunctionBegin;
2149566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
2159566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
2169566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
2179566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
218e50a8114SHong Zhang 
219e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
220e50a8114SHong Zhang    and form the IS with compressed IS */
221e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs, a->nbs);
2229566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
2239566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
224e50a8114SHong Zhang   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
225ad540459SPierre Jolivet   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
226e50a8114SHong Zhang   count = 0;
227e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
228e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
229e50a8114SHong Zhang     if ((vary[j]--) == bs) iary[count++] = j;
230e50a8114SHong Zhang   }
2319566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
232e50a8114SHong Zhang 
2339566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
234e50a8114SHong Zhang   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
235ad540459SPierre Jolivet   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
236e50a8114SHong Zhang   count = 0;
237e50a8114SHong Zhang   for (i = 0; i < ncols; i++) {
238e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
239e50a8114SHong Zhang     if ((vary[j]--) == bs) iary[count++] = j;
240e50a8114SHong Zhang   }
2419566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
2429566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
2439566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2449566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
245e50a8114SHong Zhang 
2469566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
2479566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2489566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
249847daeccSHong Zhang 
2508f46ffcaSHong Zhang   if (isrow != iscol) {
2518f46ffcaSHong Zhang     PetscBool isequal;
2529566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow, iscol, &isequal));
25348a46eb9SPierre Jolivet     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
25449b5e25fSSatish Balay   }
25549b5e25fSSatish Balay   PetscFunctionReturn(0);
25649b5e25fSSatish Balay }
25749b5e25fSSatish Balay 
258*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
259*d71ae5a4SJacob Faibussowitsch {
26013f74950SBarry Smith   PetscInt i;
26149b5e25fSSatish Balay 
26249b5e25fSSatish Balay   PetscFunctionBegin;
26348a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
264e50a8114SHong Zhang 
26548a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
26649b5e25fSSatish Balay   PetscFunctionReturn(0);
26749b5e25fSSatish Balay }
26849b5e25fSSatish Balay 
26949b5e25fSSatish Balay /* -------------------------------------------------------*/
27049b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
27149b5e25fSSatish Balay /* -------------------------------------------------------*/
27249b5e25fSSatish Balay 
273*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
274*d71ae5a4SJacob Faibussowitsch {
27549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
276d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, zero = 0.0;
277d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
278d9ca1df4SBarry Smith   const MatScalar   *v;
279d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
280d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
28198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
28249b5e25fSSatish Balay 
28349b5e25fSSatish Balay   PetscFunctionBegin;
2849566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
285c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
2869566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2879566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
28849b5e25fSSatish Balay 
28949b5e25fSSatish Balay   v  = a->a;
29049b5e25fSSatish Balay   xb = x;
29149b5e25fSSatish Balay 
29249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
29349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
2949371c9d4SSatish Balay     x1   = xb[0];
2959371c9d4SSatish Balay     x2   = xb[1];
29649b5e25fSSatish Balay     ib   = aj + *ai;
297831a3094SHong Zhang     jmin = 0;
29898c9bda7SSatish Balay     nonzerorow += (n > 0);
2997fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
30049b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
30149b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
3029371c9d4SSatish Balay       v += 4;
3039371c9d4SSatish Balay       jmin++;
3047fbae186SHong Zhang     }
305444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
306444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
307831a3094SHong Zhang     for (j = jmin; j < n; j++) {
30849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
30949b5e25fSSatish Balay       cval = ib[j] * 2;
31049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
31149b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
31249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
31349b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
31449b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
31549b5e25fSSatish Balay       v += 4;
31649b5e25fSSatish Balay     }
3179371c9d4SSatish Balay     xb += 2;
3189371c9d4SSatish Balay     ai++;
31949b5e25fSSatish Balay   }
32049b5e25fSSatish Balay 
3219566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3229566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3239566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
32449b5e25fSSatish Balay   PetscFunctionReturn(0);
32549b5e25fSSatish Balay }
32649b5e25fSSatish Balay 
327*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
328*d71ae5a4SJacob Faibussowitsch {
32949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
330d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, zero = 0.0;
331d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
332d9ca1df4SBarry Smith   const MatScalar   *v;
333d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
334d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
33598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
33649b5e25fSSatish Balay 
33749b5e25fSSatish Balay   PetscFunctionBegin;
3389566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
339c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
3409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3419566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
34249b5e25fSSatish Balay 
34349b5e25fSSatish Balay   v  = a->a;
34449b5e25fSSatish Balay   xb = x;
34549b5e25fSSatish Balay 
34649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
34749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3489371c9d4SSatish Balay     x1   = xb[0];
3499371c9d4SSatish Balay     x2   = xb[1];
3509371c9d4SSatish Balay     x3   = xb[2];
35149b5e25fSSatish Balay     ib   = aj + *ai;
352831a3094SHong Zhang     jmin = 0;
35398c9bda7SSatish Balay     nonzerorow += (n > 0);
3547fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
35549b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
35649b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
35749b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3589371c9d4SSatish Balay       v += 9;
3599371c9d4SSatish Balay       jmin++;
3607fbae186SHong Zhang     }
361444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
362444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
363831a3094SHong Zhang     for (j = jmin; j < n; j++) {
36449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
36549b5e25fSSatish Balay       cval = ib[j] * 3;
36649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
36749b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
36849b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
36949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
37049b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
37149b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
37249b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
37349b5e25fSSatish Balay       v += 9;
37449b5e25fSSatish Balay     }
3759371c9d4SSatish Balay     xb += 3;
3769371c9d4SSatish Balay     ai++;
37749b5e25fSSatish Balay   }
37849b5e25fSSatish Balay 
3799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3809566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3819566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
38249b5e25fSSatish Balay   PetscFunctionReturn(0);
38349b5e25fSSatish Balay }
38449b5e25fSSatish Balay 
385*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
386*d71ae5a4SJacob Faibussowitsch {
38749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
388d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
389d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
390d9ca1df4SBarry Smith   const MatScalar   *v;
391d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
392d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
39398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
39449b5e25fSSatish Balay 
39549b5e25fSSatish Balay   PetscFunctionBegin;
3969566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
397c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
3989566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3999566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
40049b5e25fSSatish Balay 
40149b5e25fSSatish Balay   v  = a->a;
40249b5e25fSSatish Balay   xb = x;
40349b5e25fSSatish Balay 
40449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
40549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4069371c9d4SSatish Balay     x1   = xb[0];
4079371c9d4SSatish Balay     x2   = xb[1];
4089371c9d4SSatish Balay     x3   = xb[2];
4099371c9d4SSatish Balay     x4   = xb[3];
41049b5e25fSSatish Balay     ib   = aj + *ai;
411831a3094SHong Zhang     jmin = 0;
41298c9bda7SSatish Balay     nonzerorow += (n > 0);
4137fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
41449b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
41549b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
41649b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
41749b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
4189371c9d4SSatish Balay       v += 16;
4199371c9d4SSatish Balay       jmin++;
4207fbae186SHong Zhang     }
421444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
422444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
423831a3094SHong Zhang     for (j = jmin; j < n; j++) {
42449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
42549b5e25fSSatish Balay       cval = ib[j] * 4;
42649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
42749b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
42849b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
42949b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
43049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43149b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
43249b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
43349b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
43449b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
43549b5e25fSSatish Balay       v += 16;
43649b5e25fSSatish Balay     }
4379371c9d4SSatish Balay     xb += 4;
4389371c9d4SSatish Balay     ai++;
43949b5e25fSSatish Balay   }
44049b5e25fSSatish Balay 
4419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4439566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
44449b5e25fSSatish Balay   PetscFunctionReturn(0);
44549b5e25fSSatish Balay }
44649b5e25fSSatish Balay 
447*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
448*d71ae5a4SJacob Faibussowitsch {
44949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
450d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
451d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
452d9ca1df4SBarry Smith   const MatScalar   *v;
453d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
454d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
45598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
45649b5e25fSSatish Balay 
45749b5e25fSSatish Balay   PetscFunctionBegin;
4589566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
459c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
4609566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4619566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
46249b5e25fSSatish Balay 
46349b5e25fSSatish Balay   v  = a->a;
46449b5e25fSSatish Balay   xb = x;
46549b5e25fSSatish Balay 
46649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
46749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4689371c9d4SSatish Balay     x1   = xb[0];
4699371c9d4SSatish Balay     x2   = xb[1];
4709371c9d4SSatish Balay     x3   = xb[2];
4719371c9d4SSatish Balay     x4   = xb[3];
4729371c9d4SSatish Balay     x5   = xb[4];
47349b5e25fSSatish Balay     ib   = aj + *ai;
474831a3094SHong Zhang     jmin = 0;
47598c9bda7SSatish Balay     nonzerorow += (n > 0);
4767fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
47749b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
47849b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
47949b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
48049b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
48149b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
4829371c9d4SSatish Balay       v += 25;
4839371c9d4SSatish Balay       jmin++;
4847fbae186SHong Zhang     }
485444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
486444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
487831a3094SHong Zhang     for (j = jmin; j < n; j++) {
48849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
48949b5e25fSSatish Balay       cval = ib[j] * 5;
49049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
49149b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
49249b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
49349b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
49449b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
49549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49649b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
49749b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
49849b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
49949b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
50049b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
50149b5e25fSSatish Balay       v += 25;
50249b5e25fSSatish Balay     }
5039371c9d4SSatish Balay     xb += 5;
5049371c9d4SSatish Balay     ai++;
50549b5e25fSSatish Balay   }
50649b5e25fSSatish Balay 
5079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5099566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
51049b5e25fSSatish Balay   PetscFunctionReturn(0);
51149b5e25fSSatish Balay }
51249b5e25fSSatish Balay 
513*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
514*d71ae5a4SJacob Faibussowitsch {
51549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
516d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
517d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
518d9ca1df4SBarry Smith   const MatScalar   *v;
519d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
520d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
52198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
52249b5e25fSSatish Balay 
52349b5e25fSSatish Balay   PetscFunctionBegin;
5249566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
525c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
52849b5e25fSSatish Balay 
52949b5e25fSSatish Balay   v  = a->a;
53049b5e25fSSatish Balay   xb = x;
53149b5e25fSSatish Balay 
53249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
53349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5349371c9d4SSatish Balay     x1   = xb[0];
5359371c9d4SSatish Balay     x2   = xb[1];
5369371c9d4SSatish Balay     x3   = xb[2];
5379371c9d4SSatish Balay     x4   = xb[3];
5389371c9d4SSatish Balay     x5   = xb[4];
5399371c9d4SSatish Balay     x6   = xb[5];
54049b5e25fSSatish Balay     ib   = aj + *ai;
541831a3094SHong Zhang     jmin = 0;
54298c9bda7SSatish Balay     nonzerorow += (n > 0);
5437fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
54449b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
54549b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
54649b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
54749b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
54849b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
54949b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
5509371c9d4SSatish Balay       v += 36;
5519371c9d4SSatish Balay       jmin++;
5527fbae186SHong Zhang     }
553444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
554444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
555831a3094SHong Zhang     for (j = jmin; j < n; j++) {
55649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55749b5e25fSSatish Balay       cval = ib[j] * 6;
55849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
55949b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
56049b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
56149b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
56249b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
56349b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
56449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
56549b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
56649b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
56749b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
56849b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
56949b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
57049b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
57149b5e25fSSatish Balay       v += 36;
57249b5e25fSSatish Balay     }
5739371c9d4SSatish Balay     xb += 6;
5749371c9d4SSatish Balay     ai++;
57549b5e25fSSatish Balay   }
57649b5e25fSSatish Balay 
5779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
58049b5e25fSSatish Balay   PetscFunctionReturn(0);
58149b5e25fSSatish Balay }
582c2916339SPierre Jolivet 
583*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
584*d71ae5a4SJacob Faibussowitsch {
58549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
586d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
587d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
588d9ca1df4SBarry Smith   const MatScalar   *v;
589d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
590d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
59198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
59249b5e25fSSatish Balay 
59349b5e25fSSatish Balay   PetscFunctionBegin;
5949566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
595c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5969566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5979566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
59849b5e25fSSatish Balay 
59949b5e25fSSatish Balay   v  = a->a;
60049b5e25fSSatish Balay   xb = x;
60149b5e25fSSatish Balay 
60249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
60349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
6049371c9d4SSatish Balay     x1   = xb[0];
6059371c9d4SSatish Balay     x2   = xb[1];
6069371c9d4SSatish Balay     x3   = xb[2];
6079371c9d4SSatish Balay     x4   = xb[3];
6089371c9d4SSatish Balay     x5   = xb[4];
6099371c9d4SSatish Balay     x6   = xb[5];
6109371c9d4SSatish Balay     x7   = xb[6];
61149b5e25fSSatish Balay     ib   = aj + *ai;
612831a3094SHong Zhang     jmin = 0;
61398c9bda7SSatish Balay     nonzerorow += (n > 0);
6147fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
61549b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
61649b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
61749b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
61849b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
61949b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
62049b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
62149b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
6229371c9d4SSatish Balay       v += 49;
6239371c9d4SSatish Balay       jmin++;
6247fbae186SHong Zhang     }
625444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
626444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
627831a3094SHong Zhang     for (j = jmin; j < n; j++) {
62849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
62949b5e25fSSatish Balay       cval = ib[j] * 7;
63049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
63149b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
63249b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
63349b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
63449b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
63549b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
63649b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
63749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
63849b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
63949b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
64049b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
64149b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
64249b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
64349b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
64449b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
64549b5e25fSSatish Balay       v += 49;
64649b5e25fSSatish Balay     }
6479371c9d4SSatish Balay     xb += 7;
6489371c9d4SSatish Balay     ai++;
64949b5e25fSSatish Balay   }
6509566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6529566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
65349b5e25fSSatish Balay   PetscFunctionReturn(0);
65449b5e25fSSatish Balay }
65549b5e25fSSatish Balay 
65649b5e25fSSatish Balay /*
65749b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
65849b5e25fSSatish Balay */
659*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
660*d71ae5a4SJacob Faibussowitsch {
66149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
662d9ca1df4SBarry Smith   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
663d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
664d9ca1df4SBarry Smith   const MatScalar   *v;
665d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
666d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
66798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
66849b5e25fSSatish Balay 
66949b5e25fSSatish Balay   PetscFunctionBegin;
6709566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
671c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
6729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6739566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
67459689ffbSStefano Zampini 
67559689ffbSStefano Zampini   x_ptr = x;
67659689ffbSStefano Zampini   z_ptr = z;
67749b5e25fSSatish Balay 
67849b5e25fSSatish Balay   aj = a->j;
67949b5e25fSSatish Balay   v  = a->a;
68049b5e25fSSatish Balay   ii = a->i;
68149b5e25fSSatish Balay 
68248a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
68349b5e25fSSatish Balay   work = a->mult_work;
68449b5e25fSSatish Balay 
68549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
6869371c9d4SSatish Balay     n     = ii[1] - ii[0];
6879371c9d4SSatish Balay     ncols = n * bs;
6889371c9d4SSatish Balay     workt = work;
6899371c9d4SSatish Balay     idx   = aj + ii[0];
69098c9bda7SSatish Balay     nonzerorow += (n > 0);
69149b5e25fSSatish Balay 
69249b5e25fSSatish Balay     /* upper triangular part */
69349b5e25fSSatish Balay     for (j = 0; j < n; j++) {
69449b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
69549b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
69649b5e25fSSatish Balay       workt += bs;
69749b5e25fSSatish Balay     }
69849b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
69996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
70049b5e25fSSatish Balay 
70149b5e25fSSatish Balay     /* strict lower triangular part */
702831a3094SHong Zhang     idx = aj + ii[0];
70359689ffbSStefano Zampini     if (n && *idx == i) {
7049371c9d4SSatish Balay       ncols -= bs;
7059371c9d4SSatish Balay       v += bs2;
7069371c9d4SSatish Balay       idx++;
7079371c9d4SSatish Balay       n--;
708831a3094SHong Zhang     }
70996b9376eSHong Zhang 
71049b5e25fSSatish Balay     if (ncols > 0) {
71149b5e25fSSatish Balay       workt = work;
7129566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
71396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
714831a3094SHong Zhang       for (j = 0; j < n; j++) {
715831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
71649b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
71749b5e25fSSatish Balay         workt += bs;
71849b5e25fSSatish Balay       }
71949b5e25fSSatish Balay     }
7209371c9d4SSatish Balay     x += bs;
7219371c9d4SSatish Balay     v += n * bs2;
7229371c9d4SSatish Balay     z += bs;
7239371c9d4SSatish Balay     ii++;
72449b5e25fSSatish Balay   }
72549b5e25fSSatish Balay 
7269566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
7289566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
72949b5e25fSSatish Balay   PetscFunctionReturn(0);
73049b5e25fSSatish Balay }
73149b5e25fSSatish Balay 
732*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
733*d71ae5a4SJacob Faibussowitsch {
73449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
735d9ca1df4SBarry Smith   PetscScalar       *z, x1;
736d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
737d9ca1df4SBarry Smith   const MatScalar   *v;
738d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
739d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
74098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
741eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
742b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
743eb1ec7c1SStefano Zampini #else
744eb1ec7c1SStefano Zampini   const int aconj = 0;
745eb1ec7c1SStefano Zampini #endif
74649b5e25fSSatish Balay 
74749b5e25fSSatish Balay   PetscFunctionBegin;
7489566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
749c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
7509566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7519566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
75249b5e25fSSatish Balay   v  = a->a;
75349b5e25fSSatish Balay   xb = x;
75449b5e25fSSatish Balay 
75549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
75649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th row of A */
75749b5e25fSSatish Balay     x1   = xb[0];
75849b5e25fSSatish Balay     ib   = aj + *ai;
759831a3094SHong Zhang     jmin = 0;
76098c9bda7SSatish Balay     nonzerorow += (n > 0);
7613d9ade75SStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
7629371c9d4SSatish Balay       z[i] += *v++ * x[*ib++];
7639371c9d4SSatish Balay       jmin++;
764831a3094SHong Zhang     }
765eb1ec7c1SStefano Zampini     if (aconj) {
766eb1ec7c1SStefano Zampini       for (j = jmin; j < n; j++) {
767eb1ec7c1SStefano Zampini         cval = *ib;
768eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
769eb1ec7c1SStefano Zampini         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
770eb1ec7c1SStefano Zampini       }
771eb1ec7c1SStefano Zampini     } else {
772831a3094SHong Zhang       for (j = jmin; j < n; j++) {
77349b5e25fSSatish Balay         cval = *ib;
77449b5e25fSSatish Balay         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
77549b5e25fSSatish Balay         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
77649b5e25fSSatish Balay       }
777eb1ec7c1SStefano Zampini     }
7789371c9d4SSatish Balay     xb++;
7799371c9d4SSatish Balay     ai++;
78049b5e25fSSatish Balay   }
78149b5e25fSSatish Balay 
7829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
78449b5e25fSSatish Balay 
7859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
78649b5e25fSSatish Balay   PetscFunctionReturn(0);
78749b5e25fSSatish Balay }
78849b5e25fSSatish Balay 
789*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
790*d71ae5a4SJacob Faibussowitsch {
79149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
792d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2;
793d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
794d9ca1df4SBarry Smith   const MatScalar   *v;
795d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
796d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
79798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
79849b5e25fSSatish Balay 
79949b5e25fSSatish Balay   PetscFunctionBegin;
8009566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
801c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
8029566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8039566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
80449b5e25fSSatish Balay 
80549b5e25fSSatish Balay   v  = a->a;
80649b5e25fSSatish Balay   xb = x;
80749b5e25fSSatish Balay 
80849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
80949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8109371c9d4SSatish Balay     x1   = xb[0];
8119371c9d4SSatish Balay     x2   = xb[1];
81249b5e25fSSatish Balay     ib   = aj + *ai;
813831a3094SHong Zhang     jmin = 0;
81498c9bda7SSatish Balay     nonzerorow += (n > 0);
81559689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
81649b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
81749b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
8189371c9d4SSatish Balay       v += 4;
8199371c9d4SSatish Balay       jmin++;
8207fbae186SHong Zhang     }
821444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
822444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
823831a3094SHong Zhang     for (j = jmin; j < n; j++) {
82449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
82549b5e25fSSatish Balay       cval = ib[j] * 2;
82649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
82749b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
82849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
82949b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
83049b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
83149b5e25fSSatish Balay       v += 4;
83249b5e25fSSatish Balay     }
8339371c9d4SSatish Balay     xb += 2;
8349371c9d4SSatish Balay     ai++;
83549b5e25fSSatish Balay   }
8369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
83849b5e25fSSatish Balay 
8399566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
84049b5e25fSSatish Balay   PetscFunctionReturn(0);
84149b5e25fSSatish Balay }
84249b5e25fSSatish Balay 
843*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
844*d71ae5a4SJacob Faibussowitsch {
84549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
846d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3;
847d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
848d9ca1df4SBarry Smith   const MatScalar   *v;
849d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
850d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
85198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
85249b5e25fSSatish Balay 
85349b5e25fSSatish Balay   PetscFunctionBegin;
8549566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
855c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
8569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8579566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
85849b5e25fSSatish Balay 
85949b5e25fSSatish Balay   v  = a->a;
86049b5e25fSSatish Balay   xb = x;
86149b5e25fSSatish Balay 
86249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
86349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8649371c9d4SSatish Balay     x1   = xb[0];
8659371c9d4SSatish Balay     x2   = xb[1];
8669371c9d4SSatish Balay     x3   = xb[2];
86749b5e25fSSatish Balay     ib   = aj + *ai;
868831a3094SHong Zhang     jmin = 0;
86998c9bda7SSatish Balay     nonzerorow += (n > 0);
87059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
87149b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
87249b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
87349b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
8749371c9d4SSatish Balay       v += 9;
8759371c9d4SSatish Balay       jmin++;
8767fbae186SHong Zhang     }
877444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
878444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
879831a3094SHong Zhang     for (j = jmin; j < n; j++) {
88049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
88149b5e25fSSatish Balay       cval = ib[j] * 3;
88249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
88349b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
88449b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
88549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
88649b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
88749b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
88849b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
88949b5e25fSSatish Balay       v += 9;
89049b5e25fSSatish Balay     }
8919371c9d4SSatish Balay     xb += 3;
8929371c9d4SSatish Balay     ai++;
89349b5e25fSSatish Balay   }
89449b5e25fSSatish Balay 
8959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
89749b5e25fSSatish Balay 
8989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
89949b5e25fSSatish Balay   PetscFunctionReturn(0);
90049b5e25fSSatish Balay }
90149b5e25fSSatish Balay 
902*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
903*d71ae5a4SJacob Faibussowitsch {
90449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
905d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4;
906d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
907d9ca1df4SBarry Smith   const MatScalar   *v;
908d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
909d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
91098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
91149b5e25fSSatish Balay 
91249b5e25fSSatish Balay   PetscFunctionBegin;
9139566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
914c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9159566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9169566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
91749b5e25fSSatish Balay 
91849b5e25fSSatish Balay   v  = a->a;
91949b5e25fSSatish Balay   xb = x;
92049b5e25fSSatish Balay 
92149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
92249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9239371c9d4SSatish Balay     x1   = xb[0];
9249371c9d4SSatish Balay     x2   = xb[1];
9259371c9d4SSatish Balay     x3   = xb[2];
9269371c9d4SSatish Balay     x4   = xb[3];
92749b5e25fSSatish Balay     ib   = aj + *ai;
928831a3094SHong Zhang     jmin = 0;
92998c9bda7SSatish Balay     nonzerorow += (n > 0);
93059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
93149b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
93249b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
93349b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
93449b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
9359371c9d4SSatish Balay       v += 16;
9369371c9d4SSatish Balay       jmin++;
9377fbae186SHong Zhang     }
938444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
939444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
940831a3094SHong Zhang     for (j = jmin; j < n; j++) {
94149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
94249b5e25fSSatish Balay       cval = ib[j] * 4;
94349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
94449b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
94549b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
94649b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
94749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
94849b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
94949b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
95049b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
95149b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
95249b5e25fSSatish Balay       v += 16;
95349b5e25fSSatish Balay     }
9549371c9d4SSatish Balay     xb += 4;
9559371c9d4SSatish Balay     ai++;
95649b5e25fSSatish Balay   }
95749b5e25fSSatish Balay 
9589566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9599566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
96049b5e25fSSatish Balay 
9619566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
96249b5e25fSSatish Balay   PetscFunctionReturn(0);
96349b5e25fSSatish Balay }
96449b5e25fSSatish Balay 
965*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
966*d71ae5a4SJacob Faibussowitsch {
96749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
968d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5;
969d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
970d9ca1df4SBarry Smith   const MatScalar   *v;
971d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
972d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
97398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
97449b5e25fSSatish Balay 
97549b5e25fSSatish Balay   PetscFunctionBegin;
9769566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
977c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9789566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9799566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
98049b5e25fSSatish Balay 
98149b5e25fSSatish Balay   v  = a->a;
98249b5e25fSSatish Balay   xb = x;
98349b5e25fSSatish Balay 
98449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
98549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9869371c9d4SSatish Balay     x1   = xb[0];
9879371c9d4SSatish Balay     x2   = xb[1];
9889371c9d4SSatish Balay     x3   = xb[2];
9899371c9d4SSatish Balay     x4   = xb[3];
9909371c9d4SSatish Balay     x5   = xb[4];
99149b5e25fSSatish Balay     ib   = aj + *ai;
992831a3094SHong Zhang     jmin = 0;
99398c9bda7SSatish Balay     nonzerorow += (n > 0);
99459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
99549b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
99649b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
99749b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
99849b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
99949b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
10009371c9d4SSatish Balay       v += 25;
10019371c9d4SSatish Balay       jmin++;
10027fbae186SHong Zhang     }
1003444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1004444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1005831a3094SHong Zhang     for (j = jmin; j < n; j++) {
100649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
100749b5e25fSSatish Balay       cval = ib[j] * 5;
100849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
100949b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
101049b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
101149b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
101249b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
101349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
101449b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
101549b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
101649b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
101749b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
101849b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
101949b5e25fSSatish Balay       v += 25;
102049b5e25fSSatish Balay     }
10219371c9d4SSatish Balay     xb += 5;
10229371c9d4SSatish Balay     ai++;
102349b5e25fSSatish Balay   }
102449b5e25fSSatish Balay 
10259566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10269566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
102749b5e25fSSatish Balay 
10289566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
102949b5e25fSSatish Balay   PetscFunctionReturn(0);
103049b5e25fSSatish Balay }
1031c2916339SPierre Jolivet 
1032*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1033*d71ae5a4SJacob Faibussowitsch {
103449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1035d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1036d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1037d9ca1df4SBarry Smith   const MatScalar   *v;
1038d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1039d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
104098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
104149b5e25fSSatish Balay 
104249b5e25fSSatish Balay   PetscFunctionBegin;
10439566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1044c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
10459566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10469566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
104749b5e25fSSatish Balay 
104849b5e25fSSatish Balay   v  = a->a;
104949b5e25fSSatish Balay   xb = x;
105049b5e25fSSatish Balay 
105149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
105249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10539371c9d4SSatish Balay     x1   = xb[0];
10549371c9d4SSatish Balay     x2   = xb[1];
10559371c9d4SSatish Balay     x3   = xb[2];
10569371c9d4SSatish Balay     x4   = xb[3];
10579371c9d4SSatish Balay     x5   = xb[4];
10589371c9d4SSatish Balay     x6   = xb[5];
105949b5e25fSSatish Balay     ib   = aj + *ai;
1060831a3094SHong Zhang     jmin = 0;
106198c9bda7SSatish Balay     nonzerorow += (n > 0);
106259689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
106349b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
106449b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
106549b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
106649b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
106749b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
106849b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
10699371c9d4SSatish Balay       v += 36;
10709371c9d4SSatish Balay       jmin++;
10717fbae186SHong Zhang     }
1072444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1073444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1074831a3094SHong Zhang     for (j = jmin; j < n; j++) {
107549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
107649b5e25fSSatish Balay       cval = ib[j] * 6;
107749b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
107849b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
107949b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
108049b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
108149b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
108249b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
108349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
108449b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
108549b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
108649b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
108749b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
108849b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
108949b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
109049b5e25fSSatish Balay       v += 36;
109149b5e25fSSatish Balay     }
10929371c9d4SSatish Balay     xb += 6;
10939371c9d4SSatish Balay     ai++;
109449b5e25fSSatish Balay   }
109549b5e25fSSatish Balay 
10969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
109849b5e25fSSatish Balay 
10999566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
110049b5e25fSSatish Balay   PetscFunctionReturn(0);
110149b5e25fSSatish Balay }
110249b5e25fSSatish Balay 
1103*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1104*d71ae5a4SJacob Faibussowitsch {
110549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1106d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1107d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1108d9ca1df4SBarry Smith   const MatScalar   *v;
1109d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1110d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
111198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
111249b5e25fSSatish Balay 
111349b5e25fSSatish Balay   PetscFunctionBegin;
11149566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1115c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
11169566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11179566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
111849b5e25fSSatish Balay 
111949b5e25fSSatish Balay   v  = a->a;
112049b5e25fSSatish Balay   xb = x;
112149b5e25fSSatish Balay 
112249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
112349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
11249371c9d4SSatish Balay     x1   = xb[0];
11259371c9d4SSatish Balay     x2   = xb[1];
11269371c9d4SSatish Balay     x3   = xb[2];
11279371c9d4SSatish Balay     x4   = xb[3];
11289371c9d4SSatish Balay     x5   = xb[4];
11299371c9d4SSatish Balay     x6   = xb[5];
11309371c9d4SSatish Balay     x7   = xb[6];
113149b5e25fSSatish Balay     ib   = aj + *ai;
1132831a3094SHong Zhang     jmin = 0;
113398c9bda7SSatish Balay     nonzerorow += (n > 0);
113459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
113549b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
113649b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
113749b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
113849b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
113949b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
114049b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
114149b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
11429371c9d4SSatish Balay       v += 49;
11439371c9d4SSatish Balay       jmin++;
11447fbae186SHong Zhang     }
1145444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1146444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1147831a3094SHong Zhang     for (j = jmin; j < n; j++) {
114849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
114949b5e25fSSatish Balay       cval = ib[j] * 7;
115049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
115149b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
115249b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
115349b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
115449b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
115549b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
115649b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
115749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
115849b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
115949b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
116049b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
116149b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
116249b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
116349b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
116449b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
116549b5e25fSSatish Balay       v += 49;
116649b5e25fSSatish Balay     }
11679371c9d4SSatish Balay     xb += 7;
11689371c9d4SSatish Balay     ai++;
116949b5e25fSSatish Balay   }
117049b5e25fSSatish Balay 
11719566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
117349b5e25fSSatish Balay 
11749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
117549b5e25fSSatish Balay   PetscFunctionReturn(0);
117649b5e25fSSatish Balay }
117749b5e25fSSatish Balay 
1178*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1179*d71ae5a4SJacob Faibussowitsch {
118049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1181f4259b30SLisandro Dalcin   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1182d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
1183d9ca1df4SBarry Smith   const MatScalar   *v;
1184d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1185d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
118698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
118749b5e25fSSatish Balay 
118849b5e25fSSatish Balay   PetscFunctionBegin;
11899566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1190c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
11919371c9d4SSatish Balay   PetscCall(VecGetArrayRead(xx, &x));
11929371c9d4SSatish Balay   x_ptr = x;
11939371c9d4SSatish Balay   PetscCall(VecGetArray(zz, &z));
11949371c9d4SSatish Balay   z_ptr = z;
119549b5e25fSSatish Balay 
119649b5e25fSSatish Balay   aj = a->j;
119749b5e25fSSatish Balay   v  = a->a;
119849b5e25fSSatish Balay   ii = a->i;
119949b5e25fSSatish Balay 
120048a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
120149b5e25fSSatish Balay   work = a->mult_work;
120249b5e25fSSatish Balay 
120349b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
12049371c9d4SSatish Balay     n     = ii[1] - ii[0];
12059371c9d4SSatish Balay     ncols = n * bs;
12069371c9d4SSatish Balay     workt = work;
12079371c9d4SSatish Balay     idx   = aj + ii[0];
120898c9bda7SSatish Balay     nonzerorow += (n > 0);
120949b5e25fSSatish Balay 
121049b5e25fSSatish Balay     /* upper triangular part */
121149b5e25fSSatish Balay     for (j = 0; j < n; j++) {
121249b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
121349b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
121449b5e25fSSatish Balay       workt += bs;
121549b5e25fSSatish Balay     }
121649b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
121796b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
121849b5e25fSSatish Balay 
121949b5e25fSSatish Balay     /* strict lower triangular part */
1220831a3094SHong Zhang     idx = aj + ii[0];
122159689ffbSStefano Zampini     if (n && *idx == i) {
12229371c9d4SSatish Balay       ncols -= bs;
12239371c9d4SSatish Balay       v += bs2;
12249371c9d4SSatish Balay       idx++;
12259371c9d4SSatish Balay       n--;
1226831a3094SHong Zhang     }
122749b5e25fSSatish Balay     if (ncols > 0) {
122849b5e25fSSatish Balay       workt = work;
12299566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
123096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1231831a3094SHong Zhang       for (j = 0; j < n; j++) {
1232831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
123349b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
123449b5e25fSSatish Balay         workt += bs;
123549b5e25fSSatish Balay       }
123649b5e25fSSatish Balay     }
123749b5e25fSSatish Balay 
12389371c9d4SSatish Balay     x += bs;
12399371c9d4SSatish Balay     v += n * bs2;
12409371c9d4SSatish Balay     z += bs;
12419371c9d4SSatish Balay     ii++;
124249b5e25fSSatish Balay   }
124349b5e25fSSatish Balay 
12449566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12459566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
124649b5e25fSSatish Balay 
12479566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
124849b5e25fSSatish Balay   PetscFunctionReturn(0);
124949b5e25fSSatish Balay }
125049b5e25fSSatish Balay 
1251*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1252*d71ae5a4SJacob Faibussowitsch {
125349b5e25fSSatish Balay   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1254f4df32b1SMatthew Knepley   PetscScalar   oalpha = alpha;
1255c5df96a5SBarry Smith   PetscBLASInt  one    = 1, totalnz;
125649b5e25fSSatish Balay 
125749b5e25fSSatish Balay   PetscFunctionBegin;
12589566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1259792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
12609566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
126149b5e25fSSatish Balay   PetscFunctionReturn(0);
126249b5e25fSSatish Balay }
126349b5e25fSSatish Balay 
1264*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1265*d71ae5a4SJacob Faibussowitsch {
126649b5e25fSSatish Balay   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1267d9ca1df4SBarry Smith   const MatScalar *v        = a->a;
126849b5e25fSSatish Balay   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1269d9ca1df4SBarry Smith   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1270d9ca1df4SBarry Smith   const PetscInt  *aj = a->j, *col;
127149b5e25fSSatish Balay 
127249b5e25fSSatish Balay   PetscFunctionBegin;
1273c40ae873SPierre Jolivet   if (!a->nz) {
1274c40ae873SPierre Jolivet     *norm = 0.0;
1275c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1276c40ae873SPierre Jolivet   }
127749b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
127849b5e25fSSatish Balay     for (k = 0; k < mbs; k++) {
127959689ffbSStefano Zampini       jmin = a->i[k];
128059689ffbSStefano Zampini       jmax = a->i[k + 1];
1281831a3094SHong Zhang       col  = aj + jmin;
128259689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
128349b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12849371c9d4SSatish Balay           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
12859371c9d4SSatish Balay           v++;
128649b5e25fSSatish Balay         }
1287831a3094SHong Zhang         jmin++;
1288831a3094SHong Zhang       }
1289831a3094SHong Zhang       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
129049b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12919371c9d4SSatish Balay           sum_off += PetscRealPart(PetscConj(*v) * (*v));
12929371c9d4SSatish Balay           v++;
129349b5e25fSSatish Balay         }
129449b5e25fSSatish Balay       }
129549b5e25fSSatish Balay     }
12968f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
12979566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
12980b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
12999566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
13000b8dc8d2SHong Zhang     for (i = 0; i < mbs; i++) jl[i] = mbs;
13010b8dc8d2SHong Zhang     il[0] = 0;
130249b5e25fSSatish Balay 
130349b5e25fSSatish Balay     *norm = 0.0;
130449b5e25fSSatish Balay     for (k = 0; k < mbs; k++) { /* k_th block row */
130549b5e25fSSatish Balay       for (j = 0; j < bs; j++) sum[j] = 0.0;
130649b5e25fSSatish Balay       /*-- col sum --*/
130749b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1308831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1309831a3094SHong Zhang                   at step k */
131049b5e25fSSatish Balay       while (i < mbs) {
131149b5e25fSSatish Balay         nexti = jl[i]; /* next block row to be added */
131249b5e25fSSatish Balay         ik    = il[i]; /* block index of A(i,k) in the array a */
131349b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
131449b5e25fSSatish Balay           v = a->a + ik * bs2 + j * bs;
131549b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13169371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13179371c9d4SSatish Balay             v++;
131849b5e25fSSatish Balay           }
131949b5e25fSSatish Balay         }
132049b5e25fSSatish Balay         /* update il, jl */
1321831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1322831a3094SHong Zhang         jmax = a->i[i + 1];
132349b5e25fSSatish Balay         if (jmin < jmax) {
132449b5e25fSSatish Balay           il[i] = jmin;
132549b5e25fSSatish Balay           j     = a->j[jmin];
13269371c9d4SSatish Balay           jl[i] = jl[j];
13279371c9d4SSatish Balay           jl[j] = i;
132849b5e25fSSatish Balay         }
132949b5e25fSSatish Balay         i = nexti;
133049b5e25fSSatish Balay       }
133149b5e25fSSatish Balay       /*-- row sum --*/
133259689ffbSStefano Zampini       jmin = a->i[k];
133359689ffbSStefano Zampini       jmax = a->i[k + 1];
133449b5e25fSSatish Balay       for (i = jmin; i < jmax; i++) {
133549b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
133649b5e25fSSatish Balay           v = a->a + i * bs2 + j;
133749b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13389371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13399371c9d4SSatish Balay             v += bs;
134049b5e25fSSatish Balay           }
134149b5e25fSSatish Balay         }
134249b5e25fSSatish Balay       }
134349b5e25fSSatish Balay       /* add k_th block row to il, jl */
1344831a3094SHong Zhang       col = aj + jmin;
134559689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
134649b5e25fSSatish Balay       if (jmin < jmax) {
134749b5e25fSSatish Balay         il[k] = jmin;
13489371c9d4SSatish Balay         j     = a->j[jmin];
13499371c9d4SSatish Balay         jl[k] = jl[j];
13509371c9d4SSatish Balay         jl[j] = k;
135149b5e25fSSatish Balay       }
135249b5e25fSSatish Balay       for (j = 0; j < bs; j++) {
135349b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
135449b5e25fSSatish Balay       }
135549b5e25fSSatish Balay     }
13569566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum, il, jl));
13579566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1358f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
135949b5e25fSSatish Balay   PetscFunctionReturn(0);
136049b5e25fSSatish Balay }
136149b5e25fSSatish Balay 
1362*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1363*d71ae5a4SJacob Faibussowitsch {
136449b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
136549b5e25fSSatish Balay 
136649b5e25fSSatish Balay   PetscFunctionBegin;
136749b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1368d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1369ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1370ef511fbeSHong Zhang     PetscFunctionReturn(0);
137149b5e25fSSatish Balay   }
137249b5e25fSSatish Balay 
137349b5e25fSSatish Balay   /* if the a->i are the same */
13749566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
137526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
137649b5e25fSSatish Balay 
137749b5e25fSSatish Balay   /* if a->j are the same */
13789566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
137926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
138026fbe8dcSKarl Rupp 
138149b5e25fSSatish Balay   /* if a->a are the same */
13829566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1383935af2e7SHong Zhang   PetscFunctionReturn(0);
138449b5e25fSSatish Balay }
138549b5e25fSSatish Balay 
1386*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1387*d71ae5a4SJacob Faibussowitsch {
138849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1389d9ca1df4SBarry Smith   PetscInt         i, j, k, row, bs, ambs, bs2;
1390d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
139187828ca2SBarry Smith   PetscScalar     *x, zero = 0.0;
1392d9ca1df4SBarry Smith   const MatScalar *aa, *aa_j;
139349b5e25fSSatish Balay 
139449b5e25fSSatish Balay   PetscFunctionBegin;
1395d0f46423SBarry Smith   bs = A->rmap->bs;
1396aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
139782799104SHong Zhang 
139849b5e25fSSatish Balay   aa   = a->a;
13998a0c6efdSHong Zhang   ambs = a->mbs;
14008a0c6efdSHong Zhang 
14018a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
14028a0c6efdSHong Zhang     PetscInt *diag = a->diag;
14038a0c6efdSHong Zhang     aa             = a->a;
14048a0c6efdSHong Zhang     ambs           = a->mbs;
14059566063dSJacob Faibussowitsch     PetscCall(VecGetArray(v, &x));
14068a0c6efdSHong Zhang     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
14079566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(v, &x));
14088a0c6efdSHong Zhang     PetscFunctionReturn(0);
14098a0c6efdSHong Zhang   }
14108a0c6efdSHong Zhang 
141149b5e25fSSatish Balay   ai  = a->i;
141249b5e25fSSatish Balay   aj  = a->j;
141349b5e25fSSatish Balay   bs2 = a->bs2;
14149566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
1415c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
14169566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
141749b5e25fSSatish Balay   for (i = 0; i < ambs; i++) {
141849b5e25fSSatish Balay     j = ai[i];
141949b5e25fSSatish Balay     if (aj[j] == i) { /* if this is a diagonal element */
142049b5e25fSSatish Balay       row  = i * bs;
142149b5e25fSSatish Balay       aa_j = aa + j * bs2;
142249b5e25fSSatish Balay       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
142349b5e25fSSatish Balay     }
142449b5e25fSSatish Balay   }
14259566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
142649b5e25fSSatish Balay   PetscFunctionReturn(0);
142749b5e25fSSatish Balay }
142849b5e25fSSatish Balay 
1429*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1430*d71ae5a4SJacob Faibussowitsch {
143149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1432d9ca1df4SBarry Smith   PetscScalar        x;
1433d9ca1df4SBarry Smith   const PetscScalar *l, *li, *ri;
143449b5e25fSSatish Balay   MatScalar         *aa, *v;
1435fff8e43fSBarry Smith   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1436fff8e43fSBarry Smith   const PetscInt    *ai, *aj;
1437ace3abfcSBarry Smith   PetscBool          flg;
143849b5e25fSSatish Balay 
143949b5e25fSSatish Balay   PetscFunctionBegin;
1440b3bf805bSHong Zhang   if (ll != rr) {
14419566063dSJacob Faibussowitsch     PetscCall(VecEqual(ll, rr, &flg));
144228b400f6SJacob Faibussowitsch     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1443b3bf805bSHong Zhang   }
1444b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
144549b5e25fSSatish Balay   ai  = a->i;
144649b5e25fSSatish Balay   aj  = a->j;
144749b5e25fSSatish Balay   aa  = a->a;
1448d0f46423SBarry Smith   m   = A->rmap->N;
1449d0f46423SBarry Smith   bs  = A->rmap->bs;
145049b5e25fSSatish Balay   mbs = a->mbs;
145149b5e25fSSatish Balay   bs2 = a->bs2;
145249b5e25fSSatish Balay 
14539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll, &l));
14549566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll, &lm));
145508401ef6SPierre Jolivet   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
145649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) { /* for each block row */
145749b5e25fSSatish Balay     M  = ai[i + 1] - ai[i];
145849b5e25fSSatish Balay     li = l + i * bs;
145949b5e25fSSatish Balay     v  = aa + bs2 * ai[i];
146049b5e25fSSatish Balay     for (j = 0; j < M; j++) { /* for each block */
146149b5e25fSSatish Balay       ri = l + bs * aj[ai[i] + j];
14625e90f9d9SHong Zhang       for (k = 0; k < bs; k++) {
146349b5e25fSSatish Balay         x = ri[k];
146449b5e25fSSatish Balay         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
146549b5e25fSSatish Balay       }
146649b5e25fSSatish Balay     }
146749b5e25fSSatish Balay   }
14689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll, &l));
14699566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
147049b5e25fSSatish Balay   PetscFunctionReturn(0);
147149b5e25fSSatish Balay }
147249b5e25fSSatish Balay 
1473*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1474*d71ae5a4SJacob Faibussowitsch {
147549b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
147649b5e25fSSatish Balay 
147749b5e25fSSatish Balay   PetscFunctionBegin;
147849b5e25fSSatish Balay   info->block_size   = a->bs2;
1479ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
14806c6c5352SBarry Smith   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
14813966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
148249b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14838e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14844dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
1485d5f3da31SBarry Smith   if (A->factortype) {
148649b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
148749b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
148849b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
148949b5e25fSSatish Balay   } else {
149049b5e25fSSatish Balay     info->fill_ratio_given  = 0;
149149b5e25fSSatish Balay     info->fill_ratio_needed = 0;
149249b5e25fSSatish Balay     info->factor_mallocs    = 0;
149349b5e25fSSatish Balay   }
149449b5e25fSSatish Balay   PetscFunctionReturn(0);
149549b5e25fSSatish Balay }
149649b5e25fSSatish Balay 
1497*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1498*d71ae5a4SJacob Faibussowitsch {
149949b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
150049b5e25fSSatish Balay 
150149b5e25fSSatish Balay   PetscFunctionBegin;
15029566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
150349b5e25fSSatish Balay   PetscFunctionReturn(0);
150449b5e25fSSatish Balay }
1505dc354874SHong Zhang 
1506*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1507*d71ae5a4SJacob Faibussowitsch {
1508dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1509d9ca1df4SBarry Smith   PetscInt         i, j, n, row, col, bs, mbs;
1510d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
1511c3fca9a7SHong Zhang   PetscReal        atmp;
1512d9ca1df4SBarry Smith   const MatScalar *aa;
1513985db425SBarry Smith   PetscScalar     *x;
151413f74950SBarry Smith   PetscInt         ncols, brow, bcol, krow, kcol;
1515dc354874SHong Zhang 
1516dc354874SHong Zhang   PetscFunctionBegin;
151728b400f6SJacob Faibussowitsch   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
151828b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1519d0f46423SBarry Smith   bs  = A->rmap->bs;
1520dc354874SHong Zhang   aa  = a->a;
1521dc354874SHong Zhang   ai  = a->i;
1522dc354874SHong Zhang   aj  = a->j;
152344117c81SHong Zhang   mbs = a->mbs;
1524dc354874SHong Zhang 
15259566063dSJacob Faibussowitsch   PetscCall(VecSet(v, 0.0));
15269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
15279566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
152808401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
152944117c81SHong Zhang   for (i = 0; i < mbs; i++) {
15309371c9d4SSatish Balay     ncols = ai[1] - ai[0];
15319371c9d4SSatish Balay     ai++;
1532d0f6400bSHong Zhang     brow = bs * i;
153344117c81SHong Zhang     for (j = 0; j < ncols; j++) {
1534d0f6400bSHong Zhang       bcol = bs * (*aj);
153544117c81SHong Zhang       for (kcol = 0; kcol < bs; kcol++) {
1536d0f6400bSHong Zhang         col = bcol + kcol; /* col index */
153744117c81SHong Zhang         for (krow = 0; krow < bs; krow++) {
15389371c9d4SSatish Balay           atmp = PetscAbsScalar(*aa);
15399371c9d4SSatish Balay           aa++;
1540d0f6400bSHong Zhang           row = brow + krow; /* row index */
1541c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1542c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
154344117c81SHong Zhang         }
154444117c81SHong Zhang       }
1545d0f6400bSHong Zhang       aj++;
1546dc354874SHong Zhang     }
1547dc354874SHong Zhang   }
15489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
1549dc354874SHong Zhang   PetscFunctionReturn(0);
1550dc354874SHong Zhang }
1551c2916339SPierre Jolivet 
1552*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1553*d71ae5a4SJacob Faibussowitsch {
1554c2916339SPierre Jolivet   PetscFunctionBegin;
15559566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15564222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1557c2916339SPierre Jolivet   PetscFunctionReturn(0);
1558c2916339SPierre Jolivet }
1559c2916339SPierre Jolivet 
1560*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1561*d71ae5a4SJacob Faibussowitsch {
1562c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1563c2916339SPierre Jolivet   PetscScalar       *z = c;
1564c2916339SPierre Jolivet   const PetscScalar *xb;
1565c2916339SPierre Jolivet   PetscScalar        x1;
1566c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1567c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
15683c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
1569b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
15703c2e41e1SStefano Zampini #else
15713c2e41e1SStefano Zampini   const int aconj = 0;
15723c2e41e1SStefano Zampini #endif
1573c2916339SPierre Jolivet 
1574c2916339SPierre Jolivet   PetscFunctionBegin;
1575c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15769371c9d4SSatish Balay     n = ii[1] - ii[0];
15779371c9d4SSatish Balay     ii++;
1578c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1579c2916339SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1580c2916339SPierre Jolivet     jj = idx;
1581c2916339SPierre Jolivet     vv = v;
1582c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1583c2916339SPierre Jolivet       idx = jj;
1584c2916339SPierre Jolivet       v   = vv;
1585c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
15869371c9d4SSatish Balay         xb = b + (*idx);
15879371c9d4SSatish Balay         x1 = xb[0 + k * bm];
1588c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1;
15893c2e41e1SStefano Zampini         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1590c2916339SPierre Jolivet         v += 1;
1591c2916339SPierre Jolivet         ++idx;
1592c2916339SPierre Jolivet       }
1593c2916339SPierre Jolivet     }
1594c2916339SPierre Jolivet     z += 1;
1595c2916339SPierre Jolivet   }
1596c2916339SPierre Jolivet   PetscFunctionReturn(0);
1597c2916339SPierre Jolivet }
1598c2916339SPierre Jolivet 
1599*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1600*d71ae5a4SJacob Faibussowitsch {
1601c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1602c2916339SPierre Jolivet   PetscScalar       *z = c;
1603c2916339SPierre Jolivet   const PetscScalar *xb;
1604c2916339SPierre Jolivet   PetscScalar        x1, x2;
1605c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1606c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1607c2916339SPierre Jolivet 
1608c2916339SPierre Jolivet   PetscFunctionBegin;
1609c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16109371c9d4SSatish Balay     n = ii[1] - ii[0];
16119371c9d4SSatish Balay     ii++;
1612c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1613c2916339SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1614c2916339SPierre Jolivet     jj = idx;
1615c2916339SPierre Jolivet     vv = v;
1616c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1617c2916339SPierre Jolivet       idx = jj;
1618c2916339SPierre Jolivet       v   = vv;
1619c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16209371c9d4SSatish Balay         xb = b + 2 * (*idx);
16219371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16229371c9d4SSatish Balay         x2 = xb[1 + k * bm];
1623c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1624c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1625c2916339SPierre Jolivet         if (*idx != i) {
1626c2916339SPierre Jolivet           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1627c2916339SPierre Jolivet           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1628c2916339SPierre Jolivet         }
1629c2916339SPierre Jolivet         v += 4;
1630c2916339SPierre Jolivet         ++idx;
1631c2916339SPierre Jolivet       }
1632c2916339SPierre Jolivet     }
1633c2916339SPierre Jolivet     z += 2;
1634c2916339SPierre Jolivet   }
1635c2916339SPierre Jolivet   PetscFunctionReturn(0);
1636c2916339SPierre Jolivet }
1637c2916339SPierre Jolivet 
1638*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1639*d71ae5a4SJacob Faibussowitsch {
1640c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1641c2916339SPierre Jolivet   PetscScalar       *z = c;
1642c2916339SPierre Jolivet   const PetscScalar *xb;
1643c2916339SPierre Jolivet   PetscScalar        x1, x2, x3;
1644c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1645c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1646c2916339SPierre Jolivet 
1647c2916339SPierre Jolivet   PetscFunctionBegin;
1648c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16499371c9d4SSatish Balay     n = ii[1] - ii[0];
16509371c9d4SSatish Balay     ii++;
1651c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1652c2916339SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1653c2916339SPierre Jolivet     jj = idx;
1654c2916339SPierre Jolivet     vv = v;
1655c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1656c2916339SPierre Jolivet       idx = jj;
1657c2916339SPierre Jolivet       v   = vv;
1658c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16599371c9d4SSatish Balay         xb = b + 3 * (*idx);
16609371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16619371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16629371c9d4SSatish Balay         x3 = xb[2 + k * bm];
1663c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1664c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1665c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1666c2916339SPierre Jolivet         if (*idx != i) {
1667c2916339SPierre Jolivet           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1668c2916339SPierre Jolivet           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1669c2916339SPierre Jolivet           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1670c2916339SPierre Jolivet         }
1671c2916339SPierre Jolivet         v += 9;
1672c2916339SPierre Jolivet         ++idx;
1673c2916339SPierre Jolivet       }
1674c2916339SPierre Jolivet     }
1675c2916339SPierre Jolivet     z += 3;
1676c2916339SPierre Jolivet   }
1677c2916339SPierre Jolivet   PetscFunctionReturn(0);
1678c2916339SPierre Jolivet }
1679c2916339SPierre Jolivet 
1680*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1681*d71ae5a4SJacob Faibussowitsch {
1682c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1683c2916339SPierre Jolivet   PetscScalar       *z = c;
1684c2916339SPierre Jolivet   const PetscScalar *xb;
1685c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
1686c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1687c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1688c2916339SPierre Jolivet 
1689c2916339SPierre Jolivet   PetscFunctionBegin;
1690c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16919371c9d4SSatish Balay     n = ii[1] - ii[0];
16929371c9d4SSatish Balay     ii++;
1693c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1694c2916339SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1695c2916339SPierre Jolivet     jj = idx;
1696c2916339SPierre Jolivet     vv = v;
1697c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1698c2916339SPierre Jolivet       idx = jj;
1699c2916339SPierre Jolivet       v   = vv;
1700c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17019371c9d4SSatish Balay         xb = b + 4 * (*idx);
17029371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17039371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17049371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17059371c9d4SSatish Balay         x4 = xb[3 + k * bm];
1706c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1707c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1708c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1709c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1710c2916339SPierre Jolivet         if (*idx != i) {
1711c2916339SPierre Jolivet           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1712c2916339SPierre Jolivet           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1713c2916339SPierre Jolivet           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1714c2916339SPierre Jolivet           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1715c2916339SPierre Jolivet         }
1716c2916339SPierre Jolivet         v += 16;
1717c2916339SPierre Jolivet         ++idx;
1718c2916339SPierre Jolivet       }
1719c2916339SPierre Jolivet     }
1720c2916339SPierre Jolivet     z += 4;
1721c2916339SPierre Jolivet   }
1722c2916339SPierre Jolivet   PetscFunctionReturn(0);
1723c2916339SPierre Jolivet }
1724c2916339SPierre Jolivet 
1725*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1726*d71ae5a4SJacob Faibussowitsch {
1727c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1728c2916339SPierre Jolivet   PetscScalar       *z = c;
1729c2916339SPierre Jolivet   const PetscScalar *xb;
1730c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
1731c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1732c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1733c2916339SPierre Jolivet 
1734c2916339SPierre Jolivet   PetscFunctionBegin;
1735c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
17369371c9d4SSatish Balay     n = ii[1] - ii[0];
17379371c9d4SSatish Balay     ii++;
1738c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1739c2916339SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1740c2916339SPierre Jolivet     jj = idx;
1741c2916339SPierre Jolivet     vv = v;
1742c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1743c2916339SPierre Jolivet       idx = jj;
1744c2916339SPierre Jolivet       v   = vv;
1745c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17469371c9d4SSatish Balay         xb = b + 5 * (*idx);
17479371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17489371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17499371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17509371c9d4SSatish Balay         x4 = xb[3 + k * bm];
17519371c9d4SSatish Balay         x5 = xb[4 + k * cm];
1752c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1753c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1754c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1755c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1756c2916339SPierre Jolivet         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1757c2916339SPierre Jolivet         if (*idx != i) {
1758c2916339SPierre Jolivet           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1759c2916339SPierre Jolivet           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1760c2916339SPierre Jolivet           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1761c2916339SPierre Jolivet           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1762c2916339SPierre Jolivet           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1763c2916339SPierre Jolivet         }
1764c2916339SPierre Jolivet         v += 25;
1765c2916339SPierre Jolivet         ++idx;
1766c2916339SPierre Jolivet       }
1767c2916339SPierre Jolivet     }
1768c2916339SPierre Jolivet     z += 5;
1769c2916339SPierre Jolivet   }
1770c2916339SPierre Jolivet   PetscFunctionReturn(0);
1771c2916339SPierre Jolivet }
1772c2916339SPierre Jolivet 
1773*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1774*d71ae5a4SJacob Faibussowitsch {
1775c2916339SPierre Jolivet   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1776c2916339SPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1777281439baSStefano Zampini   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1778c2916339SPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1779c2916339SPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1780c2916339SPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
1781f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
1782c2916339SPierre Jolivet   PetscScalar     *c, *b;
1783c2916339SPierre Jolivet   const MatScalar *v;
1784c2916339SPierre Jolivet   const PetscInt  *idx, *ii;
1785c2916339SPierre Jolivet   PetscScalar      _DOne = 1.0;
1786c2916339SPierre Jolivet 
1787c2916339SPierre Jolivet   PetscFunctionBegin;
1788c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
178908401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
179008401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
179108401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1792c2916339SPierre Jolivet   b = bd->v;
17939566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
17949566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
1795c2916339SPierre Jolivet   switch (bs) {
1796*d71ae5a4SJacob Faibussowitsch   case 1:
1797*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1798*d71ae5a4SJacob Faibussowitsch     break;
1799*d71ae5a4SJacob Faibussowitsch   case 2:
1800*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1801*d71ae5a4SJacob Faibussowitsch     break;
1802*d71ae5a4SJacob Faibussowitsch   case 3:
1803*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1804*d71ae5a4SJacob Faibussowitsch     break;
1805*d71ae5a4SJacob Faibussowitsch   case 4:
1806*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1807*d71ae5a4SJacob Faibussowitsch     break;
1808*d71ae5a4SJacob Faibussowitsch   case 5:
1809*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1810*d71ae5a4SJacob Faibussowitsch     break;
1811c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
18129566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
18139566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
18149566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
18159566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
1816c2916339SPierre Jolivet     idx = a->j;
1817c2916339SPierre Jolivet     v   = a->a;
1818c2916339SPierre Jolivet     mbs = a->mbs;
1819c2916339SPierre Jolivet     ii  = a->i;
1820c2916339SPierre Jolivet     z   = c;
1821c2916339SPierre Jolivet     for (i = 0; i < mbs; i++) {
18229371c9d4SSatish Balay       n = ii[1] - ii[0];
18239371c9d4SSatish Balay       ii++;
1824c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
1825792fecdfSBarry Smith         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1826792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1827c2916339SPierre Jolivet         v += bs2;
1828c2916339SPierre Jolivet       }
1829c2916339SPierre Jolivet       z += bs;
1830c2916339SPierre Jolivet     }
1831c2916339SPierre Jolivet   }
18329566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
18339566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1834c2916339SPierre Jolivet   PetscFunctionReturn(0);
1835c2916339SPierre Jolivet }
1836