xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 314f503f21e0a84e8b07b06bfeed9e7ffa2a6bbf)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10d71ae5a4SJacob Faibussowitsch {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
127bede89fSBarry Smith   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT         table_out, table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
1708401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18d94109b8SHong Zhang   mbs = a->mbs;
19d94109b8SHong Zhang   ai  = a->i;
20d94109b8SHong Zhang   aj  = a->j;
21d0f46423SBarry Smith   bs  = A->rmap->bs;
229566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_out));
239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs + 1, &nidx));
249566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_in));
25d94109b8SHong Zhang 
26d94109b8SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
27d94109b8SHong Zhang     isz = 0;
289566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs, table_out));
29d94109b8SHong Zhang 
30d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
319566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
329566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
33d94109b8SHong Zhang 
34db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35dbe03f88SHong Zhang     bcol_max = 0;
36d94109b8SHong Zhang     for (j = 0; j < n; ++j) {
37d94109b8SHong Zhang       brow = idx[j] / bs; /* convert the indices into block indices */
3808401ef6SPierre Jolivet       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out, brow)) {
40dbe03f88SHong Zhang         nidx[isz++] = brow;
41dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
42dbe03f88SHong Zhang       }
43d94109b8SHong Zhang     }
449566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
459566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
46d94109b8SHong Zhang 
47d94109b8SHong Zhang     k = 0;
48d94109b8SHong Zhang     for (j = 0; j < ov; j++) { /* for each overlap */
49db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
509566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs, table_in));
519566063dSJacob Faibussowitsch       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52d94109b8SHong Zhang 
53d94109b8SHong Zhang       n = isz; /* length of the updated is[i] */
54d94109b8SHong Zhang       for (brow = 0; brow < mbs; brow++) {
559371c9d4SSatish Balay         start = ai[brow];
569371c9d4SSatish Balay         end   = ai[brow + 1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l < end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out, bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67da81f932SPierre Jolivet         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l < end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in, bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
797bede89fSBarry Smith     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
807bede89fSBarry Smith   } /* for each is */
819566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
829566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
839566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8549b5e25fSSatish Balay }
8649b5e25fSSatish Balay 
87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
887cf5f706SPierre Jolivet         Zero some ops' to avoid invalid use */
89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90d71ae5a4SJacob Faibussowitsch {
9149b5e25fSSatish Balay   PetscFunctionBegin;
929566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
94f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
95f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
96f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
97f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
98f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
99f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
100f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
101f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
1023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10349b5e25fSSatish Balay }
10449b5e25fSSatish Balay 
1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107d71ae5a4SJacob Faibussowitsch {
108e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109e50a8114SHong Zhang   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110e50a8114SHong Zhang   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111e50a8114SHong Zhang   const PetscInt *irow, *icol;
112e50a8114SHong Zhang   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113e50a8114SHong Zhang   PetscInt       *aj = a->j, *ai = a->i;
114e50a8114SHong Zhang   MatScalar      *mat_a;
115e50a8114SHong Zhang   Mat             C;
116e50a8114SHong Zhang   PetscBool       flag;
117e50a8114SHong Zhang 
118e50a8114SHong Zhang   PetscFunctionBegin;
119e50a8114SHong Zhang 
1209566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1219566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1229566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1239566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
124e50a8114SHong Zhang 
1259566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126e50a8114SHong Zhang   ssmap = smap;
1279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
128e50a8114SHong Zhang   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129e50a8114SHong Zhang   /* determine lens of each row */
130e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
131e50a8114SHong Zhang     kstart  = ai[irow[i]];
132e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
133e50a8114SHong Zhang     lens[i] = 0;
134e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
135e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
136e50a8114SHong Zhang     }
137e50a8114SHong Zhang   }
138e50a8114SHong Zhang   /* Create and fill new matrix */
139e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
140e50a8114SHong Zhang     c = (Mat_SeqSBAIJ *)((*B)->data);
141e50a8114SHong Zhang 
142aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1439566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
14428b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1459566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146e50a8114SHong Zhang     C = *B;
147e50a8114SHong Zhang   } else {
1489566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1499566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1509566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1519566063dSJacob Faibussowitsch     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152e50a8114SHong Zhang   }
153e50a8114SHong Zhang   c = (Mat_SeqSBAIJ *)(C->data);
154e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
155e50a8114SHong Zhang     row      = irow[i];
156e50a8114SHong Zhang     kstart   = ai[row];
157e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
158e50a8114SHong Zhang     mat_i    = c->i[i];
159e50a8114SHong Zhang     mat_j    = c->j + mat_i;
160e50a8114SHong Zhang     mat_a    = c->a + mat_i * bs2;
161e50a8114SHong Zhang     mat_ilen = c->ilen + i;
162e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
163e50a8114SHong Zhang       if ((tcol = ssmap[a->j[k]])) {
164e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1659566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166e50a8114SHong Zhang         mat_a += bs2;
167e50a8114SHong Zhang         (*mat_ilen)++;
168e50a8114SHong Zhang       }
169e50a8114SHong Zhang     }
170e50a8114SHong Zhang   }
171e50a8114SHong Zhang   /* sort */
172e50a8114SHong Zhang   {
173e50a8114SHong Zhang     MatScalar *work;
174e50a8114SHong Zhang 
1759566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
176e50a8114SHong Zhang     for (i = 0; i < nrows; i++) {
177e50a8114SHong Zhang       PetscInt ilen;
178e50a8114SHong Zhang       mat_i = c->i[i];
179e50a8114SHong Zhang       mat_j = c->j + mat_i;
180e50a8114SHong Zhang       mat_a = c->a + mat_i * bs2;
181e50a8114SHong Zhang       ilen  = c->ilen[i];
1829566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183e50a8114SHong Zhang     }
1849566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
185e50a8114SHong Zhang   }
186e50a8114SHong Zhang 
187e50a8114SHong Zhang   /* Free work space */
1889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1899566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1909566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1919566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1929566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
193e50a8114SHong Zhang 
1949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
195e50a8114SHong Zhang   *B = C;
1963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
197e50a8114SHong Zhang }
198e50a8114SHong Zhang 
199d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200d71ae5a4SJacob Faibussowitsch {
201e50a8114SHong Zhang   IS is1, is2;
20249b5e25fSSatish Balay 
20349b5e25fSSatish Balay   PetscFunctionBegin;
204f9a48b90SPierre Jolivet   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
205f9a48b90SPierre Jolivet   if (isrow == iscol) {
206f9a48b90SPierre Jolivet     is2 = is1;
207f9a48b90SPierre Jolivet     PetscCall(PetscObjectReference((PetscObject)is2));
208f9a48b90SPierre Jolivet   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
2099566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
2109566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2119566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
212847daeccSHong Zhang 
2138f46ffcaSHong Zhang   if (isrow != iscol) {
2148f46ffcaSHong Zhang     PetscBool isequal;
2159566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow, iscol, &isequal));
21648a46eb9SPierre Jolivet     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
21749b5e25fSSatish Balay   }
2183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21949b5e25fSSatish Balay }
22049b5e25fSSatish Balay 
221d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
222d71ae5a4SJacob Faibussowitsch {
22313f74950SBarry Smith   PetscInt i;
22449b5e25fSSatish Balay 
22549b5e25fSSatish Balay   PetscFunctionBegin;
226*314f503fSPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
227e50a8114SHong Zhang 
22848a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
23049b5e25fSSatish Balay }
23149b5e25fSSatish Balay 
23249b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
233d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
234d71ae5a4SJacob Faibussowitsch {
23549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
236d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, zero = 0.0;
237d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
238d9ca1df4SBarry Smith   const MatScalar   *v;
239d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
240d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
24198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
24249b5e25fSSatish Balay 
24349b5e25fSSatish Balay   PetscFunctionBegin;
2449566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
2453ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
2469566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2479566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
24849b5e25fSSatish Balay 
24949b5e25fSSatish Balay   v  = a->a;
25049b5e25fSSatish Balay   xb = x;
25149b5e25fSSatish Balay 
25249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
25349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
2549371c9d4SSatish Balay     x1   = xb[0];
2559371c9d4SSatish Balay     x2   = xb[1];
25649b5e25fSSatish Balay     ib   = aj + *ai;
257831a3094SHong Zhang     jmin = 0;
25898c9bda7SSatish Balay     nonzerorow += (n > 0);
2597fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
26049b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
26149b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
2629371c9d4SSatish Balay       v += 4;
2639371c9d4SSatish Balay       jmin++;
2647fbae186SHong Zhang     }
265444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267831a3094SHong Zhang     for (j = jmin; j < n; j++) {
26849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
26949b5e25fSSatish Balay       cval = ib[j] * 2;
27049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
27149b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
27249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
27349b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
27449b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
27549b5e25fSSatish Balay       v += 4;
27649b5e25fSSatish Balay     }
2779371c9d4SSatish Balay     xb += 2;
2789371c9d4SSatish Balay     ai++;
27949b5e25fSSatish Balay   }
28049b5e25fSSatish Balay 
2819566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
2829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
2839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
2843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28549b5e25fSSatish Balay }
28649b5e25fSSatish Balay 
287d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
288d71ae5a4SJacob Faibussowitsch {
28949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, zero = 0.0;
291d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
292d9ca1df4SBarry Smith   const MatScalar   *v;
293d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
29598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
29649b5e25fSSatish Balay 
29749b5e25fSSatish Balay   PetscFunctionBegin;
2989566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
2993ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3009566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3019566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
30249b5e25fSSatish Balay 
30349b5e25fSSatish Balay   v  = a->a;
30449b5e25fSSatish Balay   xb = x;
30549b5e25fSSatish Balay 
30649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
30749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3089371c9d4SSatish Balay     x1   = xb[0];
3099371c9d4SSatish Balay     x2   = xb[1];
3109371c9d4SSatish Balay     x3   = xb[2];
31149b5e25fSSatish Balay     ib   = aj + *ai;
312831a3094SHong Zhang     jmin = 0;
31398c9bda7SSatish Balay     nonzerorow += (n > 0);
3147fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
31549b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
31649b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
31749b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3189371c9d4SSatish Balay       v += 9;
3199371c9d4SSatish Balay       jmin++;
3207fbae186SHong Zhang     }
321444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323831a3094SHong Zhang     for (j = jmin; j < n; j++) {
32449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32549b5e25fSSatish Balay       cval = ib[j] * 3;
32649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
32749b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
32849b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
32949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
33049b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
33149b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
33249b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
33349b5e25fSSatish Balay       v += 9;
33449b5e25fSSatish Balay     }
3359371c9d4SSatish Balay     xb += 3;
3369371c9d4SSatish Balay     ai++;
33749b5e25fSSatish Balay   }
33849b5e25fSSatish Balay 
3399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3419566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
3423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34349b5e25fSSatish Balay }
34449b5e25fSSatish Balay 
345d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
346d71ae5a4SJacob Faibussowitsch {
34749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
348d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
349d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
350d9ca1df4SBarry Smith   const MatScalar   *v;
351d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
352d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
35398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
35449b5e25fSSatish Balay 
35549b5e25fSSatish Balay   PetscFunctionBegin;
3569566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
3573ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3589566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3599566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
36049b5e25fSSatish Balay 
36149b5e25fSSatish Balay   v  = a->a;
36249b5e25fSSatish Balay   xb = x;
36349b5e25fSSatish Balay 
36449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
36549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3669371c9d4SSatish Balay     x1   = xb[0];
3679371c9d4SSatish Balay     x2   = xb[1];
3689371c9d4SSatish Balay     x3   = xb[2];
3699371c9d4SSatish Balay     x4   = xb[3];
37049b5e25fSSatish Balay     ib   = aj + *ai;
371831a3094SHong Zhang     jmin = 0;
37298c9bda7SSatish Balay     nonzerorow += (n > 0);
3737fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
37449b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
37549b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
37649b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
37749b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3789371c9d4SSatish Balay       v += 16;
3799371c9d4SSatish Balay       jmin++;
3807fbae186SHong Zhang     }
381444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
382444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383831a3094SHong Zhang     for (j = jmin; j < n; j++) {
38449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38549b5e25fSSatish Balay       cval = ib[j] * 4;
38649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
38749b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
38849b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
38949b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
39049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
39149b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
39249b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
39349b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
39449b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
39549b5e25fSSatish Balay       v += 16;
39649b5e25fSSatish Balay     }
3979371c9d4SSatish Balay     xb += 4;
3989371c9d4SSatish Balay     ai++;
39949b5e25fSSatish Balay   }
40049b5e25fSSatish Balay 
4019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
4043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
40549b5e25fSSatish Balay }
40649b5e25fSSatish Balay 
407d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
408d71ae5a4SJacob Faibussowitsch {
40949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
410d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
411d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
412d9ca1df4SBarry Smith   const MatScalar   *v;
413d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
414d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
41598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
41649b5e25fSSatish Balay 
41749b5e25fSSatish Balay   PetscFunctionBegin;
4189566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4193ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4219566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
42249b5e25fSSatish Balay 
42349b5e25fSSatish Balay   v  = a->a;
42449b5e25fSSatish Balay   xb = x;
42549b5e25fSSatish Balay 
42649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
42749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4289371c9d4SSatish Balay     x1   = xb[0];
4299371c9d4SSatish Balay     x2   = xb[1];
4309371c9d4SSatish Balay     x3   = xb[2];
4319371c9d4SSatish Balay     x4   = xb[3];
4329371c9d4SSatish Balay     x5   = xb[4];
43349b5e25fSSatish Balay     ib   = aj + *ai;
434831a3094SHong Zhang     jmin = 0;
43598c9bda7SSatish Balay     nonzerorow += (n > 0);
4367fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
43749b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
43849b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
43949b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
44049b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
44149b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
4429371c9d4SSatish Balay       v += 25;
4439371c9d4SSatish Balay       jmin++;
4447fbae186SHong Zhang     }
445444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
446444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
447831a3094SHong Zhang     for (j = jmin; j < n; j++) {
44849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44949b5e25fSSatish Balay       cval = ib[j] * 5;
45049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
45149b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
45249b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
45349b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
45449b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
45549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
45649b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
45749b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
45849b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
45949b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
46049b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
46149b5e25fSSatish Balay       v += 25;
46249b5e25fSSatish Balay     }
4639371c9d4SSatish Balay     xb += 5;
4649371c9d4SSatish Balay     ai++;
46549b5e25fSSatish Balay   }
46649b5e25fSSatish Balay 
4679566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4699566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
4703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
47149b5e25fSSatish Balay }
47249b5e25fSSatish Balay 
473d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
474d71ae5a4SJacob Faibussowitsch {
47549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
476d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
477d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
478d9ca1df4SBarry Smith   const MatScalar   *v;
479d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
480d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
48198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
48249b5e25fSSatish Balay 
48349b5e25fSSatish Balay   PetscFunctionBegin;
4849566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4853ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4869566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4879566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
48849b5e25fSSatish Balay 
48949b5e25fSSatish Balay   v  = a->a;
49049b5e25fSSatish Balay   xb = x;
49149b5e25fSSatish Balay 
49249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
49349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4949371c9d4SSatish Balay     x1   = xb[0];
4959371c9d4SSatish Balay     x2   = xb[1];
4969371c9d4SSatish Balay     x3   = xb[2];
4979371c9d4SSatish Balay     x4   = xb[3];
4989371c9d4SSatish Balay     x5   = xb[4];
4999371c9d4SSatish Balay     x6   = xb[5];
50049b5e25fSSatish Balay     ib   = aj + *ai;
501831a3094SHong Zhang     jmin = 0;
50298c9bda7SSatish Balay     nonzerorow += (n > 0);
5037fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
50449b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
50549b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
50649b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
50749b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
50849b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
50949b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
5109371c9d4SSatish Balay       v += 36;
5119371c9d4SSatish Balay       jmin++;
5127fbae186SHong Zhang     }
513444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
514444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
515831a3094SHong Zhang     for (j = jmin; j < n; j++) {
51649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
51749b5e25fSSatish Balay       cval = ib[j] * 6;
51849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
51949b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
52049b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
52149b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
52249b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
52349b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
52449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
52549b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
52649b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
52749b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
52849b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
52949b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
53049b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
53149b5e25fSSatish Balay       v += 36;
53249b5e25fSSatish Balay     }
5339371c9d4SSatish Balay     xb += 6;
5349371c9d4SSatish Balay     ai++;
53549b5e25fSSatish Balay   }
53649b5e25fSSatish Balay 
5379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5389566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5399566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
5403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
54149b5e25fSSatish Balay }
542c2916339SPierre Jolivet 
543d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
544d71ae5a4SJacob Faibussowitsch {
54549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
546d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
547d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
548d9ca1df4SBarry Smith   const MatScalar   *v;
549d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
550d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
55198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
55249b5e25fSSatish Balay 
55349b5e25fSSatish Balay   PetscFunctionBegin;
5549566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
5553ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
5569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5579566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
55849b5e25fSSatish Balay 
55949b5e25fSSatish Balay   v  = a->a;
56049b5e25fSSatish Balay   xb = x;
56149b5e25fSSatish Balay 
56249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
56349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5649371c9d4SSatish Balay     x1   = xb[0];
5659371c9d4SSatish Balay     x2   = xb[1];
5669371c9d4SSatish Balay     x3   = xb[2];
5679371c9d4SSatish Balay     x4   = xb[3];
5689371c9d4SSatish Balay     x5   = xb[4];
5699371c9d4SSatish Balay     x6   = xb[5];
5709371c9d4SSatish Balay     x7   = xb[6];
57149b5e25fSSatish Balay     ib   = aj + *ai;
572831a3094SHong Zhang     jmin = 0;
57398c9bda7SSatish Balay     nonzerorow += (n > 0);
5747fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
57549b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
57649b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
57749b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
57849b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
57949b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
58049b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
58149b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
5829371c9d4SSatish Balay       v += 49;
5839371c9d4SSatish Balay       jmin++;
5847fbae186SHong Zhang     }
585444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
586444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
587831a3094SHong Zhang     for (j = jmin; j < n; j++) {
58849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
58949b5e25fSSatish Balay       cval = ib[j] * 7;
59049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
59149b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
59249b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
59349b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
59449b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
59549b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
59649b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
59749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
59849b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
59949b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
60049b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
60149b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
60249b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
60349b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
60449b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
60549b5e25fSSatish Balay       v += 49;
60649b5e25fSSatish Balay     }
6079371c9d4SSatish Balay     xb += 7;
6089371c9d4SSatish Balay     ai++;
60949b5e25fSSatish Balay   }
6109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
6133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
61449b5e25fSSatish Balay }
61549b5e25fSSatish Balay 
61649b5e25fSSatish Balay /*
61749b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
61849b5e25fSSatish Balay */
619d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
620d71ae5a4SJacob Faibussowitsch {
62149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
622d9ca1df4SBarry Smith   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
623d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
624d9ca1df4SBarry Smith   const MatScalar   *v;
625d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
626d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
62798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
62849b5e25fSSatish Balay 
62949b5e25fSSatish Balay   PetscFunctionBegin;
6309566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
6313ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
6329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
63459689ffbSStefano Zampini 
63559689ffbSStefano Zampini   x_ptr = x;
63659689ffbSStefano Zampini   z_ptr = z;
63749b5e25fSSatish Balay 
63849b5e25fSSatish Balay   aj = a->j;
63949b5e25fSSatish Balay   v  = a->a;
64049b5e25fSSatish Balay   ii = a->i;
64149b5e25fSSatish Balay 
64248a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
64349b5e25fSSatish Balay   work = a->mult_work;
64449b5e25fSSatish Balay 
64549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
6469371c9d4SSatish Balay     n     = ii[1] - ii[0];
6479371c9d4SSatish Balay     ncols = n * bs;
6489371c9d4SSatish Balay     workt = work;
6499371c9d4SSatish Balay     idx   = aj + ii[0];
65098c9bda7SSatish Balay     nonzerorow += (n > 0);
65149b5e25fSSatish Balay 
65249b5e25fSSatish Balay     /* upper triangular part */
65349b5e25fSSatish Balay     for (j = 0; j < n; j++) {
65449b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
65549b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
65649b5e25fSSatish Balay       workt += bs;
65749b5e25fSSatish Balay     }
65849b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
65996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
66049b5e25fSSatish Balay 
66149b5e25fSSatish Balay     /* strict lower triangular part */
662831a3094SHong Zhang     idx = aj + ii[0];
66359689ffbSStefano Zampini     if (n && *idx == i) {
6649371c9d4SSatish Balay       ncols -= bs;
6659371c9d4SSatish Balay       v += bs2;
6669371c9d4SSatish Balay       idx++;
6679371c9d4SSatish Balay       n--;
668831a3094SHong Zhang     }
66996b9376eSHong Zhang 
67049b5e25fSSatish Balay     if (ncols > 0) {
67149b5e25fSSatish Balay       workt = work;
6729566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
67396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
674831a3094SHong Zhang       for (j = 0; j < n; j++) {
675831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
67649b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
67749b5e25fSSatish Balay         workt += bs;
67849b5e25fSSatish Balay       }
67949b5e25fSSatish Balay     }
6809371c9d4SSatish Balay     x += bs;
6819371c9d4SSatish Balay     v += n * bs2;
6829371c9d4SSatish Balay     z += bs;
6839371c9d4SSatish Balay     ii++;
68449b5e25fSSatish Balay   }
68549b5e25fSSatish Balay 
6869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6879566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6889566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
6893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
69049b5e25fSSatish Balay }
69149b5e25fSSatish Balay 
692d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
693d71ae5a4SJacob Faibussowitsch {
69449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
695d9ca1df4SBarry Smith   PetscScalar       *z, x1;
696d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
697d9ca1df4SBarry Smith   const MatScalar   *v;
698d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
699d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
70098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
701eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
702b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
703eb1ec7c1SStefano Zampini #else
704eb1ec7c1SStefano Zampini   const int aconj = 0;
705eb1ec7c1SStefano Zampini #endif
70649b5e25fSSatish Balay 
70749b5e25fSSatish Balay   PetscFunctionBegin;
7089566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
7093ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
7109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7119566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
71249b5e25fSSatish Balay   v  = a->a;
71349b5e25fSSatish Balay   xb = x;
71449b5e25fSSatish Balay 
71549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
71649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th row of A */
71749b5e25fSSatish Balay     x1   = xb[0];
71849b5e25fSSatish Balay     ib   = aj + *ai;
719831a3094SHong Zhang     jmin = 0;
72098c9bda7SSatish Balay     nonzerorow += (n > 0);
7213d9ade75SStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
7229371c9d4SSatish Balay       z[i] += *v++ * x[*ib++];
7239371c9d4SSatish Balay       jmin++;
724831a3094SHong Zhang     }
725eb1ec7c1SStefano Zampini     if (aconj) {
726eb1ec7c1SStefano Zampini       for (j = jmin; j < n; j++) {
727eb1ec7c1SStefano Zampini         cval = *ib;
728eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
729eb1ec7c1SStefano Zampini         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
730eb1ec7c1SStefano Zampini       }
731eb1ec7c1SStefano Zampini     } else {
732831a3094SHong Zhang       for (j = jmin; j < n; j++) {
73349b5e25fSSatish Balay         cval = *ib;
73449b5e25fSSatish Balay         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
73549b5e25fSSatish Balay         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
73649b5e25fSSatish Balay       }
737eb1ec7c1SStefano Zampini     }
7389371c9d4SSatish Balay     xb++;
7399371c9d4SSatish Balay     ai++;
74049b5e25fSSatish Balay   }
74149b5e25fSSatish Balay 
7429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7439566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
74449b5e25fSSatish Balay 
7459566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
7463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
74749b5e25fSSatish Balay }
74849b5e25fSSatish Balay 
749d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
750d71ae5a4SJacob Faibussowitsch {
75149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
752d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2;
753d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
754d9ca1df4SBarry Smith   const MatScalar   *v;
755d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
756d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
75798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
75849b5e25fSSatish Balay 
75949b5e25fSSatish Balay   PetscFunctionBegin;
7609566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
7613ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
7629566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7639566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
76449b5e25fSSatish Balay 
76549b5e25fSSatish Balay   v  = a->a;
76649b5e25fSSatish Balay   xb = x;
76749b5e25fSSatish Balay 
76849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
76949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
7709371c9d4SSatish Balay     x1   = xb[0];
7719371c9d4SSatish Balay     x2   = xb[1];
77249b5e25fSSatish Balay     ib   = aj + *ai;
773831a3094SHong Zhang     jmin = 0;
77498c9bda7SSatish Balay     nonzerorow += (n > 0);
77559689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
77649b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
77749b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
7789371c9d4SSatish Balay       v += 4;
7799371c9d4SSatish Balay       jmin++;
7807fbae186SHong Zhang     }
781444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
782444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
783831a3094SHong Zhang     for (j = jmin; j < n; j++) {
78449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
78549b5e25fSSatish Balay       cval = ib[j] * 2;
78649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
78749b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
78849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
78949b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
79049b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
79149b5e25fSSatish Balay       v += 4;
79249b5e25fSSatish Balay     }
7939371c9d4SSatish Balay     xb += 2;
7949371c9d4SSatish Balay     ai++;
79549b5e25fSSatish Balay   }
7969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
79849b5e25fSSatish Balay 
7999566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
8003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
80149b5e25fSSatish Balay }
80249b5e25fSSatish Balay 
803d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
804d71ae5a4SJacob Faibussowitsch {
80549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
806d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3;
807d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
808d9ca1df4SBarry Smith   const MatScalar   *v;
809d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
810d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
81198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
81249b5e25fSSatish Balay 
81349b5e25fSSatish Balay   PetscFunctionBegin;
8149566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8153ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8169566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8179566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
81849b5e25fSSatish Balay 
81949b5e25fSSatish Balay   v  = a->a;
82049b5e25fSSatish Balay   xb = x;
82149b5e25fSSatish Balay 
82249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
82349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8249371c9d4SSatish Balay     x1   = xb[0];
8259371c9d4SSatish Balay     x2   = xb[1];
8269371c9d4SSatish Balay     x3   = xb[2];
82749b5e25fSSatish Balay     ib   = aj + *ai;
828831a3094SHong Zhang     jmin = 0;
82998c9bda7SSatish Balay     nonzerorow += (n > 0);
83059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
83149b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
83249b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
83349b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
8349371c9d4SSatish Balay       v += 9;
8359371c9d4SSatish Balay       jmin++;
8367fbae186SHong Zhang     }
837444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839831a3094SHong Zhang     for (j = jmin; j < n; j++) {
84049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
84149b5e25fSSatish Balay       cval = ib[j] * 3;
84249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
84349b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
84449b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
84549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
84649b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
84749b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
84849b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
84949b5e25fSSatish Balay       v += 9;
85049b5e25fSSatish Balay     }
8519371c9d4SSatish Balay     xb += 3;
8529371c9d4SSatish Balay     ai++;
85349b5e25fSSatish Balay   }
85449b5e25fSSatish Balay 
8559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
85749b5e25fSSatish Balay 
8589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
8593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
86049b5e25fSSatish Balay }
86149b5e25fSSatish Balay 
862d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
863d71ae5a4SJacob Faibussowitsch {
86449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
865d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4;
866d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
867d9ca1df4SBarry Smith   const MatScalar   *v;
868d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
869d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
87098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
87149b5e25fSSatish Balay 
87249b5e25fSSatish Balay   PetscFunctionBegin;
8739566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8743ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8769566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
87749b5e25fSSatish Balay 
87849b5e25fSSatish Balay   v  = a->a;
87949b5e25fSSatish Balay   xb = x;
88049b5e25fSSatish Balay 
88149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
88249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8839371c9d4SSatish Balay     x1   = xb[0];
8849371c9d4SSatish Balay     x2   = xb[1];
8859371c9d4SSatish Balay     x3   = xb[2];
8869371c9d4SSatish Balay     x4   = xb[3];
88749b5e25fSSatish Balay     ib   = aj + *ai;
888831a3094SHong Zhang     jmin = 0;
88998c9bda7SSatish Balay     nonzerorow += (n > 0);
89059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
89149b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
89249b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
89349b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
89449b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
8959371c9d4SSatish Balay       v += 16;
8969371c9d4SSatish Balay       jmin++;
8977fbae186SHong Zhang     }
898444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
899444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
900831a3094SHong Zhang     for (j = jmin; j < n; j++) {
90149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
90249b5e25fSSatish Balay       cval = ib[j] * 4;
90349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
90449b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
90549b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
90649b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
90749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
90849b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
90949b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
91049b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
91149b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
91249b5e25fSSatish Balay       v += 16;
91349b5e25fSSatish Balay     }
9149371c9d4SSatish Balay     xb += 4;
9159371c9d4SSatish Balay     ai++;
91649b5e25fSSatish Balay   }
91749b5e25fSSatish Balay 
9189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9199566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
92049b5e25fSSatish Balay 
9219566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
9223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
92349b5e25fSSatish Balay }
92449b5e25fSSatish Balay 
925d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
926d71ae5a4SJacob Faibussowitsch {
92749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
928d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5;
929d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
930d9ca1df4SBarry Smith   const MatScalar   *v;
931d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
932d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
93398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
93449b5e25fSSatish Balay 
93549b5e25fSSatish Balay   PetscFunctionBegin;
9369566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
9373ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
9389566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9399566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
94049b5e25fSSatish Balay 
94149b5e25fSSatish Balay   v  = a->a;
94249b5e25fSSatish Balay   xb = x;
94349b5e25fSSatish Balay 
94449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
94549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9469371c9d4SSatish Balay     x1   = xb[0];
9479371c9d4SSatish Balay     x2   = xb[1];
9489371c9d4SSatish Balay     x3   = xb[2];
9499371c9d4SSatish Balay     x4   = xb[3];
9509371c9d4SSatish Balay     x5   = xb[4];
95149b5e25fSSatish Balay     ib   = aj + *ai;
952831a3094SHong Zhang     jmin = 0;
95398c9bda7SSatish Balay     nonzerorow += (n > 0);
95459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
95549b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
95649b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
95749b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
95849b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
95949b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
9609371c9d4SSatish Balay       v += 25;
9619371c9d4SSatish Balay       jmin++;
9627fbae186SHong Zhang     }
963444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
964444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
965831a3094SHong Zhang     for (j = jmin; j < n; j++) {
96649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
96749b5e25fSSatish Balay       cval = ib[j] * 5;
96849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
96949b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
97049b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
97149b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
97249b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
97349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
97449b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
97549b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
97649b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
97749b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
97849b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
97949b5e25fSSatish Balay       v += 25;
98049b5e25fSSatish Balay     }
9819371c9d4SSatish Balay     xb += 5;
9829371c9d4SSatish Balay     ai++;
98349b5e25fSSatish Balay   }
98449b5e25fSSatish Balay 
9859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
98749b5e25fSSatish Balay 
9889566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
9893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
99049b5e25fSSatish Balay }
991c2916339SPierre Jolivet 
992d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
993d71ae5a4SJacob Faibussowitsch {
99449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
995d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
996d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
997d9ca1df4SBarry Smith   const MatScalar   *v;
998d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
999d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
100098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
100149b5e25fSSatish Balay 
100249b5e25fSSatish Balay   PetscFunctionBegin;
10039566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
10043ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
10059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10069566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
100749b5e25fSSatish Balay 
100849b5e25fSSatish Balay   v  = a->a;
100949b5e25fSSatish Balay   xb = x;
101049b5e25fSSatish Balay 
101149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
101249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10139371c9d4SSatish Balay     x1   = xb[0];
10149371c9d4SSatish Balay     x2   = xb[1];
10159371c9d4SSatish Balay     x3   = xb[2];
10169371c9d4SSatish Balay     x4   = xb[3];
10179371c9d4SSatish Balay     x5   = xb[4];
10189371c9d4SSatish Balay     x6   = xb[5];
101949b5e25fSSatish Balay     ib   = aj + *ai;
1020831a3094SHong Zhang     jmin = 0;
102198c9bda7SSatish Balay     nonzerorow += (n > 0);
102259689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
102349b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
102449b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
102549b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
102649b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
102749b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
102849b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
10299371c9d4SSatish Balay       v += 36;
10309371c9d4SSatish Balay       jmin++;
10317fbae186SHong Zhang     }
1032444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1033444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034831a3094SHong Zhang     for (j = jmin; j < n; j++) {
103549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
103649b5e25fSSatish Balay       cval = ib[j] * 6;
103749b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
103849b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
103949b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
104049b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
104149b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
104249b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
104349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
104449b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
104549b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
104649b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
104749b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
104849b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
104949b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
105049b5e25fSSatish Balay       v += 36;
105149b5e25fSSatish Balay     }
10529371c9d4SSatish Balay     xb += 6;
10539371c9d4SSatish Balay     ai++;
105449b5e25fSSatish Balay   }
105549b5e25fSSatish Balay 
10569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
105849b5e25fSSatish Balay 
10599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
10603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
106149b5e25fSSatish Balay }
106249b5e25fSSatish Balay 
1063d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1064d71ae5a4SJacob Faibussowitsch {
106549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1066d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1067d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1068d9ca1df4SBarry Smith   const MatScalar   *v;
1069d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1070d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
107198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
107249b5e25fSSatish Balay 
107349b5e25fSSatish Balay   PetscFunctionBegin;
10749566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
10753ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
10769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10779566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
107849b5e25fSSatish Balay 
107949b5e25fSSatish Balay   v  = a->a;
108049b5e25fSSatish Balay   xb = x;
108149b5e25fSSatish Balay 
108249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
108349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10849371c9d4SSatish Balay     x1   = xb[0];
10859371c9d4SSatish Balay     x2   = xb[1];
10869371c9d4SSatish Balay     x3   = xb[2];
10879371c9d4SSatish Balay     x4   = xb[3];
10889371c9d4SSatish Balay     x5   = xb[4];
10899371c9d4SSatish Balay     x6   = xb[5];
10909371c9d4SSatish Balay     x7   = xb[6];
109149b5e25fSSatish Balay     ib   = aj + *ai;
1092831a3094SHong Zhang     jmin = 0;
109398c9bda7SSatish Balay     nonzerorow += (n > 0);
109459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
109549b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
109649b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
109749b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
109849b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
109949b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
110049b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
110149b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
11029371c9d4SSatish Balay       v += 49;
11039371c9d4SSatish Balay       jmin++;
11047fbae186SHong Zhang     }
1105444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1106444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1107831a3094SHong Zhang     for (j = jmin; j < n; j++) {
110849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
110949b5e25fSSatish Balay       cval = ib[j] * 7;
111049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
111149b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
111249b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
111349b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
111449b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
111549b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
111649b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
111749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
111849b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
111949b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
112049b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
112149b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
112249b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
112349b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
112449b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
112549b5e25fSSatish Balay       v += 49;
112649b5e25fSSatish Balay     }
11279371c9d4SSatish Balay     xb += 7;
11289371c9d4SSatish Balay     ai++;
112949b5e25fSSatish Balay   }
113049b5e25fSSatish Balay 
11319566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11329566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
113349b5e25fSSatish Balay 
11349566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
11353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
113649b5e25fSSatish Balay }
113749b5e25fSSatish Balay 
1138d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1139d71ae5a4SJacob Faibussowitsch {
114049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1141f4259b30SLisandro Dalcin   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1142d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
1143d9ca1df4SBarry Smith   const MatScalar   *v;
1144d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1145d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
114698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
114749b5e25fSSatish Balay 
114849b5e25fSSatish Balay   PetscFunctionBegin;
11499566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
11503ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
11519371c9d4SSatish Balay   PetscCall(VecGetArrayRead(xx, &x));
11529371c9d4SSatish Balay   x_ptr = x;
11539371c9d4SSatish Balay   PetscCall(VecGetArray(zz, &z));
11549371c9d4SSatish Balay   z_ptr = z;
115549b5e25fSSatish Balay 
115649b5e25fSSatish Balay   aj = a->j;
115749b5e25fSSatish Balay   v  = a->a;
115849b5e25fSSatish Balay   ii = a->i;
115949b5e25fSSatish Balay 
116048a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
116149b5e25fSSatish Balay   work = a->mult_work;
116249b5e25fSSatish Balay 
116349b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
11649371c9d4SSatish Balay     n     = ii[1] - ii[0];
11659371c9d4SSatish Balay     ncols = n * bs;
11669371c9d4SSatish Balay     workt = work;
11679371c9d4SSatish Balay     idx   = aj + ii[0];
116898c9bda7SSatish Balay     nonzerorow += (n > 0);
116949b5e25fSSatish Balay 
117049b5e25fSSatish Balay     /* upper triangular part */
117149b5e25fSSatish Balay     for (j = 0; j < n; j++) {
117249b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
117349b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
117449b5e25fSSatish Balay       workt += bs;
117549b5e25fSSatish Balay     }
117649b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
117796b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
117849b5e25fSSatish Balay 
117949b5e25fSSatish Balay     /* strict lower triangular part */
1180831a3094SHong Zhang     idx = aj + ii[0];
118159689ffbSStefano Zampini     if (n && *idx == i) {
11829371c9d4SSatish Balay       ncols -= bs;
11839371c9d4SSatish Balay       v += bs2;
11849371c9d4SSatish Balay       idx++;
11859371c9d4SSatish Balay       n--;
1186831a3094SHong Zhang     }
118749b5e25fSSatish Balay     if (ncols > 0) {
118849b5e25fSSatish Balay       workt = work;
11899566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
119096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1191831a3094SHong Zhang       for (j = 0; j < n; j++) {
1192831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
119349b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
119449b5e25fSSatish Balay         workt += bs;
119549b5e25fSSatish Balay       }
119649b5e25fSSatish Balay     }
119749b5e25fSSatish Balay 
11989371c9d4SSatish Balay     x += bs;
11999371c9d4SSatish Balay     v += n * bs2;
12009371c9d4SSatish Balay     z += bs;
12019371c9d4SSatish Balay     ii++;
120249b5e25fSSatish Balay   }
120349b5e25fSSatish Balay 
12049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
120649b5e25fSSatish Balay 
12079566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
12083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
120949b5e25fSSatish Balay }
121049b5e25fSSatish Balay 
1211d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1212d71ae5a4SJacob Faibussowitsch {
121349b5e25fSSatish Balay   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1214f4df32b1SMatthew Knepley   PetscScalar   oalpha = alpha;
1215c5df96a5SBarry Smith   PetscBLASInt  one    = 1, totalnz;
121649b5e25fSSatish Balay 
121749b5e25fSSatish Balay   PetscFunctionBegin;
12189566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1219792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
12209566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
12213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
122249b5e25fSSatish Balay }
122349b5e25fSSatish Balay 
1224d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1225d71ae5a4SJacob Faibussowitsch {
122649b5e25fSSatish Balay   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1227d9ca1df4SBarry Smith   const MatScalar *v        = a->a;
122849b5e25fSSatish Balay   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1229d9ca1df4SBarry Smith   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1230d9ca1df4SBarry Smith   const PetscInt  *aj = a->j, *col;
123149b5e25fSSatish Balay 
123249b5e25fSSatish Balay   PetscFunctionBegin;
1233c40ae873SPierre Jolivet   if (!a->nz) {
1234c40ae873SPierre Jolivet     *norm = 0.0;
12353ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1236c40ae873SPierre Jolivet   }
123749b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
123849b5e25fSSatish Balay     for (k = 0; k < mbs; k++) {
123959689ffbSStefano Zampini       jmin = a->i[k];
124059689ffbSStefano Zampini       jmax = a->i[k + 1];
1241831a3094SHong Zhang       col  = aj + jmin;
124259689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
124349b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12449371c9d4SSatish Balay           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
12459371c9d4SSatish Balay           v++;
124649b5e25fSSatish Balay         }
1247831a3094SHong Zhang         jmin++;
1248831a3094SHong Zhang       }
1249831a3094SHong Zhang       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
125049b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12519371c9d4SSatish Balay           sum_off += PetscRealPart(PetscConj(*v) * (*v));
12529371c9d4SSatish Balay           v++;
125349b5e25fSSatish Balay         }
125449b5e25fSSatish Balay       }
125549b5e25fSSatish Balay     }
12568f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
12579566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
12580b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
12599566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
12600b8dc8d2SHong Zhang     for (i = 0; i < mbs; i++) jl[i] = mbs;
12610b8dc8d2SHong Zhang     il[0] = 0;
126249b5e25fSSatish Balay 
126349b5e25fSSatish Balay     *norm = 0.0;
126449b5e25fSSatish Balay     for (k = 0; k < mbs; k++) { /* k_th block row */
126549b5e25fSSatish Balay       for (j = 0; j < bs; j++) sum[j] = 0.0;
126649b5e25fSSatish Balay       /*-- col sum --*/
126749b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1268bbea24aaSStefano Zampini       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1269831a3094SHong Zhang                   at step k */
127049b5e25fSSatish Balay       while (i < mbs) {
127149b5e25fSSatish Balay         nexti = jl[i]; /* next block row to be added */
127249b5e25fSSatish Balay         ik    = il[i]; /* block index of A(i,k) in the array a */
127349b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
127449b5e25fSSatish Balay           v = a->a + ik * bs2 + j * bs;
127549b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
12769371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
12779371c9d4SSatish Balay             v++;
127849b5e25fSSatish Balay           }
127949b5e25fSSatish Balay         }
128049b5e25fSSatish Balay         /* update il, jl */
1281831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1282831a3094SHong Zhang         jmax = a->i[i + 1];
128349b5e25fSSatish Balay         if (jmin < jmax) {
128449b5e25fSSatish Balay           il[i] = jmin;
128549b5e25fSSatish Balay           j     = a->j[jmin];
12869371c9d4SSatish Balay           jl[i] = jl[j];
12879371c9d4SSatish Balay           jl[j] = i;
128849b5e25fSSatish Balay         }
128949b5e25fSSatish Balay         i = nexti;
129049b5e25fSSatish Balay       }
129149b5e25fSSatish Balay       /*-- row sum --*/
129259689ffbSStefano Zampini       jmin = a->i[k];
129359689ffbSStefano Zampini       jmax = a->i[k + 1];
129449b5e25fSSatish Balay       for (i = jmin; i < jmax; i++) {
129549b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
129649b5e25fSSatish Balay           v = a->a + i * bs2 + j;
129749b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
12989371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
12999371c9d4SSatish Balay             v += bs;
130049b5e25fSSatish Balay           }
130149b5e25fSSatish Balay         }
130249b5e25fSSatish Balay       }
130349b5e25fSSatish Balay       /* add k_th block row to il, jl */
1304831a3094SHong Zhang       col = aj + jmin;
130559689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
130649b5e25fSSatish Balay       if (jmin < jmax) {
130749b5e25fSSatish Balay         il[k] = jmin;
13089371c9d4SSatish Balay         j     = a->j[jmin];
13099371c9d4SSatish Balay         jl[k] = jl[j];
13109371c9d4SSatish Balay         jl[j] = k;
131149b5e25fSSatish Balay       }
131249b5e25fSSatish Balay       for (j = 0; j < bs; j++) {
131349b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
131449b5e25fSSatish Balay       }
131549b5e25fSSatish Balay     }
13169566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum, il, jl));
13179566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1318f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
13193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
132049b5e25fSSatish Balay }
132149b5e25fSSatish Balay 
1322d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1323d71ae5a4SJacob Faibussowitsch {
132449b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
132549b5e25fSSatish Balay 
132649b5e25fSSatish Balay   PetscFunctionBegin;
132749b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1328d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1329ef511fbeSHong Zhang     *flg = PETSC_FALSE;
13303ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
133149b5e25fSSatish Balay   }
133249b5e25fSSatish Balay 
133349b5e25fSSatish Balay   /* if the a->i are the same */
13349566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
13353ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
133649b5e25fSSatish Balay 
133749b5e25fSSatish Balay   /* if a->j are the same */
13389566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
13393ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
134026fbe8dcSKarl Rupp 
134149b5e25fSSatish Balay   /* if a->a are the same */
13429566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
13433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
134449b5e25fSSatish Balay }
134549b5e25fSSatish Balay 
1346d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1347d71ae5a4SJacob Faibussowitsch {
134849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1349d9ca1df4SBarry Smith   PetscInt         i, j, k, row, bs, ambs, bs2;
1350d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
135187828ca2SBarry Smith   PetscScalar     *x, zero = 0.0;
1352d9ca1df4SBarry Smith   const MatScalar *aa, *aa_j;
135349b5e25fSSatish Balay 
135449b5e25fSSatish Balay   PetscFunctionBegin;
1355d0f46423SBarry Smith   bs = A->rmap->bs;
1356aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
135782799104SHong Zhang 
135849b5e25fSSatish Balay   aa   = a->a;
13598a0c6efdSHong Zhang   ambs = a->mbs;
13608a0c6efdSHong Zhang 
13618a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13628a0c6efdSHong Zhang     PetscInt *diag = a->diag;
13638a0c6efdSHong Zhang     aa             = a->a;
13648a0c6efdSHong Zhang     ambs           = a->mbs;
13659566063dSJacob Faibussowitsch     PetscCall(VecGetArray(v, &x));
13668a0c6efdSHong Zhang     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
13679566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(v, &x));
13683ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
13698a0c6efdSHong Zhang   }
13708a0c6efdSHong Zhang 
137149b5e25fSSatish Balay   ai  = a->i;
137249b5e25fSSatish Balay   aj  = a->j;
137349b5e25fSSatish Balay   bs2 = a->bs2;
13749566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
13753ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
13769566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
137749b5e25fSSatish Balay   for (i = 0; i < ambs; i++) {
137849b5e25fSSatish Balay     j = ai[i];
137949b5e25fSSatish Balay     if (aj[j] == i) { /* if this is a diagonal element */
138049b5e25fSSatish Balay       row  = i * bs;
138149b5e25fSSatish Balay       aa_j = aa + j * bs2;
138249b5e25fSSatish Balay       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
138349b5e25fSSatish Balay     }
138449b5e25fSSatish Balay   }
13859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
13863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138749b5e25fSSatish Balay }
138849b5e25fSSatish Balay 
1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1390d71ae5a4SJacob Faibussowitsch {
139149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1392d9ca1df4SBarry Smith   PetscScalar        x;
1393d9ca1df4SBarry Smith   const PetscScalar *l, *li, *ri;
139449b5e25fSSatish Balay   MatScalar         *aa, *v;
1395fff8e43fSBarry Smith   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1396fff8e43fSBarry Smith   const PetscInt    *ai, *aj;
1397ace3abfcSBarry Smith   PetscBool          flg;
139849b5e25fSSatish Balay 
139949b5e25fSSatish Balay   PetscFunctionBegin;
1400b3bf805bSHong Zhang   if (ll != rr) {
14019566063dSJacob Faibussowitsch     PetscCall(VecEqual(ll, rr, &flg));
140228b400f6SJacob Faibussowitsch     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1403b3bf805bSHong Zhang   }
14043ba16761SJacob Faibussowitsch   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
140549b5e25fSSatish Balay   ai  = a->i;
140649b5e25fSSatish Balay   aj  = a->j;
140749b5e25fSSatish Balay   aa  = a->a;
1408d0f46423SBarry Smith   m   = A->rmap->N;
1409d0f46423SBarry Smith   bs  = A->rmap->bs;
141049b5e25fSSatish Balay   mbs = a->mbs;
141149b5e25fSSatish Balay   bs2 = a->bs2;
141249b5e25fSSatish Balay 
14139566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll, &l));
14149566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll, &lm));
141508401ef6SPierre Jolivet   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
141649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) { /* for each block row */
141749b5e25fSSatish Balay     M  = ai[i + 1] - ai[i];
141849b5e25fSSatish Balay     li = l + i * bs;
141949b5e25fSSatish Balay     v  = aa + bs2 * ai[i];
142049b5e25fSSatish Balay     for (j = 0; j < M; j++) { /* for each block */
142149b5e25fSSatish Balay       ri = l + bs * aj[ai[i] + j];
14225e90f9d9SHong Zhang       for (k = 0; k < bs; k++) {
142349b5e25fSSatish Balay         x = ri[k];
142449b5e25fSSatish Balay         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
142549b5e25fSSatish Balay       }
142649b5e25fSSatish Balay     }
142749b5e25fSSatish Balay   }
14289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll, &l));
14299566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
14303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
143149b5e25fSSatish Balay }
143249b5e25fSSatish Balay 
1433d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1434d71ae5a4SJacob Faibussowitsch {
143549b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
143649b5e25fSSatish Balay 
143749b5e25fSSatish Balay   PetscFunctionBegin;
143849b5e25fSSatish Balay   info->block_size   = a->bs2;
1439ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
14406c6c5352SBarry Smith   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
14413966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
144249b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14438e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14444dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
1445d5f3da31SBarry Smith   if (A->factortype) {
144649b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
144749b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
144849b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
144949b5e25fSSatish Balay   } else {
145049b5e25fSSatish Balay     info->fill_ratio_given  = 0;
145149b5e25fSSatish Balay     info->fill_ratio_needed = 0;
145249b5e25fSSatish Balay     info->factor_mallocs    = 0;
145349b5e25fSSatish Balay   }
14543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
145549b5e25fSSatish Balay }
145649b5e25fSSatish Balay 
1457d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1458d71ae5a4SJacob Faibussowitsch {
145949b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
146049b5e25fSSatish Balay 
146149b5e25fSSatish Balay   PetscFunctionBegin;
14629566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
14633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
146449b5e25fSSatish Balay }
1465dc354874SHong Zhang 
1466d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1467d71ae5a4SJacob Faibussowitsch {
1468dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1469d9ca1df4SBarry Smith   PetscInt         i, j, n, row, col, bs, mbs;
1470d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
1471c3fca9a7SHong Zhang   PetscReal        atmp;
1472d9ca1df4SBarry Smith   const MatScalar *aa;
1473985db425SBarry Smith   PetscScalar     *x;
147413f74950SBarry Smith   PetscInt         ncols, brow, bcol, krow, kcol;
1475dc354874SHong Zhang 
1476dc354874SHong Zhang   PetscFunctionBegin;
147728b400f6SJacob Faibussowitsch   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
147828b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1479d0f46423SBarry Smith   bs  = A->rmap->bs;
1480dc354874SHong Zhang   aa  = a->a;
1481dc354874SHong Zhang   ai  = a->i;
1482dc354874SHong Zhang   aj  = a->j;
148344117c81SHong Zhang   mbs = a->mbs;
1484dc354874SHong Zhang 
14859566063dSJacob Faibussowitsch   PetscCall(VecSet(v, 0.0));
14869566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
14879566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
148808401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
148944117c81SHong Zhang   for (i = 0; i < mbs; i++) {
14909371c9d4SSatish Balay     ncols = ai[1] - ai[0];
14919371c9d4SSatish Balay     ai++;
1492d0f6400bSHong Zhang     brow = bs * i;
149344117c81SHong Zhang     for (j = 0; j < ncols; j++) {
1494d0f6400bSHong Zhang       bcol = bs * (*aj);
149544117c81SHong Zhang       for (kcol = 0; kcol < bs; kcol++) {
1496d0f6400bSHong Zhang         col = bcol + kcol; /* col index */
149744117c81SHong Zhang         for (krow = 0; krow < bs; krow++) {
14989371c9d4SSatish Balay           atmp = PetscAbsScalar(*aa);
14999371c9d4SSatish Balay           aa++;
1500d0f6400bSHong Zhang           row = brow + krow; /* row index */
1501c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1502c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
150344117c81SHong Zhang         }
150444117c81SHong Zhang       }
1505d0f6400bSHong Zhang       aj++;
1506dc354874SHong Zhang     }
1507dc354874SHong Zhang   }
15089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
15093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1510dc354874SHong Zhang }
1511c2916339SPierre Jolivet 
1512d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1513d71ae5a4SJacob Faibussowitsch {
1514c2916339SPierre Jolivet   PetscFunctionBegin;
15159566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15164222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
15173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1518c2916339SPierre Jolivet }
1519c2916339SPierre Jolivet 
1520d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1521d71ae5a4SJacob Faibussowitsch {
1522c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1523c2916339SPierre Jolivet   PetscScalar       *z = c;
1524c2916339SPierre Jolivet   const PetscScalar *xb;
1525c2916339SPierre Jolivet   PetscScalar        x1;
1526c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1527c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
15283c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
1529b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
15303c2e41e1SStefano Zampini #else
15313c2e41e1SStefano Zampini   const int aconj = 0;
15323c2e41e1SStefano Zampini #endif
1533c2916339SPierre Jolivet 
1534c2916339SPierre Jolivet   PetscFunctionBegin;
1535c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15369371c9d4SSatish Balay     n = ii[1] - ii[0];
15379371c9d4SSatish Balay     ii++;
1538c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1539c2916339SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1540c2916339SPierre Jolivet     jj = idx;
1541c2916339SPierre Jolivet     vv = v;
1542c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1543c2916339SPierre Jolivet       idx = jj;
1544c2916339SPierre Jolivet       v   = vv;
1545c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
15469371c9d4SSatish Balay         xb = b + (*idx);
15479371c9d4SSatish Balay         x1 = xb[0 + k * bm];
1548c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1;
15493c2e41e1SStefano Zampini         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1550c2916339SPierre Jolivet         v += 1;
1551c2916339SPierre Jolivet         ++idx;
1552c2916339SPierre Jolivet       }
1553c2916339SPierre Jolivet     }
1554c2916339SPierre Jolivet     z += 1;
1555c2916339SPierre Jolivet   }
15563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1557c2916339SPierre Jolivet }
1558c2916339SPierre Jolivet 
1559d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1560d71ae5a4SJacob Faibussowitsch {
1561c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1562c2916339SPierre Jolivet   PetscScalar       *z = c;
1563c2916339SPierre Jolivet   const PetscScalar *xb;
1564c2916339SPierre Jolivet   PetscScalar        x1, x2;
1565c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1566c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1567c2916339SPierre Jolivet 
1568c2916339SPierre Jolivet   PetscFunctionBegin;
1569c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15709371c9d4SSatish Balay     n = ii[1] - ii[0];
15719371c9d4SSatish Balay     ii++;
1572c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1573c2916339SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1574c2916339SPierre Jolivet     jj = idx;
1575c2916339SPierre Jolivet     vv = v;
1576c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1577c2916339SPierre Jolivet       idx = jj;
1578c2916339SPierre Jolivet       v   = vv;
1579c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
15809371c9d4SSatish Balay         xb = b + 2 * (*idx);
15819371c9d4SSatish Balay         x1 = xb[0 + k * bm];
15829371c9d4SSatish Balay         x2 = xb[1 + k * bm];
1583c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1584c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1585c2916339SPierre Jolivet         if (*idx != i) {
1586c2916339SPierre Jolivet           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1587c2916339SPierre Jolivet           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1588c2916339SPierre Jolivet         }
1589c2916339SPierre Jolivet         v += 4;
1590c2916339SPierre Jolivet         ++idx;
1591c2916339SPierre Jolivet       }
1592c2916339SPierre Jolivet     }
1593c2916339SPierre Jolivet     z += 2;
1594c2916339SPierre Jolivet   }
15953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1596c2916339SPierre Jolivet }
1597c2916339SPierre Jolivet 
1598d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1599d71ae5a4SJacob Faibussowitsch {
1600c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1601c2916339SPierre Jolivet   PetscScalar       *z = c;
1602c2916339SPierre Jolivet   const PetscScalar *xb;
1603c2916339SPierre Jolivet   PetscScalar        x1, x2, x3;
1604c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1605c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1606c2916339SPierre Jolivet 
1607c2916339SPierre Jolivet   PetscFunctionBegin;
1608c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16099371c9d4SSatish Balay     n = ii[1] - ii[0];
16109371c9d4SSatish Balay     ii++;
1611c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1612c2916339SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1613c2916339SPierre Jolivet     jj = idx;
1614c2916339SPierre Jolivet     vv = v;
1615c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1616c2916339SPierre Jolivet       idx = jj;
1617c2916339SPierre Jolivet       v   = vv;
1618c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16199371c9d4SSatish Balay         xb = b + 3 * (*idx);
16209371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16219371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16229371c9d4SSatish Balay         x3 = xb[2 + k * bm];
1623c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1624c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1625c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1626c2916339SPierre Jolivet         if (*idx != i) {
1627c2916339SPierre Jolivet           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1628c2916339SPierre Jolivet           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1629c2916339SPierre Jolivet           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1630c2916339SPierre Jolivet         }
1631c2916339SPierre Jolivet         v += 9;
1632c2916339SPierre Jolivet         ++idx;
1633c2916339SPierre Jolivet       }
1634c2916339SPierre Jolivet     }
1635c2916339SPierre Jolivet     z += 3;
1636c2916339SPierre Jolivet   }
16373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1638c2916339SPierre Jolivet }
1639c2916339SPierre Jolivet 
1640d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1641d71ae5a4SJacob Faibussowitsch {
1642c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1643c2916339SPierre Jolivet   PetscScalar       *z = c;
1644c2916339SPierre Jolivet   const PetscScalar *xb;
1645c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
1646c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1647c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1648c2916339SPierre Jolivet 
1649c2916339SPierre Jolivet   PetscFunctionBegin;
1650c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16519371c9d4SSatish Balay     n = ii[1] - ii[0];
16529371c9d4SSatish Balay     ii++;
1653c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1654c2916339SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655c2916339SPierre Jolivet     jj = idx;
1656c2916339SPierre Jolivet     vv = v;
1657c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1658c2916339SPierre Jolivet       idx = jj;
1659c2916339SPierre Jolivet       v   = vv;
1660c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16619371c9d4SSatish Balay         xb = b + 4 * (*idx);
16629371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16639371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16649371c9d4SSatish Balay         x3 = xb[2 + k * bm];
16659371c9d4SSatish Balay         x4 = xb[3 + k * bm];
1666c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670c2916339SPierre Jolivet         if (*idx != i) {
1671c2916339SPierre Jolivet           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672c2916339SPierre Jolivet           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673c2916339SPierre Jolivet           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674c2916339SPierre Jolivet           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675c2916339SPierre Jolivet         }
1676c2916339SPierre Jolivet         v += 16;
1677c2916339SPierre Jolivet         ++idx;
1678c2916339SPierre Jolivet       }
1679c2916339SPierre Jolivet     }
1680c2916339SPierre Jolivet     z += 4;
1681c2916339SPierre Jolivet   }
16823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1683c2916339SPierre Jolivet }
1684c2916339SPierre Jolivet 
1685d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686d71ae5a4SJacob Faibussowitsch {
1687c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1688c2916339SPierre Jolivet   PetscScalar       *z = c;
1689c2916339SPierre Jolivet   const PetscScalar *xb;
1690c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
1691c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1692c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1693c2916339SPierre Jolivet 
1694c2916339SPierre Jolivet   PetscFunctionBegin;
1695c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16969371c9d4SSatish Balay     n = ii[1] - ii[0];
16979371c9d4SSatish Balay     ii++;
1698c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1699c2916339SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1700c2916339SPierre Jolivet     jj = idx;
1701c2916339SPierre Jolivet     vv = v;
1702c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1703c2916339SPierre Jolivet       idx = jj;
1704c2916339SPierre Jolivet       v   = vv;
1705c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17069371c9d4SSatish Balay         xb = b + 5 * (*idx);
17079371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17089371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17099371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17109371c9d4SSatish Balay         x4 = xb[3 + k * bm];
17119371c9d4SSatish Balay         x5 = xb[4 + k * cm];
1712c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1713c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1714c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1715c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1716c2916339SPierre Jolivet         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1717c2916339SPierre Jolivet         if (*idx != i) {
1718c2916339SPierre Jolivet           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1719c2916339SPierre Jolivet           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1720c2916339SPierre Jolivet           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1721c2916339SPierre Jolivet           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1722c2916339SPierre Jolivet           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1723c2916339SPierre Jolivet         }
1724c2916339SPierre Jolivet         v += 25;
1725c2916339SPierre Jolivet         ++idx;
1726c2916339SPierre Jolivet       }
1727c2916339SPierre Jolivet     }
1728c2916339SPierre Jolivet     z += 5;
1729c2916339SPierre Jolivet   }
17303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1731c2916339SPierre Jolivet }
1732c2916339SPierre Jolivet 
1733d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1734d71ae5a4SJacob Faibussowitsch {
1735c2916339SPierre Jolivet   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1736c2916339SPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1737281439baSStefano Zampini   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1738c2916339SPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1739c2916339SPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1740c2916339SPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
1741f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
1742c2916339SPierre Jolivet   PetscScalar     *c, *b;
1743c2916339SPierre Jolivet   const MatScalar *v;
1744c2916339SPierre Jolivet   const PetscInt  *idx, *ii;
1745c2916339SPierre Jolivet   PetscScalar      _DOne = 1.0;
1746c2916339SPierre Jolivet 
1747c2916339SPierre Jolivet   PetscFunctionBegin;
17483ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
174908401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
175008401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
175108401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1752c2916339SPierre Jolivet   b = bd->v;
17539566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
17549566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
1755c2916339SPierre Jolivet   switch (bs) {
1756d71ae5a4SJacob Faibussowitsch   case 1:
1757d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1758d71ae5a4SJacob Faibussowitsch     break;
1759d71ae5a4SJacob Faibussowitsch   case 2:
1760d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1761d71ae5a4SJacob Faibussowitsch     break;
1762d71ae5a4SJacob Faibussowitsch   case 3:
1763d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1764d71ae5a4SJacob Faibussowitsch     break;
1765d71ae5a4SJacob Faibussowitsch   case 4:
1766d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1767d71ae5a4SJacob Faibussowitsch     break;
1768d71ae5a4SJacob Faibussowitsch   case 5:
1769d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1770d71ae5a4SJacob Faibussowitsch     break;
1771c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
17729566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
17739566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
17749566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
17759566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
1776c2916339SPierre Jolivet     idx = a->j;
1777c2916339SPierre Jolivet     v   = a->a;
1778c2916339SPierre Jolivet     mbs = a->mbs;
1779c2916339SPierre Jolivet     ii  = a->i;
1780c2916339SPierre Jolivet     z   = c;
1781c2916339SPierre Jolivet     for (i = 0; i < mbs; i++) {
17829371c9d4SSatish Balay       n = ii[1] - ii[0];
17839371c9d4SSatish Balay       ii++;
1784c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
1785792fecdfSBarry Smith         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1786792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1787c2916339SPierre Jolivet         v += bs2;
1788c2916339SPierre Jolivet       }
1789c2916339SPierre Jolivet       z += bs;
1790c2916339SPierre Jolivet     }
1791c2916339SPierre Jolivet   }
17929566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
17939566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
17943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1795c2916339SPierre Jolivet }
1796