xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision da81f9329be15cc55f054c8a00978087195c9247)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10d71ae5a4SJacob Faibussowitsch {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
127bede89fSBarry Smith   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT         table_out, table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
1708401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18d94109b8SHong Zhang   mbs = a->mbs;
19d94109b8SHong Zhang   ai  = a->i;
20d94109b8SHong Zhang   aj  = a->j;
21d0f46423SBarry Smith   bs  = A->rmap->bs;
229566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_out));
239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs + 1, &nidx));
249566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_in));
25d94109b8SHong Zhang 
26d94109b8SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
27d94109b8SHong Zhang     isz = 0;
289566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs, table_out));
29d94109b8SHong Zhang 
30d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
319566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
329566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
33d94109b8SHong Zhang 
34db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35dbe03f88SHong Zhang     bcol_max = 0;
36d94109b8SHong Zhang     for (j = 0; j < n; ++j) {
37d94109b8SHong Zhang       brow = idx[j] / bs; /* convert the indices into block indices */
3808401ef6SPierre Jolivet       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out, brow)) {
40dbe03f88SHong Zhang         nidx[isz++] = brow;
41dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
42dbe03f88SHong Zhang       }
43d94109b8SHong Zhang     }
449566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
459566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
46d94109b8SHong Zhang 
47d94109b8SHong Zhang     k = 0;
48d94109b8SHong Zhang     for (j = 0; j < ov; j++) { /* for each overlap */
49db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
509566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs, table_in));
519566063dSJacob Faibussowitsch       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52d94109b8SHong Zhang 
53d94109b8SHong Zhang       n = isz; /* length of the updated is[i] */
54d94109b8SHong Zhang       for (brow = 0; brow < mbs; brow++) {
559371c9d4SSatish Balay         start = ai[brow];
569371c9d4SSatish Balay         end   = ai[brow + 1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l < end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out, bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67*da81f932SPierre Jolivet         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l < end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in, bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
797bede89fSBarry Smith     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
807bede89fSBarry Smith   } /* for each is */
819566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
829566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
839566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
845eee224dSHong Zhang   PetscFunctionReturn(0);
8549b5e25fSSatish Balay }
8649b5e25fSSatish Balay 
87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90d71ae5a4SJacob Faibussowitsch {
9149b5e25fSSatish Balay   PetscFunctionBegin;
929566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
94f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
95f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
96f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
97f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
98f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
99f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
100f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
101f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
10249b5e25fSSatish Balay   PetscFunctionReturn(0);
10349b5e25fSSatish Balay }
10449b5e25fSSatish Balay 
1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107d71ae5a4SJacob Faibussowitsch {
108e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109e50a8114SHong Zhang   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110e50a8114SHong Zhang   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111e50a8114SHong Zhang   const PetscInt *irow, *icol;
112e50a8114SHong Zhang   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113e50a8114SHong Zhang   PetscInt       *aj = a->j, *ai = a->i;
114e50a8114SHong Zhang   MatScalar      *mat_a;
115e50a8114SHong Zhang   Mat             C;
116e50a8114SHong Zhang   PetscBool       flag;
117e50a8114SHong Zhang 
118e50a8114SHong Zhang   PetscFunctionBegin;
119e50a8114SHong Zhang 
1209566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1219566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1229566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1239566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
124e50a8114SHong Zhang 
1259566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126e50a8114SHong Zhang   ssmap = smap;
1279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
128e50a8114SHong Zhang   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129e50a8114SHong Zhang   /* determine lens of each row */
130e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
131e50a8114SHong Zhang     kstart  = ai[irow[i]];
132e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
133e50a8114SHong Zhang     lens[i] = 0;
134e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
135e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
136e50a8114SHong Zhang     }
137e50a8114SHong Zhang   }
138e50a8114SHong Zhang   /* Create and fill new matrix */
139e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
140e50a8114SHong Zhang     c = (Mat_SeqSBAIJ *)((*B)->data);
141e50a8114SHong Zhang 
142aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1439566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
14428b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1459566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146e50a8114SHong Zhang     C = *B;
147e50a8114SHong Zhang   } else {
1489566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1499566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1509566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1519566063dSJacob Faibussowitsch     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152e50a8114SHong Zhang   }
153e50a8114SHong Zhang   c = (Mat_SeqSBAIJ *)(C->data);
154e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
155e50a8114SHong Zhang     row      = irow[i];
156e50a8114SHong Zhang     kstart   = ai[row];
157e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
158e50a8114SHong Zhang     mat_i    = c->i[i];
159e50a8114SHong Zhang     mat_j    = c->j + mat_i;
160e50a8114SHong Zhang     mat_a    = c->a + mat_i * bs2;
161e50a8114SHong Zhang     mat_ilen = c->ilen + i;
162e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
163e50a8114SHong Zhang       if ((tcol = ssmap[a->j[k]])) {
164e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1659566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166e50a8114SHong Zhang         mat_a += bs2;
167e50a8114SHong Zhang         (*mat_ilen)++;
168e50a8114SHong Zhang       }
169e50a8114SHong Zhang     }
170e50a8114SHong Zhang   }
171e50a8114SHong Zhang   /* sort */
172e50a8114SHong Zhang   {
173e50a8114SHong Zhang     MatScalar *work;
174e50a8114SHong Zhang 
1759566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
176e50a8114SHong Zhang     for (i = 0; i < nrows; i++) {
177e50a8114SHong Zhang       PetscInt ilen;
178e50a8114SHong Zhang       mat_i = c->i[i];
179e50a8114SHong Zhang       mat_j = c->j + mat_i;
180e50a8114SHong Zhang       mat_a = c->a + mat_i * bs2;
181e50a8114SHong Zhang       ilen  = c->ilen[i];
1829566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183e50a8114SHong Zhang     }
1849566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
185e50a8114SHong Zhang   }
186e50a8114SHong Zhang 
187e50a8114SHong Zhang   /* Free work space */
1889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1899566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1909566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1919566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1929566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
193e50a8114SHong Zhang 
1949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
195e50a8114SHong Zhang   *B = C;
196e50a8114SHong Zhang   PetscFunctionReturn(0);
197e50a8114SHong Zhang }
198e50a8114SHong Zhang 
199d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200d71ae5a4SJacob Faibussowitsch {
201e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
202e50a8114SHong Zhang   IS              is1, is2;
203e50a8114SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
204e50a8114SHong Zhang   const PetscInt *irow, *icol;
20549b5e25fSSatish Balay 
20649b5e25fSSatish Balay   PetscFunctionBegin;
2079566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
2089566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
2099566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
2109566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
211e50a8114SHong Zhang 
212*da81f932SPierre Jolivet   /* Verify if the indices correspond to each element in a block
213e50a8114SHong Zhang    and form the IS with compressed IS */
214e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs, a->nbs);
2159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
2169566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
217e50a8114SHong Zhang   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
218ad540459SPierre Jolivet   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
219e50a8114SHong Zhang   count = 0;
220e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
221e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
222e50a8114SHong Zhang     if ((vary[j]--) == bs) iary[count++] = j;
223e50a8114SHong Zhang   }
2249566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
225e50a8114SHong Zhang 
2269566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
227e50a8114SHong Zhang   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
228ad540459SPierre Jolivet   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
229e50a8114SHong Zhang   count = 0;
230e50a8114SHong Zhang   for (i = 0; i < ncols; i++) {
231e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
232e50a8114SHong Zhang     if ((vary[j]--) == bs) iary[count++] = j;
233e50a8114SHong Zhang   }
2349566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
2359566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
2369566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2379566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
238e50a8114SHong Zhang 
2399566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
2409566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2419566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
242847daeccSHong Zhang 
2438f46ffcaSHong Zhang   if (isrow != iscol) {
2448f46ffcaSHong Zhang     PetscBool isequal;
2459566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow, iscol, &isequal));
24648a46eb9SPierre Jolivet     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
24749b5e25fSSatish Balay   }
24849b5e25fSSatish Balay   PetscFunctionReturn(0);
24949b5e25fSSatish Balay }
25049b5e25fSSatish Balay 
251d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
252d71ae5a4SJacob Faibussowitsch {
25313f74950SBarry Smith   PetscInt i;
25449b5e25fSSatish Balay 
25549b5e25fSSatish Balay   PetscFunctionBegin;
25648a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
257e50a8114SHong Zhang 
25848a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
25949b5e25fSSatish Balay   PetscFunctionReturn(0);
26049b5e25fSSatish Balay }
26149b5e25fSSatish Balay 
26249b5e25fSSatish Balay /* -------------------------------------------------------*/
26349b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
26449b5e25fSSatish Balay /* -------------------------------------------------------*/
26549b5e25fSSatish Balay 
266d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
267d71ae5a4SJacob Faibussowitsch {
26849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
269d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, zero = 0.0;
270d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
271d9ca1df4SBarry Smith   const MatScalar   *v;
272d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
273d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
27498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
27549b5e25fSSatish Balay 
27649b5e25fSSatish Balay   PetscFunctionBegin;
2779566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
278c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
2799566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2809566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
28149b5e25fSSatish Balay 
28249b5e25fSSatish Balay   v  = a->a;
28349b5e25fSSatish Balay   xb = x;
28449b5e25fSSatish Balay 
28549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
28649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
2879371c9d4SSatish Balay     x1   = xb[0];
2889371c9d4SSatish Balay     x2   = xb[1];
28949b5e25fSSatish Balay     ib   = aj + *ai;
290831a3094SHong Zhang     jmin = 0;
29198c9bda7SSatish Balay     nonzerorow += (n > 0);
2927fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
29349b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
29449b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
2959371c9d4SSatish Balay       v += 4;
2969371c9d4SSatish Balay       jmin++;
2977fbae186SHong Zhang     }
298444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
299444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
300831a3094SHong Zhang     for (j = jmin; j < n; j++) {
30149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
30249b5e25fSSatish Balay       cval = ib[j] * 2;
30349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
30449b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
30549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
30649b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
30749b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
30849b5e25fSSatish Balay       v += 4;
30949b5e25fSSatish Balay     }
3109371c9d4SSatish Balay     xb += 2;
3119371c9d4SSatish Balay     ai++;
31249b5e25fSSatish Balay   }
31349b5e25fSSatish Balay 
3149566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3169566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
31749b5e25fSSatish Balay   PetscFunctionReturn(0);
31849b5e25fSSatish Balay }
31949b5e25fSSatish Balay 
320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
321d71ae5a4SJacob Faibussowitsch {
32249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
323d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, zero = 0.0;
324d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
325d9ca1df4SBarry Smith   const MatScalar   *v;
326d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
327d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
32898c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
32949b5e25fSSatish Balay 
33049b5e25fSSatish Balay   PetscFunctionBegin;
3319566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
332c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
3339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3349566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
33549b5e25fSSatish Balay 
33649b5e25fSSatish Balay   v  = a->a;
33749b5e25fSSatish Balay   xb = x;
33849b5e25fSSatish Balay 
33949b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
34049b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3419371c9d4SSatish Balay     x1   = xb[0];
3429371c9d4SSatish Balay     x2   = xb[1];
3439371c9d4SSatish Balay     x3   = xb[2];
34449b5e25fSSatish Balay     ib   = aj + *ai;
345831a3094SHong Zhang     jmin = 0;
34698c9bda7SSatish Balay     nonzerorow += (n > 0);
3477fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
34849b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
34949b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
35049b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3519371c9d4SSatish Balay       v += 9;
3529371c9d4SSatish Balay       jmin++;
3537fbae186SHong Zhang     }
354444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356831a3094SHong Zhang     for (j = jmin; j < n; j++) {
35749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
35849b5e25fSSatish Balay       cval = ib[j] * 3;
35949b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
36049b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
36149b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
36249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
36349b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
36449b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
36549b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
36649b5e25fSSatish Balay       v += 9;
36749b5e25fSSatish Balay     }
3689371c9d4SSatish Balay     xb += 3;
3699371c9d4SSatish Balay     ai++;
37049b5e25fSSatish Balay   }
37149b5e25fSSatish Balay 
3729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3739566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
37549b5e25fSSatish Balay   PetscFunctionReturn(0);
37649b5e25fSSatish Balay }
37749b5e25fSSatish Balay 
378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
379d71ae5a4SJacob Faibussowitsch {
38049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
381d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
382d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
383d9ca1df4SBarry Smith   const MatScalar   *v;
384d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
385d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
38698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
38749b5e25fSSatish Balay 
38849b5e25fSSatish Balay   PetscFunctionBegin;
3899566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
390c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
3919566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3929566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
39349b5e25fSSatish Balay 
39449b5e25fSSatish Balay   v  = a->a;
39549b5e25fSSatish Balay   xb = x;
39649b5e25fSSatish Balay 
39749b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
39849b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3999371c9d4SSatish Balay     x1   = xb[0];
4009371c9d4SSatish Balay     x2   = xb[1];
4019371c9d4SSatish Balay     x3   = xb[2];
4029371c9d4SSatish Balay     x4   = xb[3];
40349b5e25fSSatish Balay     ib   = aj + *ai;
404831a3094SHong Zhang     jmin = 0;
40598c9bda7SSatish Balay     nonzerorow += (n > 0);
4067fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
40749b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
40849b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
40949b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
41049b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
4119371c9d4SSatish Balay       v += 16;
4129371c9d4SSatish Balay       jmin++;
4137fbae186SHong Zhang     }
414444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
415444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
416831a3094SHong Zhang     for (j = jmin; j < n; j++) {
41749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
41849b5e25fSSatish Balay       cval = ib[j] * 4;
41949b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
42049b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
42149b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
42249b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
42349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
42449b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
42549b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
42649b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
42749b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
42849b5e25fSSatish Balay       v += 16;
42949b5e25fSSatish Balay     }
4309371c9d4SSatish Balay     xb += 4;
4319371c9d4SSatish Balay     ai++;
43249b5e25fSSatish Balay   }
43349b5e25fSSatish Balay 
4349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
43749b5e25fSSatish Balay   PetscFunctionReturn(0);
43849b5e25fSSatish Balay }
43949b5e25fSSatish Balay 
440d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
441d71ae5a4SJacob Faibussowitsch {
44249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
443d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
444d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
445d9ca1df4SBarry Smith   const MatScalar   *v;
446d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
447d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
44898c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
44949b5e25fSSatish Balay 
45049b5e25fSSatish Balay   PetscFunctionBegin;
4519566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
452c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
4539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4549566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
45549b5e25fSSatish Balay 
45649b5e25fSSatish Balay   v  = a->a;
45749b5e25fSSatish Balay   xb = x;
45849b5e25fSSatish Balay 
45949b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
46049b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4619371c9d4SSatish Balay     x1   = xb[0];
4629371c9d4SSatish Balay     x2   = xb[1];
4639371c9d4SSatish Balay     x3   = xb[2];
4649371c9d4SSatish Balay     x4   = xb[3];
4659371c9d4SSatish Balay     x5   = xb[4];
46649b5e25fSSatish Balay     ib   = aj + *ai;
467831a3094SHong Zhang     jmin = 0;
46898c9bda7SSatish Balay     nonzerorow += (n > 0);
4697fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
47049b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
47149b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
47249b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
47349b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
47449b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
4759371c9d4SSatish Balay       v += 25;
4769371c9d4SSatish Balay       jmin++;
4777fbae186SHong Zhang     }
478444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
479444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
480831a3094SHong Zhang     for (j = jmin; j < n; j++) {
48149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
48249b5e25fSSatish Balay       cval = ib[j] * 5;
48349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
48449b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
48549b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
48649b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
48749b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
48849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
48949b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
49049b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
49149b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
49249b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
49349b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
49449b5e25fSSatish Balay       v += 25;
49549b5e25fSSatish Balay     }
4969371c9d4SSatish Balay     xb += 5;
4979371c9d4SSatish Balay     ai++;
49849b5e25fSSatish Balay   }
49949b5e25fSSatish Balay 
5009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
50349b5e25fSSatish Balay   PetscFunctionReturn(0);
50449b5e25fSSatish Balay }
50549b5e25fSSatish Balay 
506d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
507d71ae5a4SJacob Faibussowitsch {
50849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
509d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
510d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
511d9ca1df4SBarry Smith   const MatScalar   *v;
512d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
513d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
51498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
51549b5e25fSSatish Balay 
51649b5e25fSSatish Balay   PetscFunctionBegin;
5179566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
518c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5209566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
52149b5e25fSSatish Balay 
52249b5e25fSSatish Balay   v  = a->a;
52349b5e25fSSatish Balay   xb = x;
52449b5e25fSSatish Balay 
52549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
52649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5279371c9d4SSatish Balay     x1   = xb[0];
5289371c9d4SSatish Balay     x2   = xb[1];
5299371c9d4SSatish Balay     x3   = xb[2];
5309371c9d4SSatish Balay     x4   = xb[3];
5319371c9d4SSatish Balay     x5   = xb[4];
5329371c9d4SSatish Balay     x6   = xb[5];
53349b5e25fSSatish Balay     ib   = aj + *ai;
534831a3094SHong Zhang     jmin = 0;
53598c9bda7SSatish Balay     nonzerorow += (n > 0);
5367fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
53749b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
53849b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
53949b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
54049b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
54149b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
54249b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
5439371c9d4SSatish Balay       v += 36;
5449371c9d4SSatish Balay       jmin++;
5457fbae186SHong Zhang     }
546444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
547444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
548831a3094SHong Zhang     for (j = jmin; j < n; j++) {
54949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55049b5e25fSSatish Balay       cval = ib[j] * 6;
55149b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
55249b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
55349b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
55449b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
55549b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
55649b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
55749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
55849b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
55949b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
56049b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
56149b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
56249b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
56349b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
56449b5e25fSSatish Balay       v += 36;
56549b5e25fSSatish Balay     }
5669371c9d4SSatish Balay     xb += 6;
5679371c9d4SSatish Balay     ai++;
56849b5e25fSSatish Balay   }
56949b5e25fSSatish Balay 
5709566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5719566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5729566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
57349b5e25fSSatish Balay   PetscFunctionReturn(0);
57449b5e25fSSatish Balay }
575c2916339SPierre Jolivet 
576d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
577d71ae5a4SJacob Faibussowitsch {
57849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
579d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
580d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
581d9ca1df4SBarry Smith   const MatScalar   *v;
582d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
583d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
58498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
58549b5e25fSSatish Balay 
58649b5e25fSSatish Balay   PetscFunctionBegin;
5879566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
588c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5899566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5909566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
59149b5e25fSSatish Balay 
59249b5e25fSSatish Balay   v  = a->a;
59349b5e25fSSatish Balay   xb = x;
59449b5e25fSSatish Balay 
59549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
59649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5979371c9d4SSatish Balay     x1   = xb[0];
5989371c9d4SSatish Balay     x2   = xb[1];
5999371c9d4SSatish Balay     x3   = xb[2];
6009371c9d4SSatish Balay     x4   = xb[3];
6019371c9d4SSatish Balay     x5   = xb[4];
6029371c9d4SSatish Balay     x6   = xb[5];
6039371c9d4SSatish Balay     x7   = xb[6];
60449b5e25fSSatish Balay     ib   = aj + *ai;
605831a3094SHong Zhang     jmin = 0;
60698c9bda7SSatish Balay     nonzerorow += (n > 0);
6077fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
60849b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
60949b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
61049b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
61149b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
61249b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
61349b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
61449b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
6159371c9d4SSatish Balay       v += 49;
6169371c9d4SSatish Balay       jmin++;
6177fbae186SHong Zhang     }
618444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
619444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
620831a3094SHong Zhang     for (j = jmin; j < n; j++) {
62149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
62249b5e25fSSatish Balay       cval = ib[j] * 7;
62349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
62449b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
62549b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
62649b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
62749b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
62849b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
62949b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
63049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
63149b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
63249b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
63349b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
63449b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
63549b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
63649b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
63749b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
63849b5e25fSSatish Balay       v += 49;
63949b5e25fSSatish Balay     }
6409371c9d4SSatish Balay     xb += 7;
6419371c9d4SSatish Balay     ai++;
64249b5e25fSSatish Balay   }
6439566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6449566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6459566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
64649b5e25fSSatish Balay   PetscFunctionReturn(0);
64749b5e25fSSatish Balay }
64849b5e25fSSatish Balay 
64949b5e25fSSatish Balay /*
65049b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
65149b5e25fSSatish Balay */
652d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
653d71ae5a4SJacob Faibussowitsch {
65449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
655d9ca1df4SBarry Smith   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
656d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
657d9ca1df4SBarry Smith   const MatScalar   *v;
658d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
659d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
66098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
66149b5e25fSSatish Balay 
66249b5e25fSSatish Balay   PetscFunctionBegin;
6639566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
664c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
6659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6669566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
66759689ffbSStefano Zampini 
66859689ffbSStefano Zampini   x_ptr = x;
66959689ffbSStefano Zampini   z_ptr = z;
67049b5e25fSSatish Balay 
67149b5e25fSSatish Balay   aj = a->j;
67249b5e25fSSatish Balay   v  = a->a;
67349b5e25fSSatish Balay   ii = a->i;
67449b5e25fSSatish Balay 
67548a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
67649b5e25fSSatish Balay   work = a->mult_work;
67749b5e25fSSatish Balay 
67849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
6799371c9d4SSatish Balay     n     = ii[1] - ii[0];
6809371c9d4SSatish Balay     ncols = n * bs;
6819371c9d4SSatish Balay     workt = work;
6829371c9d4SSatish Balay     idx   = aj + ii[0];
68398c9bda7SSatish Balay     nonzerorow += (n > 0);
68449b5e25fSSatish Balay 
68549b5e25fSSatish Balay     /* upper triangular part */
68649b5e25fSSatish Balay     for (j = 0; j < n; j++) {
68749b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
68849b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
68949b5e25fSSatish Balay       workt += bs;
69049b5e25fSSatish Balay     }
69149b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
69296b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
69349b5e25fSSatish Balay 
69449b5e25fSSatish Balay     /* strict lower triangular part */
695831a3094SHong Zhang     idx = aj + ii[0];
69659689ffbSStefano Zampini     if (n && *idx == i) {
6979371c9d4SSatish Balay       ncols -= bs;
6989371c9d4SSatish Balay       v += bs2;
6999371c9d4SSatish Balay       idx++;
7009371c9d4SSatish Balay       n--;
701831a3094SHong Zhang     }
70296b9376eSHong Zhang 
70349b5e25fSSatish Balay     if (ncols > 0) {
70449b5e25fSSatish Balay       workt = work;
7059566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
70696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
707831a3094SHong Zhang       for (j = 0; j < n; j++) {
708831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
70949b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
71049b5e25fSSatish Balay         workt += bs;
71149b5e25fSSatish Balay       }
71249b5e25fSSatish Balay     }
7139371c9d4SSatish Balay     x += bs;
7149371c9d4SSatish Balay     v += n * bs2;
7159371c9d4SSatish Balay     z += bs;
7169371c9d4SSatish Balay     ii++;
71749b5e25fSSatish Balay   }
71849b5e25fSSatish Balay 
7199566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7209566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
7219566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
72249b5e25fSSatish Balay   PetscFunctionReturn(0);
72349b5e25fSSatish Balay }
72449b5e25fSSatish Balay 
725d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
726d71ae5a4SJacob Faibussowitsch {
72749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
728d9ca1df4SBarry Smith   PetscScalar       *z, x1;
729d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
730d9ca1df4SBarry Smith   const MatScalar   *v;
731d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
732d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
73398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
734eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
735b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
736eb1ec7c1SStefano Zampini #else
737eb1ec7c1SStefano Zampini   const int aconj = 0;
738eb1ec7c1SStefano Zampini #endif
73949b5e25fSSatish Balay 
74049b5e25fSSatish Balay   PetscFunctionBegin;
7419566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
742c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
7439566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7449566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
74549b5e25fSSatish Balay   v  = a->a;
74649b5e25fSSatish Balay   xb = x;
74749b5e25fSSatish Balay 
74849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
74949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th row of A */
75049b5e25fSSatish Balay     x1   = xb[0];
75149b5e25fSSatish Balay     ib   = aj + *ai;
752831a3094SHong Zhang     jmin = 0;
75398c9bda7SSatish Balay     nonzerorow += (n > 0);
7543d9ade75SStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
7559371c9d4SSatish Balay       z[i] += *v++ * x[*ib++];
7569371c9d4SSatish Balay       jmin++;
757831a3094SHong Zhang     }
758eb1ec7c1SStefano Zampini     if (aconj) {
759eb1ec7c1SStefano Zampini       for (j = jmin; j < n; j++) {
760eb1ec7c1SStefano Zampini         cval = *ib;
761eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
762eb1ec7c1SStefano Zampini         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
763eb1ec7c1SStefano Zampini       }
764eb1ec7c1SStefano Zampini     } else {
765831a3094SHong Zhang       for (j = jmin; j < n; j++) {
76649b5e25fSSatish Balay         cval = *ib;
76749b5e25fSSatish Balay         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
76849b5e25fSSatish Balay         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
76949b5e25fSSatish Balay       }
770eb1ec7c1SStefano Zampini     }
7719371c9d4SSatish Balay     xb++;
7729371c9d4SSatish Balay     ai++;
77349b5e25fSSatish Balay   }
77449b5e25fSSatish Balay 
7759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
77749b5e25fSSatish Balay 
7789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
77949b5e25fSSatish Balay   PetscFunctionReturn(0);
78049b5e25fSSatish Balay }
78149b5e25fSSatish Balay 
782d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
783d71ae5a4SJacob Faibussowitsch {
78449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
785d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2;
786d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
787d9ca1df4SBarry Smith   const MatScalar   *v;
788d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
789d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
79098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
79149b5e25fSSatish Balay 
79249b5e25fSSatish Balay   PetscFunctionBegin;
7939566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
794c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
7959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
79749b5e25fSSatish Balay 
79849b5e25fSSatish Balay   v  = a->a;
79949b5e25fSSatish Balay   xb = x;
80049b5e25fSSatish Balay 
80149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
80249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8039371c9d4SSatish Balay     x1   = xb[0];
8049371c9d4SSatish Balay     x2   = xb[1];
80549b5e25fSSatish Balay     ib   = aj + *ai;
806831a3094SHong Zhang     jmin = 0;
80798c9bda7SSatish Balay     nonzerorow += (n > 0);
80859689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
80949b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
81049b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
8119371c9d4SSatish Balay       v += 4;
8129371c9d4SSatish Balay       jmin++;
8137fbae186SHong Zhang     }
814444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
815444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
816831a3094SHong Zhang     for (j = jmin; j < n; j++) {
81749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
81849b5e25fSSatish Balay       cval = ib[j] * 2;
81949b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
82049b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
82149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
82249b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
82349b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
82449b5e25fSSatish Balay       v += 4;
82549b5e25fSSatish Balay     }
8269371c9d4SSatish Balay     xb += 2;
8279371c9d4SSatish Balay     ai++;
82849b5e25fSSatish Balay   }
8299566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8309566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
83149b5e25fSSatish Balay 
8329566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
83349b5e25fSSatish Balay   PetscFunctionReturn(0);
83449b5e25fSSatish Balay }
83549b5e25fSSatish Balay 
836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
837d71ae5a4SJacob Faibussowitsch {
83849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
839d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3;
840d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
841d9ca1df4SBarry Smith   const MatScalar   *v;
842d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
843d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
84498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
84549b5e25fSSatish Balay 
84649b5e25fSSatish Balay   PetscFunctionBegin;
8479566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
848c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
8499566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8509566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
85149b5e25fSSatish Balay 
85249b5e25fSSatish Balay   v  = a->a;
85349b5e25fSSatish Balay   xb = x;
85449b5e25fSSatish Balay 
85549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
85649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8579371c9d4SSatish Balay     x1   = xb[0];
8589371c9d4SSatish Balay     x2   = xb[1];
8599371c9d4SSatish Balay     x3   = xb[2];
86049b5e25fSSatish Balay     ib   = aj + *ai;
861831a3094SHong Zhang     jmin = 0;
86298c9bda7SSatish Balay     nonzerorow += (n > 0);
86359689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
86449b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
86549b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
86649b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
8679371c9d4SSatish Balay       v += 9;
8689371c9d4SSatish Balay       jmin++;
8697fbae186SHong Zhang     }
870444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
871444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
872831a3094SHong Zhang     for (j = jmin; j < n; j++) {
87349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
87449b5e25fSSatish Balay       cval = ib[j] * 3;
87549b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
87649b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
87749b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
87849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
87949b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
88049b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
88149b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
88249b5e25fSSatish Balay       v += 9;
88349b5e25fSSatish Balay     }
8849371c9d4SSatish Balay     xb += 3;
8859371c9d4SSatish Balay     ai++;
88649b5e25fSSatish Balay   }
88749b5e25fSSatish Balay 
8889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
89049b5e25fSSatish Balay 
8919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
89249b5e25fSSatish Balay   PetscFunctionReturn(0);
89349b5e25fSSatish Balay }
89449b5e25fSSatish Balay 
895d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
896d71ae5a4SJacob Faibussowitsch {
89749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
898d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4;
899d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
900d9ca1df4SBarry Smith   const MatScalar   *v;
901d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
902d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
90398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
90449b5e25fSSatish Balay 
90549b5e25fSSatish Balay   PetscFunctionBegin;
9069566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
907c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9089566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9099566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
91049b5e25fSSatish Balay 
91149b5e25fSSatish Balay   v  = a->a;
91249b5e25fSSatish Balay   xb = x;
91349b5e25fSSatish Balay 
91449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
91549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9169371c9d4SSatish Balay     x1   = xb[0];
9179371c9d4SSatish Balay     x2   = xb[1];
9189371c9d4SSatish Balay     x3   = xb[2];
9199371c9d4SSatish Balay     x4   = xb[3];
92049b5e25fSSatish Balay     ib   = aj + *ai;
921831a3094SHong Zhang     jmin = 0;
92298c9bda7SSatish Balay     nonzerorow += (n > 0);
92359689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
92449b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
92549b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
92649b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
92749b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
9289371c9d4SSatish Balay       v += 16;
9299371c9d4SSatish Balay       jmin++;
9307fbae186SHong Zhang     }
931444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
932444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
933831a3094SHong Zhang     for (j = jmin; j < n; j++) {
93449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
93549b5e25fSSatish Balay       cval = ib[j] * 4;
93649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
93749b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
93849b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
93949b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
94049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
94149b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
94249b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
94349b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
94449b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
94549b5e25fSSatish Balay       v += 16;
94649b5e25fSSatish Balay     }
9479371c9d4SSatish Balay     xb += 4;
9489371c9d4SSatish Balay     ai++;
94949b5e25fSSatish Balay   }
95049b5e25fSSatish Balay 
9519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
95349b5e25fSSatish Balay 
9549566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
95549b5e25fSSatish Balay   PetscFunctionReturn(0);
95649b5e25fSSatish Balay }
95749b5e25fSSatish Balay 
958d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
959d71ae5a4SJacob Faibussowitsch {
96049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
961d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5;
962d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
963d9ca1df4SBarry Smith   const MatScalar   *v;
964d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
965d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
96698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
96749b5e25fSSatish Balay 
96849b5e25fSSatish Balay   PetscFunctionBegin;
9699566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
970c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9729566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
97349b5e25fSSatish Balay 
97449b5e25fSSatish Balay   v  = a->a;
97549b5e25fSSatish Balay   xb = x;
97649b5e25fSSatish Balay 
97749b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
97849b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9799371c9d4SSatish Balay     x1   = xb[0];
9809371c9d4SSatish Balay     x2   = xb[1];
9819371c9d4SSatish Balay     x3   = xb[2];
9829371c9d4SSatish Balay     x4   = xb[3];
9839371c9d4SSatish Balay     x5   = xb[4];
98449b5e25fSSatish Balay     ib   = aj + *ai;
985831a3094SHong Zhang     jmin = 0;
98698c9bda7SSatish Balay     nonzerorow += (n > 0);
98759689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
98849b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
98949b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
99049b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
99149b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
99249b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
9939371c9d4SSatish Balay       v += 25;
9949371c9d4SSatish Balay       jmin++;
9957fbae186SHong Zhang     }
996444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
997444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
998831a3094SHong Zhang     for (j = jmin; j < n; j++) {
99949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
100049b5e25fSSatish Balay       cval = ib[j] * 5;
100149b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
100249b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
100349b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
100449b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
100549b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
100649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
100749b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
100849b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
100949b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
101049b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
101149b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
101249b5e25fSSatish Balay       v += 25;
101349b5e25fSSatish Balay     }
10149371c9d4SSatish Balay     xb += 5;
10159371c9d4SSatish Balay     ai++;
101649b5e25fSSatish Balay   }
101749b5e25fSSatish Balay 
10189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10199566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
102049b5e25fSSatish Balay 
10219566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
102249b5e25fSSatish Balay   PetscFunctionReturn(0);
102349b5e25fSSatish Balay }
1024c2916339SPierre Jolivet 
1025d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1026d71ae5a4SJacob Faibussowitsch {
102749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1028d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1029d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1030d9ca1df4SBarry Smith   const MatScalar   *v;
1031d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1032d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
103398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
103449b5e25fSSatish Balay 
103549b5e25fSSatish Balay   PetscFunctionBegin;
10369566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1037c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
10389566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10399566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
104049b5e25fSSatish Balay 
104149b5e25fSSatish Balay   v  = a->a;
104249b5e25fSSatish Balay   xb = x;
104349b5e25fSSatish Balay 
104449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
104549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10469371c9d4SSatish Balay     x1   = xb[0];
10479371c9d4SSatish Balay     x2   = xb[1];
10489371c9d4SSatish Balay     x3   = xb[2];
10499371c9d4SSatish Balay     x4   = xb[3];
10509371c9d4SSatish Balay     x5   = xb[4];
10519371c9d4SSatish Balay     x6   = xb[5];
105249b5e25fSSatish Balay     ib   = aj + *ai;
1053831a3094SHong Zhang     jmin = 0;
105498c9bda7SSatish Balay     nonzerorow += (n > 0);
105559689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
105649b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
105749b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
105849b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
105949b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
106049b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
106149b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
10629371c9d4SSatish Balay       v += 36;
10639371c9d4SSatish Balay       jmin++;
10647fbae186SHong Zhang     }
1065444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1066444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1067831a3094SHong Zhang     for (j = jmin; j < n; j++) {
106849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
106949b5e25fSSatish Balay       cval = ib[j] * 6;
107049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
107149b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
107249b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
107349b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
107449b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
107549b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
107649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
107749b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
107849b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
107949b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
108049b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
108149b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
108249b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
108349b5e25fSSatish Balay       v += 36;
108449b5e25fSSatish Balay     }
10859371c9d4SSatish Balay     xb += 6;
10869371c9d4SSatish Balay     ai++;
108749b5e25fSSatish Balay   }
108849b5e25fSSatish Balay 
10899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
109149b5e25fSSatish Balay 
10929566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
109349b5e25fSSatish Balay   PetscFunctionReturn(0);
109449b5e25fSSatish Balay }
109549b5e25fSSatish Balay 
1096d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1097d71ae5a4SJacob Faibussowitsch {
109849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1099d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1100d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1101d9ca1df4SBarry Smith   const MatScalar   *v;
1102d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1103d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
110498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
110549b5e25fSSatish Balay 
110649b5e25fSSatish Balay   PetscFunctionBegin;
11079566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1108c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
11099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11109566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
111149b5e25fSSatish Balay 
111249b5e25fSSatish Balay   v  = a->a;
111349b5e25fSSatish Balay   xb = x;
111449b5e25fSSatish Balay 
111549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
111649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
11179371c9d4SSatish Balay     x1   = xb[0];
11189371c9d4SSatish Balay     x2   = xb[1];
11199371c9d4SSatish Balay     x3   = xb[2];
11209371c9d4SSatish Balay     x4   = xb[3];
11219371c9d4SSatish Balay     x5   = xb[4];
11229371c9d4SSatish Balay     x6   = xb[5];
11239371c9d4SSatish Balay     x7   = xb[6];
112449b5e25fSSatish Balay     ib   = aj + *ai;
1125831a3094SHong Zhang     jmin = 0;
112698c9bda7SSatish Balay     nonzerorow += (n > 0);
112759689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
112849b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
112949b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
113049b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
113149b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
113249b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
113349b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
113449b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
11359371c9d4SSatish Balay       v += 49;
11369371c9d4SSatish Balay       jmin++;
11377fbae186SHong Zhang     }
1138444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1139444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1140831a3094SHong Zhang     for (j = jmin; j < n; j++) {
114149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
114249b5e25fSSatish Balay       cval = ib[j] * 7;
114349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
114449b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
114549b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
114649b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
114749b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
114849b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
114949b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
115049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
115149b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
115249b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
115349b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
115449b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
115549b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
115649b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
115749b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
115849b5e25fSSatish Balay       v += 49;
115949b5e25fSSatish Balay     }
11609371c9d4SSatish Balay     xb += 7;
11619371c9d4SSatish Balay     ai++;
116249b5e25fSSatish Balay   }
116349b5e25fSSatish Balay 
11649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11659566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
116649b5e25fSSatish Balay 
11679566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
116849b5e25fSSatish Balay   PetscFunctionReturn(0);
116949b5e25fSSatish Balay }
117049b5e25fSSatish Balay 
1171d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1172d71ae5a4SJacob Faibussowitsch {
117349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1174f4259b30SLisandro Dalcin   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1175d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
1176d9ca1df4SBarry Smith   const MatScalar   *v;
1177d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1178d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
117998c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
118049b5e25fSSatish Balay 
118149b5e25fSSatish Balay   PetscFunctionBegin;
11829566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
1183c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
11849371c9d4SSatish Balay   PetscCall(VecGetArrayRead(xx, &x));
11859371c9d4SSatish Balay   x_ptr = x;
11869371c9d4SSatish Balay   PetscCall(VecGetArray(zz, &z));
11879371c9d4SSatish Balay   z_ptr = z;
118849b5e25fSSatish Balay 
118949b5e25fSSatish Balay   aj = a->j;
119049b5e25fSSatish Balay   v  = a->a;
119149b5e25fSSatish Balay   ii = a->i;
119249b5e25fSSatish Balay 
119348a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
119449b5e25fSSatish Balay   work = a->mult_work;
119549b5e25fSSatish Balay 
119649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
11979371c9d4SSatish Balay     n     = ii[1] - ii[0];
11989371c9d4SSatish Balay     ncols = n * bs;
11999371c9d4SSatish Balay     workt = work;
12009371c9d4SSatish Balay     idx   = aj + ii[0];
120198c9bda7SSatish Balay     nonzerorow += (n > 0);
120249b5e25fSSatish Balay 
120349b5e25fSSatish Balay     /* upper triangular part */
120449b5e25fSSatish Balay     for (j = 0; j < n; j++) {
120549b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
120649b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
120749b5e25fSSatish Balay       workt += bs;
120849b5e25fSSatish Balay     }
120949b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
121096b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
121149b5e25fSSatish Balay 
121249b5e25fSSatish Balay     /* strict lower triangular part */
1213831a3094SHong Zhang     idx = aj + ii[0];
121459689ffbSStefano Zampini     if (n && *idx == i) {
12159371c9d4SSatish Balay       ncols -= bs;
12169371c9d4SSatish Balay       v += bs2;
12179371c9d4SSatish Balay       idx++;
12189371c9d4SSatish Balay       n--;
1219831a3094SHong Zhang     }
122049b5e25fSSatish Balay     if (ncols > 0) {
122149b5e25fSSatish Balay       workt = work;
12229566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
122396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1224831a3094SHong Zhang       for (j = 0; j < n; j++) {
1225831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
122649b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
122749b5e25fSSatish Balay         workt += bs;
122849b5e25fSSatish Balay       }
122949b5e25fSSatish Balay     }
123049b5e25fSSatish Balay 
12319371c9d4SSatish Balay     x += bs;
12329371c9d4SSatish Balay     v += n * bs2;
12339371c9d4SSatish Balay     z += bs;
12349371c9d4SSatish Balay     ii++;
123549b5e25fSSatish Balay   }
123649b5e25fSSatish Balay 
12379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12389566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
123949b5e25fSSatish Balay 
12409566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
124149b5e25fSSatish Balay   PetscFunctionReturn(0);
124249b5e25fSSatish Balay }
124349b5e25fSSatish Balay 
1244d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1245d71ae5a4SJacob Faibussowitsch {
124649b5e25fSSatish Balay   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1247f4df32b1SMatthew Knepley   PetscScalar   oalpha = alpha;
1248c5df96a5SBarry Smith   PetscBLASInt  one    = 1, totalnz;
124949b5e25fSSatish Balay 
125049b5e25fSSatish Balay   PetscFunctionBegin;
12519566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1252792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
12539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
125449b5e25fSSatish Balay   PetscFunctionReturn(0);
125549b5e25fSSatish Balay }
125649b5e25fSSatish Balay 
1257d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1258d71ae5a4SJacob Faibussowitsch {
125949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1260d9ca1df4SBarry Smith   const MatScalar *v        = a->a;
126149b5e25fSSatish Balay   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1262d9ca1df4SBarry Smith   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1263d9ca1df4SBarry Smith   const PetscInt  *aj = a->j, *col;
126449b5e25fSSatish Balay 
126549b5e25fSSatish Balay   PetscFunctionBegin;
1266c40ae873SPierre Jolivet   if (!a->nz) {
1267c40ae873SPierre Jolivet     *norm = 0.0;
1268c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1269c40ae873SPierre Jolivet   }
127049b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
127149b5e25fSSatish Balay     for (k = 0; k < mbs; k++) {
127259689ffbSStefano Zampini       jmin = a->i[k];
127359689ffbSStefano Zampini       jmax = a->i[k + 1];
1274831a3094SHong Zhang       col  = aj + jmin;
127559689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
127649b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12779371c9d4SSatish Balay           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
12789371c9d4SSatish Balay           v++;
127949b5e25fSSatish Balay         }
1280831a3094SHong Zhang         jmin++;
1281831a3094SHong Zhang       }
1282831a3094SHong Zhang       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
128349b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12849371c9d4SSatish Balay           sum_off += PetscRealPart(PetscConj(*v) * (*v));
12859371c9d4SSatish Balay           v++;
128649b5e25fSSatish Balay         }
128749b5e25fSSatish Balay       }
128849b5e25fSSatish Balay     }
12898f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
12909566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
12910b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
12929566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
12930b8dc8d2SHong Zhang     for (i = 0; i < mbs; i++) jl[i] = mbs;
12940b8dc8d2SHong Zhang     il[0] = 0;
129549b5e25fSSatish Balay 
129649b5e25fSSatish Balay     *norm = 0.0;
129749b5e25fSSatish Balay     for (k = 0; k < mbs; k++) { /* k_th block row */
129849b5e25fSSatish Balay       for (j = 0; j < bs; j++) sum[j] = 0.0;
129949b5e25fSSatish Balay       /*-- col sum --*/
130049b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1301831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1302831a3094SHong Zhang                   at step k */
130349b5e25fSSatish Balay       while (i < mbs) {
130449b5e25fSSatish Balay         nexti = jl[i]; /* next block row to be added */
130549b5e25fSSatish Balay         ik    = il[i]; /* block index of A(i,k) in the array a */
130649b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
130749b5e25fSSatish Balay           v = a->a + ik * bs2 + j * bs;
130849b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13099371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13109371c9d4SSatish Balay             v++;
131149b5e25fSSatish Balay           }
131249b5e25fSSatish Balay         }
131349b5e25fSSatish Balay         /* update il, jl */
1314831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1315831a3094SHong Zhang         jmax = a->i[i + 1];
131649b5e25fSSatish Balay         if (jmin < jmax) {
131749b5e25fSSatish Balay           il[i] = jmin;
131849b5e25fSSatish Balay           j     = a->j[jmin];
13199371c9d4SSatish Balay           jl[i] = jl[j];
13209371c9d4SSatish Balay           jl[j] = i;
132149b5e25fSSatish Balay         }
132249b5e25fSSatish Balay         i = nexti;
132349b5e25fSSatish Balay       }
132449b5e25fSSatish Balay       /*-- row sum --*/
132559689ffbSStefano Zampini       jmin = a->i[k];
132659689ffbSStefano Zampini       jmax = a->i[k + 1];
132749b5e25fSSatish Balay       for (i = jmin; i < jmax; i++) {
132849b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
132949b5e25fSSatish Balay           v = a->a + i * bs2 + j;
133049b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13319371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13329371c9d4SSatish Balay             v += bs;
133349b5e25fSSatish Balay           }
133449b5e25fSSatish Balay         }
133549b5e25fSSatish Balay       }
133649b5e25fSSatish Balay       /* add k_th block row to il, jl */
1337831a3094SHong Zhang       col = aj + jmin;
133859689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
133949b5e25fSSatish Balay       if (jmin < jmax) {
134049b5e25fSSatish Balay         il[k] = jmin;
13419371c9d4SSatish Balay         j     = a->j[jmin];
13429371c9d4SSatish Balay         jl[k] = jl[j];
13439371c9d4SSatish Balay         jl[j] = k;
134449b5e25fSSatish Balay       }
134549b5e25fSSatish Balay       for (j = 0; j < bs; j++) {
134649b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
134749b5e25fSSatish Balay       }
134849b5e25fSSatish Balay     }
13499566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum, il, jl));
13509566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1351f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
135249b5e25fSSatish Balay   PetscFunctionReturn(0);
135349b5e25fSSatish Balay }
135449b5e25fSSatish Balay 
1355d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1356d71ae5a4SJacob Faibussowitsch {
135749b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
135849b5e25fSSatish Balay 
135949b5e25fSSatish Balay   PetscFunctionBegin;
136049b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1361d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1362ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1363ef511fbeSHong Zhang     PetscFunctionReturn(0);
136449b5e25fSSatish Balay   }
136549b5e25fSSatish Balay 
136649b5e25fSSatish Balay   /* if the a->i are the same */
13679566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
136826fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
136949b5e25fSSatish Balay 
137049b5e25fSSatish Balay   /* if a->j are the same */
13719566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
137226fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
137326fbe8dcSKarl Rupp 
137449b5e25fSSatish Balay   /* if a->a are the same */
13759566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1376935af2e7SHong Zhang   PetscFunctionReturn(0);
137749b5e25fSSatish Balay }
137849b5e25fSSatish Balay 
1379d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1380d71ae5a4SJacob Faibussowitsch {
138149b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1382d9ca1df4SBarry Smith   PetscInt         i, j, k, row, bs, ambs, bs2;
1383d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
138487828ca2SBarry Smith   PetscScalar     *x, zero = 0.0;
1385d9ca1df4SBarry Smith   const MatScalar *aa, *aa_j;
138649b5e25fSSatish Balay 
138749b5e25fSSatish Balay   PetscFunctionBegin;
1388d0f46423SBarry Smith   bs = A->rmap->bs;
1389aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
139082799104SHong Zhang 
139149b5e25fSSatish Balay   aa   = a->a;
13928a0c6efdSHong Zhang   ambs = a->mbs;
13938a0c6efdSHong Zhang 
13948a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13958a0c6efdSHong Zhang     PetscInt *diag = a->diag;
13968a0c6efdSHong Zhang     aa             = a->a;
13978a0c6efdSHong Zhang     ambs           = a->mbs;
13989566063dSJacob Faibussowitsch     PetscCall(VecGetArray(v, &x));
13998a0c6efdSHong Zhang     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
14009566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(v, &x));
14018a0c6efdSHong Zhang     PetscFunctionReturn(0);
14028a0c6efdSHong Zhang   }
14038a0c6efdSHong Zhang 
140449b5e25fSSatish Balay   ai  = a->i;
140549b5e25fSSatish Balay   aj  = a->j;
140649b5e25fSSatish Balay   bs2 = a->bs2;
14079566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
1408c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
14099566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
141049b5e25fSSatish Balay   for (i = 0; i < ambs; i++) {
141149b5e25fSSatish Balay     j = ai[i];
141249b5e25fSSatish Balay     if (aj[j] == i) { /* if this is a diagonal element */
141349b5e25fSSatish Balay       row  = i * bs;
141449b5e25fSSatish Balay       aa_j = aa + j * bs2;
141549b5e25fSSatish Balay       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
141649b5e25fSSatish Balay     }
141749b5e25fSSatish Balay   }
14189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
141949b5e25fSSatish Balay   PetscFunctionReturn(0);
142049b5e25fSSatish Balay }
142149b5e25fSSatish Balay 
1422d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1423d71ae5a4SJacob Faibussowitsch {
142449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1425d9ca1df4SBarry Smith   PetscScalar        x;
1426d9ca1df4SBarry Smith   const PetscScalar *l, *li, *ri;
142749b5e25fSSatish Balay   MatScalar         *aa, *v;
1428fff8e43fSBarry Smith   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1429fff8e43fSBarry Smith   const PetscInt    *ai, *aj;
1430ace3abfcSBarry Smith   PetscBool          flg;
143149b5e25fSSatish Balay 
143249b5e25fSSatish Balay   PetscFunctionBegin;
1433b3bf805bSHong Zhang   if (ll != rr) {
14349566063dSJacob Faibussowitsch     PetscCall(VecEqual(ll, rr, &flg));
143528b400f6SJacob Faibussowitsch     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1436b3bf805bSHong Zhang   }
1437b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
143849b5e25fSSatish Balay   ai  = a->i;
143949b5e25fSSatish Balay   aj  = a->j;
144049b5e25fSSatish Balay   aa  = a->a;
1441d0f46423SBarry Smith   m   = A->rmap->N;
1442d0f46423SBarry Smith   bs  = A->rmap->bs;
144349b5e25fSSatish Balay   mbs = a->mbs;
144449b5e25fSSatish Balay   bs2 = a->bs2;
144549b5e25fSSatish Balay 
14469566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll, &l));
14479566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll, &lm));
144808401ef6SPierre Jolivet   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
144949b5e25fSSatish Balay   for (i = 0; i < mbs; i++) { /* for each block row */
145049b5e25fSSatish Balay     M  = ai[i + 1] - ai[i];
145149b5e25fSSatish Balay     li = l + i * bs;
145249b5e25fSSatish Balay     v  = aa + bs2 * ai[i];
145349b5e25fSSatish Balay     for (j = 0; j < M; j++) { /* for each block */
145449b5e25fSSatish Balay       ri = l + bs * aj[ai[i] + j];
14555e90f9d9SHong Zhang       for (k = 0; k < bs; k++) {
145649b5e25fSSatish Balay         x = ri[k];
145749b5e25fSSatish Balay         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
145849b5e25fSSatish Balay       }
145949b5e25fSSatish Balay     }
146049b5e25fSSatish Balay   }
14619566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll, &l));
14629566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
146349b5e25fSSatish Balay   PetscFunctionReturn(0);
146449b5e25fSSatish Balay }
146549b5e25fSSatish Balay 
1466d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1467d71ae5a4SJacob Faibussowitsch {
146849b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
146949b5e25fSSatish Balay 
147049b5e25fSSatish Balay   PetscFunctionBegin;
147149b5e25fSSatish Balay   info->block_size   = a->bs2;
1472ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
14736c6c5352SBarry Smith   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
14743966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
147549b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14768e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14774dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
1478d5f3da31SBarry Smith   if (A->factortype) {
147949b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
148049b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
148149b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
148249b5e25fSSatish Balay   } else {
148349b5e25fSSatish Balay     info->fill_ratio_given  = 0;
148449b5e25fSSatish Balay     info->fill_ratio_needed = 0;
148549b5e25fSSatish Balay     info->factor_mallocs    = 0;
148649b5e25fSSatish Balay   }
148749b5e25fSSatish Balay   PetscFunctionReturn(0);
148849b5e25fSSatish Balay }
148949b5e25fSSatish Balay 
1490d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1491d71ae5a4SJacob Faibussowitsch {
149249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
149349b5e25fSSatish Balay 
149449b5e25fSSatish Balay   PetscFunctionBegin;
14959566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
149649b5e25fSSatish Balay   PetscFunctionReturn(0);
149749b5e25fSSatish Balay }
1498dc354874SHong Zhang 
1499d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1500d71ae5a4SJacob Faibussowitsch {
1501dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1502d9ca1df4SBarry Smith   PetscInt         i, j, n, row, col, bs, mbs;
1503d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
1504c3fca9a7SHong Zhang   PetscReal        atmp;
1505d9ca1df4SBarry Smith   const MatScalar *aa;
1506985db425SBarry Smith   PetscScalar     *x;
150713f74950SBarry Smith   PetscInt         ncols, brow, bcol, krow, kcol;
1508dc354874SHong Zhang 
1509dc354874SHong Zhang   PetscFunctionBegin;
151028b400f6SJacob Faibussowitsch   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
151128b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1512d0f46423SBarry Smith   bs  = A->rmap->bs;
1513dc354874SHong Zhang   aa  = a->a;
1514dc354874SHong Zhang   ai  = a->i;
1515dc354874SHong Zhang   aj  = a->j;
151644117c81SHong Zhang   mbs = a->mbs;
1517dc354874SHong Zhang 
15189566063dSJacob Faibussowitsch   PetscCall(VecSet(v, 0.0));
15199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
15209566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
152108401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
152244117c81SHong Zhang   for (i = 0; i < mbs; i++) {
15239371c9d4SSatish Balay     ncols = ai[1] - ai[0];
15249371c9d4SSatish Balay     ai++;
1525d0f6400bSHong Zhang     brow = bs * i;
152644117c81SHong Zhang     for (j = 0; j < ncols; j++) {
1527d0f6400bSHong Zhang       bcol = bs * (*aj);
152844117c81SHong Zhang       for (kcol = 0; kcol < bs; kcol++) {
1529d0f6400bSHong Zhang         col = bcol + kcol; /* col index */
153044117c81SHong Zhang         for (krow = 0; krow < bs; krow++) {
15319371c9d4SSatish Balay           atmp = PetscAbsScalar(*aa);
15329371c9d4SSatish Balay           aa++;
1533d0f6400bSHong Zhang           row = brow + krow; /* row index */
1534c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1535c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
153644117c81SHong Zhang         }
153744117c81SHong Zhang       }
1538d0f6400bSHong Zhang       aj++;
1539dc354874SHong Zhang     }
1540dc354874SHong Zhang   }
15419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
1542dc354874SHong Zhang   PetscFunctionReturn(0);
1543dc354874SHong Zhang }
1544c2916339SPierre Jolivet 
1545d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1546d71ae5a4SJacob Faibussowitsch {
1547c2916339SPierre Jolivet   PetscFunctionBegin;
15489566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15494222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1550c2916339SPierre Jolivet   PetscFunctionReturn(0);
1551c2916339SPierre Jolivet }
1552c2916339SPierre Jolivet 
1553d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1554d71ae5a4SJacob Faibussowitsch {
1555c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1556c2916339SPierre Jolivet   PetscScalar       *z = c;
1557c2916339SPierre Jolivet   const PetscScalar *xb;
1558c2916339SPierre Jolivet   PetscScalar        x1;
1559c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1560c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
15613c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
1562b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
15633c2e41e1SStefano Zampini #else
15643c2e41e1SStefano Zampini   const int aconj = 0;
15653c2e41e1SStefano Zampini #endif
1566c2916339SPierre Jolivet 
1567c2916339SPierre Jolivet   PetscFunctionBegin;
1568c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15699371c9d4SSatish Balay     n = ii[1] - ii[0];
15709371c9d4SSatish Balay     ii++;
1571c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1572c2916339SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1573c2916339SPierre Jolivet     jj = idx;
1574c2916339SPierre Jolivet     vv = v;
1575c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1576c2916339SPierre Jolivet       idx = jj;
1577c2916339SPierre Jolivet       v   = vv;
1578c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
15799371c9d4SSatish Balay         xb = b + (*idx);
15809371c9d4SSatish Balay         x1 = xb[0 + k * bm];
1581c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1;
15823c2e41e1SStefano Zampini         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1583c2916339SPierre Jolivet         v += 1;
1584c2916339SPierre Jolivet         ++idx;
1585c2916339SPierre Jolivet       }
1586c2916339SPierre Jolivet     }
1587c2916339SPierre Jolivet     z += 1;
1588c2916339SPierre Jolivet   }
1589c2916339SPierre Jolivet   PetscFunctionReturn(0);
1590c2916339SPierre Jolivet }
1591c2916339SPierre Jolivet 
1592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1593d71ae5a4SJacob Faibussowitsch {
1594c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1595c2916339SPierre Jolivet   PetscScalar       *z = c;
1596c2916339SPierre Jolivet   const PetscScalar *xb;
1597c2916339SPierre Jolivet   PetscScalar        x1, x2;
1598c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1599c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1600c2916339SPierre Jolivet 
1601c2916339SPierre Jolivet   PetscFunctionBegin;
1602c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16039371c9d4SSatish Balay     n = ii[1] - ii[0];
16049371c9d4SSatish Balay     ii++;
1605c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1606c2916339SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1607c2916339SPierre Jolivet     jj = idx;
1608c2916339SPierre Jolivet     vv = v;
1609c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1610c2916339SPierre Jolivet       idx = jj;
1611c2916339SPierre Jolivet       v   = vv;
1612c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16139371c9d4SSatish Balay         xb = b + 2 * (*idx);
16149371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16159371c9d4SSatish Balay         x2 = xb[1 + k * bm];
1616c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1617c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1618c2916339SPierre Jolivet         if (*idx != i) {
1619c2916339SPierre Jolivet           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1620c2916339SPierre Jolivet           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1621c2916339SPierre Jolivet         }
1622c2916339SPierre Jolivet         v += 4;
1623c2916339SPierre Jolivet         ++idx;
1624c2916339SPierre Jolivet       }
1625c2916339SPierre Jolivet     }
1626c2916339SPierre Jolivet     z += 2;
1627c2916339SPierre Jolivet   }
1628c2916339SPierre Jolivet   PetscFunctionReturn(0);
1629c2916339SPierre Jolivet }
1630c2916339SPierre Jolivet 
1631d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1632d71ae5a4SJacob Faibussowitsch {
1633c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1634c2916339SPierre Jolivet   PetscScalar       *z = c;
1635c2916339SPierre Jolivet   const PetscScalar *xb;
1636c2916339SPierre Jolivet   PetscScalar        x1, x2, x3;
1637c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1638c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1639c2916339SPierre Jolivet 
1640c2916339SPierre Jolivet   PetscFunctionBegin;
1641c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16429371c9d4SSatish Balay     n = ii[1] - ii[0];
16439371c9d4SSatish Balay     ii++;
1644c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1645c2916339SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1646c2916339SPierre Jolivet     jj = idx;
1647c2916339SPierre Jolivet     vv = v;
1648c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1649c2916339SPierre Jolivet       idx = jj;
1650c2916339SPierre Jolivet       v   = vv;
1651c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16529371c9d4SSatish Balay         xb = b + 3 * (*idx);
16539371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16549371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16559371c9d4SSatish Balay         x3 = xb[2 + k * bm];
1656c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1657c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1658c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1659c2916339SPierre Jolivet         if (*idx != i) {
1660c2916339SPierre Jolivet           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1661c2916339SPierre Jolivet           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1662c2916339SPierre Jolivet           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1663c2916339SPierre Jolivet         }
1664c2916339SPierre Jolivet         v += 9;
1665c2916339SPierre Jolivet         ++idx;
1666c2916339SPierre Jolivet       }
1667c2916339SPierre Jolivet     }
1668c2916339SPierre Jolivet     z += 3;
1669c2916339SPierre Jolivet   }
1670c2916339SPierre Jolivet   PetscFunctionReturn(0);
1671c2916339SPierre Jolivet }
1672c2916339SPierre Jolivet 
1673d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1674d71ae5a4SJacob Faibussowitsch {
1675c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1676c2916339SPierre Jolivet   PetscScalar       *z = c;
1677c2916339SPierre Jolivet   const PetscScalar *xb;
1678c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
1679c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1680c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1681c2916339SPierre Jolivet 
1682c2916339SPierre Jolivet   PetscFunctionBegin;
1683c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16849371c9d4SSatish Balay     n = ii[1] - ii[0];
16859371c9d4SSatish Balay     ii++;
1686c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1687c2916339SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1688c2916339SPierre Jolivet     jj = idx;
1689c2916339SPierre Jolivet     vv = v;
1690c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1691c2916339SPierre Jolivet       idx = jj;
1692c2916339SPierre Jolivet       v   = vv;
1693c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16949371c9d4SSatish Balay         xb = b + 4 * (*idx);
16959371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16969371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16979371c9d4SSatish Balay         x3 = xb[2 + k * bm];
16989371c9d4SSatish Balay         x4 = xb[3 + k * bm];
1699c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1700c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1701c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1702c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1703c2916339SPierre Jolivet         if (*idx != i) {
1704c2916339SPierre Jolivet           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1705c2916339SPierre Jolivet           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1706c2916339SPierre Jolivet           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1707c2916339SPierre Jolivet           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1708c2916339SPierre Jolivet         }
1709c2916339SPierre Jolivet         v += 16;
1710c2916339SPierre Jolivet         ++idx;
1711c2916339SPierre Jolivet       }
1712c2916339SPierre Jolivet     }
1713c2916339SPierre Jolivet     z += 4;
1714c2916339SPierre Jolivet   }
1715c2916339SPierre Jolivet   PetscFunctionReturn(0);
1716c2916339SPierre Jolivet }
1717c2916339SPierre Jolivet 
1718d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1719d71ae5a4SJacob Faibussowitsch {
1720c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1721c2916339SPierre Jolivet   PetscScalar       *z = c;
1722c2916339SPierre Jolivet   const PetscScalar *xb;
1723c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
1724c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1725c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1726c2916339SPierre Jolivet 
1727c2916339SPierre Jolivet   PetscFunctionBegin;
1728c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
17299371c9d4SSatish Balay     n = ii[1] - ii[0];
17309371c9d4SSatish Balay     ii++;
1731c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1732c2916339SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1733c2916339SPierre Jolivet     jj = idx;
1734c2916339SPierre Jolivet     vv = v;
1735c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1736c2916339SPierre Jolivet       idx = jj;
1737c2916339SPierre Jolivet       v   = vv;
1738c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17399371c9d4SSatish Balay         xb = b + 5 * (*idx);
17409371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17419371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17429371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17439371c9d4SSatish Balay         x4 = xb[3 + k * bm];
17449371c9d4SSatish Balay         x5 = xb[4 + k * cm];
1745c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1746c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1747c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1748c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1749c2916339SPierre Jolivet         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1750c2916339SPierre Jolivet         if (*idx != i) {
1751c2916339SPierre Jolivet           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1752c2916339SPierre Jolivet           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1753c2916339SPierre Jolivet           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1754c2916339SPierre Jolivet           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1755c2916339SPierre Jolivet           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1756c2916339SPierre Jolivet         }
1757c2916339SPierre Jolivet         v += 25;
1758c2916339SPierre Jolivet         ++idx;
1759c2916339SPierre Jolivet       }
1760c2916339SPierre Jolivet     }
1761c2916339SPierre Jolivet     z += 5;
1762c2916339SPierre Jolivet   }
1763c2916339SPierre Jolivet   PetscFunctionReturn(0);
1764c2916339SPierre Jolivet }
1765c2916339SPierre Jolivet 
1766d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1767d71ae5a4SJacob Faibussowitsch {
1768c2916339SPierre Jolivet   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1769c2916339SPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1770281439baSStefano Zampini   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1771c2916339SPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1772c2916339SPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1773c2916339SPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
1774f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
1775c2916339SPierre Jolivet   PetscScalar     *c, *b;
1776c2916339SPierre Jolivet   const MatScalar *v;
1777c2916339SPierre Jolivet   const PetscInt  *idx, *ii;
1778c2916339SPierre Jolivet   PetscScalar      _DOne = 1.0;
1779c2916339SPierre Jolivet 
1780c2916339SPierre Jolivet   PetscFunctionBegin;
1781c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
178208401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
178308401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
178408401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1785c2916339SPierre Jolivet   b = bd->v;
17869566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
17879566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
1788c2916339SPierre Jolivet   switch (bs) {
1789d71ae5a4SJacob Faibussowitsch   case 1:
1790d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1791d71ae5a4SJacob Faibussowitsch     break;
1792d71ae5a4SJacob Faibussowitsch   case 2:
1793d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1794d71ae5a4SJacob Faibussowitsch     break;
1795d71ae5a4SJacob Faibussowitsch   case 3:
1796d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1797d71ae5a4SJacob Faibussowitsch     break;
1798d71ae5a4SJacob Faibussowitsch   case 4:
1799d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1800d71ae5a4SJacob Faibussowitsch     break;
1801d71ae5a4SJacob Faibussowitsch   case 5:
1802d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1803d71ae5a4SJacob Faibussowitsch     break;
1804c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
18059566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
18069566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
18079566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
18089566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
1809c2916339SPierre Jolivet     idx = a->j;
1810c2916339SPierre Jolivet     v   = a->a;
1811c2916339SPierre Jolivet     mbs = a->mbs;
1812c2916339SPierre Jolivet     ii  = a->i;
1813c2916339SPierre Jolivet     z   = c;
1814c2916339SPierre Jolivet     for (i = 0; i < mbs; i++) {
18159371c9d4SSatish Balay       n = ii[1] - ii[0];
18169371c9d4SSatish Balay       ii++;
1817c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
1818792fecdfSBarry Smith         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1819792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1820c2916339SPierre Jolivet         v += bs2;
1821c2916339SPierre Jolivet       }
1822c2916339SPierre Jolivet       z += bs;
1823c2916339SPierre Jolivet     }
1824c2916339SPierre Jolivet   }
18259566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
18269566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1827c2916339SPierre Jolivet   PetscFunctionReturn(0);
1828c2916339SPierre Jolivet }
1829