xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision fdfbdca60871d1230bda7d5d096e2bd2d2a23f7a)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10d71ae5a4SJacob Faibussowitsch {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
127bede89fSBarry Smith   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT         table_out, table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
1708401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18d94109b8SHong Zhang   mbs = a->mbs;
19d94109b8SHong Zhang   ai  = a->i;
20d94109b8SHong Zhang   aj  = a->j;
21d0f46423SBarry Smith   bs  = A->rmap->bs;
229566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_out));
239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs + 1, &nidx));
249566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_in));
25d94109b8SHong Zhang 
26d94109b8SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
27d94109b8SHong Zhang     isz = 0;
289566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs, table_out));
29d94109b8SHong Zhang 
30d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
319566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
329566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
33d94109b8SHong Zhang 
34db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35dbe03f88SHong Zhang     bcol_max = 0;
36d94109b8SHong Zhang     for (j = 0; j < n; ++j) {
37d94109b8SHong Zhang       brow = idx[j] / bs; /* convert the indices into block indices */
3808401ef6SPierre Jolivet       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out, brow)) {
40dbe03f88SHong Zhang         nidx[isz++] = brow;
41dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
42dbe03f88SHong Zhang       }
43d94109b8SHong Zhang     }
449566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
459566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
46d94109b8SHong Zhang 
47d94109b8SHong Zhang     k = 0;
48d94109b8SHong Zhang     for (j = 0; j < ov; j++) { /* for each overlap */
49db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
509566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs, table_in));
519566063dSJacob Faibussowitsch       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52d94109b8SHong Zhang 
53d94109b8SHong Zhang       n = isz; /* length of the updated is[i] */
54d94109b8SHong Zhang       for (brow = 0; brow < mbs; brow++) {
559371c9d4SSatish Balay         start = ai[brow];
569371c9d4SSatish Balay         end   = ai[brow + 1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l < end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out, bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67da81f932SPierre Jolivet         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l < end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in, bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
797bede89fSBarry Smith     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
807bede89fSBarry Smith   } /* for each is */
819566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
829566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
839566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8549b5e25fSSatish Balay }
8649b5e25fSSatish Balay 
87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
887cf5f706SPierre Jolivet         Zero some ops' to avoid invalid use */
89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90d71ae5a4SJacob Faibussowitsch {
9149b5e25fSSatish Balay   PetscFunctionBegin;
929566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
94f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
95f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
96f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
97f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
98f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
99f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
100f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
101f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
1023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10349b5e25fSSatish Balay }
10449b5e25fSSatish Balay 
1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106*fdfbdca6SPierre Jolivet static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
107d71ae5a4SJacob Faibussowitsch {
108*fdfbdca6SPierre Jolivet   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
109*fdfbdca6SPierre Jolivet   Mat_SeqBAIJ    *d = NULL;
110e50a8114SHong Zhang   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
111e50a8114SHong Zhang   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
112e50a8114SHong Zhang   const PetscInt *irow, *icol;
113e50a8114SHong Zhang   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
114e50a8114SHong Zhang   PetscInt       *aj = a->j, *ai = a->i;
115e50a8114SHong Zhang   MatScalar      *mat_a;
116e50a8114SHong Zhang   Mat             C;
117e50a8114SHong Zhang   PetscBool       flag;
118e50a8114SHong Zhang 
119e50a8114SHong Zhang   PetscFunctionBegin;
120e50a8114SHong Zhang 
1219566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1229566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1239566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1249566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
125e50a8114SHong Zhang 
1269566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
127e50a8114SHong Zhang   ssmap = smap;
1289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
129e50a8114SHong Zhang   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
130e50a8114SHong Zhang   /* determine lens of each row */
131e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
132e50a8114SHong Zhang     kstart  = ai[irow[i]];
133e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
134e50a8114SHong Zhang     lens[i] = 0;
135e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
136e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
137e50a8114SHong Zhang     }
138e50a8114SHong Zhang   }
139e50a8114SHong Zhang   /* Create and fill new matrix */
140e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
141*fdfbdca6SPierre Jolivet     if (sym) {
142e50a8114SHong Zhang       c = (Mat_SeqSBAIJ *)((*B)->data);
143e50a8114SHong Zhang 
144aed4548fSBarry Smith       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1459566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
146*fdfbdca6SPierre Jolivet       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
1479566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(c->ilen, c->mbs));
148*fdfbdca6SPierre Jolivet     } else {
149*fdfbdca6SPierre Jolivet       d = (Mat_SeqBAIJ *)((*B)->data);
150*fdfbdca6SPierre Jolivet 
151*fdfbdca6SPierre Jolivet       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
152*fdfbdca6SPierre Jolivet       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
153*fdfbdca6SPierre Jolivet       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
154*fdfbdca6SPierre Jolivet       PetscCall(PetscArrayzero(d->ilen, d->mbs));
155*fdfbdca6SPierre Jolivet     }
156e50a8114SHong Zhang     C = *B;
157e50a8114SHong Zhang   } else {
1589566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1599566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
160*fdfbdca6SPierre Jolivet     if (sym) {
1619566063dSJacob Faibussowitsch       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1629566063dSJacob Faibussowitsch       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
163*fdfbdca6SPierre Jolivet     } else {
164*fdfbdca6SPierre Jolivet       PetscCall(MatSetType(C, MATSEQBAIJ));
165*fdfbdca6SPierre Jolivet       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
166e50a8114SHong Zhang     }
167*fdfbdca6SPierre Jolivet   }
168*fdfbdca6SPierre Jolivet   if (sym) c = (Mat_SeqSBAIJ *)(C->data);
169*fdfbdca6SPierre Jolivet   else d = (Mat_SeqBAIJ *)(C->data);
170e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
171e50a8114SHong Zhang     row    = irow[i];
172e50a8114SHong Zhang     kstart = ai[row];
173e50a8114SHong Zhang     kend   = kstart + a->ilen[row];
174*fdfbdca6SPierre Jolivet     if (sym) {
175e50a8114SHong Zhang       mat_i    = c->i[i];
176e50a8114SHong Zhang       mat_j    = c->j + mat_i;
177e50a8114SHong Zhang       mat_a    = c->a + mat_i * bs2;
178e50a8114SHong Zhang       mat_ilen = c->ilen + i;
179*fdfbdca6SPierre Jolivet     } else {
180*fdfbdca6SPierre Jolivet       mat_i    = d->i[i];
181*fdfbdca6SPierre Jolivet       mat_j    = d->j + mat_i;
182*fdfbdca6SPierre Jolivet       mat_a    = d->a + mat_i * bs2;
183*fdfbdca6SPierre Jolivet       mat_ilen = d->ilen + i;
184*fdfbdca6SPierre Jolivet     }
185e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
186e50a8114SHong Zhang       if ((tcol = ssmap[a->j[k]])) {
187e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1889566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
189e50a8114SHong Zhang         mat_a += bs2;
190e50a8114SHong Zhang         (*mat_ilen)++;
191e50a8114SHong Zhang       }
192e50a8114SHong Zhang     }
193e50a8114SHong Zhang   }
194e50a8114SHong Zhang   /* sort */
195e50a8114SHong Zhang   {
196e50a8114SHong Zhang     MatScalar *work;
197e50a8114SHong Zhang 
1989566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
199e50a8114SHong Zhang     for (i = 0; i < nrows; i++) {
200e50a8114SHong Zhang       PetscInt ilen;
201*fdfbdca6SPierre Jolivet       if (sym) {
202e50a8114SHong Zhang         mat_i = c->i[i];
203e50a8114SHong Zhang         mat_j = c->j + mat_i;
204e50a8114SHong Zhang         mat_a = c->a + mat_i * bs2;
205e50a8114SHong Zhang         ilen  = c->ilen[i];
206*fdfbdca6SPierre Jolivet       } else {
207*fdfbdca6SPierre Jolivet         mat_i = d->i[i];
208*fdfbdca6SPierre Jolivet         mat_j = d->j + mat_i;
209*fdfbdca6SPierre Jolivet         mat_a = d->a + mat_i * bs2;
210*fdfbdca6SPierre Jolivet         ilen  = d->ilen[i];
211*fdfbdca6SPierre Jolivet       }
2129566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
213e50a8114SHong Zhang     }
2149566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
215e50a8114SHong Zhang   }
216e50a8114SHong Zhang 
217e50a8114SHong Zhang   /* Free work space */
2189566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2199566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
2209566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
2219566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2229566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
223e50a8114SHong Zhang 
2249566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
225e50a8114SHong Zhang   *B = C;
2263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
227e50a8114SHong Zhang }
228e50a8114SHong Zhang 
229d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
230d71ae5a4SJacob Faibussowitsch {
231*fdfbdca6SPierre Jolivet   Mat       C[2];
232*fdfbdca6SPierre Jolivet   IS        is1, is2, intersect = NULL;
233*fdfbdca6SPierre Jolivet   PetscInt  n1, n2, ni;
234*fdfbdca6SPierre Jolivet   PetscBool sym = PETSC_TRUE;
23549b5e25fSSatish Balay 
23649b5e25fSSatish Balay   PetscFunctionBegin;
237f9a48b90SPierre Jolivet   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
238f9a48b90SPierre Jolivet   if (isrow == iscol) {
239f9a48b90SPierre Jolivet     is2 = is1;
240f9a48b90SPierre Jolivet     PetscCall(PetscObjectReference((PetscObject)is2));
241*fdfbdca6SPierre Jolivet   } else {
242*fdfbdca6SPierre Jolivet     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
243*fdfbdca6SPierre Jolivet     PetscCall(ISIntersect(is1, is2, &intersect));
244*fdfbdca6SPierre Jolivet     PetscCall(ISGetLocalSize(intersect, &ni));
245*fdfbdca6SPierre Jolivet     PetscCall(ISDestroy(&intersect));
246*fdfbdca6SPierre Jolivet     if (ni == 0) sym = PETSC_FALSE;
247*fdfbdca6SPierre Jolivet     else if (PetscDefined(USE_DEBUG)) {
248*fdfbdca6SPierre Jolivet       PetscCall(ISGetLocalSize(is1, &n1));
249*fdfbdca6SPierre Jolivet       PetscCall(ISGetLocalSize(is2, &n2));
250*fdfbdca6SPierre Jolivet       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
251*fdfbdca6SPierre Jolivet     }
252*fdfbdca6SPierre Jolivet   }
253*fdfbdca6SPierre Jolivet   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
254*fdfbdca6SPierre Jolivet   else {
255*fdfbdca6SPierre Jolivet     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
256*fdfbdca6SPierre Jolivet     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
257*fdfbdca6SPierre Jolivet     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
258*fdfbdca6SPierre Jolivet     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
259*fdfbdca6SPierre Jolivet     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
260*fdfbdca6SPierre Jolivet     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261*fdfbdca6SPierre Jolivet     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
262*fdfbdca6SPierre Jolivet     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
263*fdfbdca6SPierre Jolivet     PetscCall(MatDestroy(C));
264*fdfbdca6SPierre Jolivet     PetscCall(MatDestroy(C + 1));
265*fdfbdca6SPierre Jolivet   }
2669566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2679566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
268847daeccSHong Zhang 
269*fdfbdca6SPierre Jolivet   if (sym && isrow != iscol) {
2708f46ffcaSHong Zhang     PetscBool isequal;
2719566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow, iscol, &isequal));
27248a46eb9SPierre Jolivet     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
27349b5e25fSSatish Balay   }
2743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
27549b5e25fSSatish Balay }
27649b5e25fSSatish Balay 
277d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
278d71ae5a4SJacob Faibussowitsch {
27913f74950SBarry Smith   PetscInt i;
28049b5e25fSSatish Balay 
28149b5e25fSSatish Balay   PetscFunctionBegin;
282314f503fSPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
283e50a8114SHong Zhang 
28448a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28649b5e25fSSatish Balay }
28749b5e25fSSatish Balay 
28849b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
289d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
290d71ae5a4SJacob Faibussowitsch {
29149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
292d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, zero = 0.0;
293d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
294d9ca1df4SBarry Smith   const MatScalar   *v;
295d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
296d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
29798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
29849b5e25fSSatish Balay 
29949b5e25fSSatish Balay   PetscFunctionBegin;
3009566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
3013ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3029566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3039566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   v  = a->a;
30649b5e25fSSatish Balay   xb = x;
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
30949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3109371c9d4SSatish Balay     x1   = xb[0];
3119371c9d4SSatish Balay     x2   = xb[1];
31249b5e25fSSatish Balay     ib   = aj + *ai;
313831a3094SHong Zhang     jmin = 0;
31498c9bda7SSatish Balay     nonzerorow += (n > 0);
3157fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
31649b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
31749b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
3189371c9d4SSatish Balay       v += 4;
3199371c9d4SSatish Balay       jmin++;
3207fbae186SHong Zhang     }
321444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323831a3094SHong Zhang     for (j = jmin; j < n; j++) {
32449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32549b5e25fSSatish Balay       cval = ib[j] * 2;
32649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
32749b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
32849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32949b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
33049b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
33149b5e25fSSatish Balay       v += 4;
33249b5e25fSSatish Balay     }
3339371c9d4SSatish Balay     xb += 2;
3349371c9d4SSatish Balay     ai++;
33549b5e25fSSatish Balay   }
33649b5e25fSSatish Balay 
3379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3389566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3399566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
3403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34149b5e25fSSatish Balay }
34249b5e25fSSatish Balay 
343d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
344d71ae5a4SJacob Faibussowitsch {
34549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
346d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, zero = 0.0;
347d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
348d9ca1df4SBarry Smith   const MatScalar   *v;
349d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
350d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
35198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
35249b5e25fSSatish Balay 
35349b5e25fSSatish Balay   PetscFunctionBegin;
3549566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
3553ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3579566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
35849b5e25fSSatish Balay 
35949b5e25fSSatish Balay   v  = a->a;
36049b5e25fSSatish Balay   xb = x;
36149b5e25fSSatish Balay 
36249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
36349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3649371c9d4SSatish Balay     x1   = xb[0];
3659371c9d4SSatish Balay     x2   = xb[1];
3669371c9d4SSatish Balay     x3   = xb[2];
36749b5e25fSSatish Balay     ib   = aj + *ai;
368831a3094SHong Zhang     jmin = 0;
36998c9bda7SSatish Balay     nonzerorow += (n > 0);
3707fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
37149b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
37249b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
37349b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3749371c9d4SSatish Balay       v += 9;
3759371c9d4SSatish Balay       jmin++;
3767fbae186SHong Zhang     }
377444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
378444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
379831a3094SHong Zhang     for (j = jmin; j < n; j++) {
38049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38149b5e25fSSatish Balay       cval = ib[j] * 3;
38249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
38349b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
38449b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
38549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38649b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
38749b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
38849b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
38949b5e25fSSatish Balay       v += 9;
39049b5e25fSSatish Balay     }
3919371c9d4SSatish Balay     xb += 3;
3929371c9d4SSatish Balay     ai++;
39349b5e25fSSatish Balay   }
39449b5e25fSSatish Balay 
3959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
3983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39949b5e25fSSatish Balay }
40049b5e25fSSatish Balay 
401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
402d71ae5a4SJacob Faibussowitsch {
40349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
404d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
405d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
406d9ca1df4SBarry Smith   const MatScalar   *v;
407d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
408d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
40998c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
41049b5e25fSSatish Balay 
41149b5e25fSSatish Balay   PetscFunctionBegin;
4129566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4133ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4159566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
41649b5e25fSSatish Balay 
41749b5e25fSSatish Balay   v  = a->a;
41849b5e25fSSatish Balay   xb = x;
41949b5e25fSSatish Balay 
42049b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
42149b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4229371c9d4SSatish Balay     x1   = xb[0];
4239371c9d4SSatish Balay     x2   = xb[1];
4249371c9d4SSatish Balay     x3   = xb[2];
4259371c9d4SSatish Balay     x4   = xb[3];
42649b5e25fSSatish Balay     ib   = aj + *ai;
427831a3094SHong Zhang     jmin = 0;
42898c9bda7SSatish Balay     nonzerorow += (n > 0);
4297fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
43049b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
43149b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
43249b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
43349b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
4349371c9d4SSatish Balay       v += 16;
4359371c9d4SSatish Balay       jmin++;
4367fbae186SHong Zhang     }
437444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
438444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439831a3094SHong Zhang     for (j = jmin; j < n; j++) {
44049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44149b5e25fSSatish Balay       cval = ib[j] * 4;
44249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
44349b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
44449b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
44549b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
44649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
44749b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
44849b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
44949b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
45049b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
45149b5e25fSSatish Balay       v += 16;
45249b5e25fSSatish Balay     }
4539371c9d4SSatish Balay     xb += 4;
4549371c9d4SSatish Balay     ai++;
45549b5e25fSSatish Balay   }
45649b5e25fSSatish Balay 
4579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4589566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
4603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46149b5e25fSSatish Balay }
46249b5e25fSSatish Balay 
463d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
464d71ae5a4SJacob Faibussowitsch {
46549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
466d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
467d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
468d9ca1df4SBarry Smith   const MatScalar   *v;
469d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
470d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
47198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
47249b5e25fSSatish Balay 
47349b5e25fSSatish Balay   PetscFunctionBegin;
4749566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4753ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4779566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
47849b5e25fSSatish Balay 
47949b5e25fSSatish Balay   v  = a->a;
48049b5e25fSSatish Balay   xb = x;
48149b5e25fSSatish Balay 
48249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
48349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4849371c9d4SSatish Balay     x1   = xb[0];
4859371c9d4SSatish Balay     x2   = xb[1];
4869371c9d4SSatish Balay     x3   = xb[2];
4879371c9d4SSatish Balay     x4   = xb[3];
4889371c9d4SSatish Balay     x5   = xb[4];
48949b5e25fSSatish Balay     ib   = aj + *ai;
490831a3094SHong Zhang     jmin = 0;
49198c9bda7SSatish Balay     nonzerorow += (n > 0);
4927fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
49349b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
49449b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
49549b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
49649b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
49749b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
4989371c9d4SSatish Balay       v += 25;
4999371c9d4SSatish Balay       jmin++;
5007fbae186SHong Zhang     }
501444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
502444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503831a3094SHong Zhang     for (j = jmin; j < n; j++) {
50449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50549b5e25fSSatish Balay       cval = ib[j] * 5;
50649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
50749b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
50849b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
50949b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
51049b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
51149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51249b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
51349b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
51449b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
51549b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
51649b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
51749b5e25fSSatish Balay       v += 25;
51849b5e25fSSatish Balay     }
5199371c9d4SSatish Balay     xb += 5;
5209371c9d4SSatish Balay     ai++;
52149b5e25fSSatish Balay   }
52249b5e25fSSatish Balay 
5239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5249566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
5263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
52749b5e25fSSatish Balay }
52849b5e25fSSatish Balay 
529d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
530d71ae5a4SJacob Faibussowitsch {
53149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
532d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
533d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
534d9ca1df4SBarry Smith   const MatScalar   *v;
535d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
536d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
53798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
53849b5e25fSSatish Balay 
53949b5e25fSSatish Balay   PetscFunctionBegin;
5409566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
5413ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
5429566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5439566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
54449b5e25fSSatish Balay 
54549b5e25fSSatish Balay   v  = a->a;
54649b5e25fSSatish Balay   xb = x;
54749b5e25fSSatish Balay 
54849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
54949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5509371c9d4SSatish Balay     x1   = xb[0];
5519371c9d4SSatish Balay     x2   = xb[1];
5529371c9d4SSatish Balay     x3   = xb[2];
5539371c9d4SSatish Balay     x4   = xb[3];
5549371c9d4SSatish Balay     x5   = xb[4];
5559371c9d4SSatish Balay     x6   = xb[5];
55649b5e25fSSatish Balay     ib   = aj + *ai;
557831a3094SHong Zhang     jmin = 0;
55898c9bda7SSatish Balay     nonzerorow += (n > 0);
5597fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
56049b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
56149b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
56249b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
56349b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
56449b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
56549b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
5669371c9d4SSatish Balay       v += 36;
5679371c9d4SSatish Balay       jmin++;
5687fbae186SHong Zhang     }
569444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
570444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
571831a3094SHong Zhang     for (j = jmin; j < n; j++) {
57249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57349b5e25fSSatish Balay       cval = ib[j] * 6;
57449b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
57549b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
57649b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
57749b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
57849b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
57949b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
58049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
58149b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
58249b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
58349b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
58449b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
58549b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
58649b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
58749b5e25fSSatish Balay       v += 36;
58849b5e25fSSatish Balay     }
5899371c9d4SSatish Balay     xb += 6;
5909371c9d4SSatish Balay     ai++;
59149b5e25fSSatish Balay   }
59249b5e25fSSatish Balay 
5939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5949566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
5963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
59749b5e25fSSatish Balay }
598c2916339SPierre Jolivet 
599d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
600d71ae5a4SJacob Faibussowitsch {
60149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
602d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
603d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
604d9ca1df4SBarry Smith   const MatScalar   *v;
605d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
606d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
60798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
60849b5e25fSSatish Balay 
60949b5e25fSSatish Balay   PetscFunctionBegin;
6109566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
6113ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
6129566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6139566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
61449b5e25fSSatish Balay 
61549b5e25fSSatish Balay   v  = a->a;
61649b5e25fSSatish Balay   xb = x;
61749b5e25fSSatish Balay 
61849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
61949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
6209371c9d4SSatish Balay     x1   = xb[0];
6219371c9d4SSatish Balay     x2   = xb[1];
6229371c9d4SSatish Balay     x3   = xb[2];
6239371c9d4SSatish Balay     x4   = xb[3];
6249371c9d4SSatish Balay     x5   = xb[4];
6259371c9d4SSatish Balay     x6   = xb[5];
6269371c9d4SSatish Balay     x7   = xb[6];
62749b5e25fSSatish Balay     ib   = aj + *ai;
628831a3094SHong Zhang     jmin = 0;
62998c9bda7SSatish Balay     nonzerorow += (n > 0);
6307fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
63149b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
63249b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
63349b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
63449b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
63549b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
63649b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
63749b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
6389371c9d4SSatish Balay       v += 49;
6399371c9d4SSatish Balay       jmin++;
6407fbae186SHong Zhang     }
641444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
642444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643831a3094SHong Zhang     for (j = jmin; j < n; j++) {
64449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
64549b5e25fSSatish Balay       cval = ib[j] * 7;
64649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
64749b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
64849b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
64949b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
65049b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
65149b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
65249b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
65349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
65449b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
65549b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
65649b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
65749b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
65849b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
65949b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
66049b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
66149b5e25fSSatish Balay       v += 49;
66249b5e25fSSatish Balay     }
6639371c9d4SSatish Balay     xb += 7;
6649371c9d4SSatish Balay     ai++;
66549b5e25fSSatish Balay   }
6669566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6679566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6689566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
6693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
67049b5e25fSSatish Balay }
67149b5e25fSSatish Balay 
67249b5e25fSSatish Balay /*
67349b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
67449b5e25fSSatish Balay */
675d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
676d71ae5a4SJacob Faibussowitsch {
67749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
678d9ca1df4SBarry Smith   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
679d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
680d9ca1df4SBarry Smith   const MatScalar   *v;
681d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
682d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
68398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
68449b5e25fSSatish Balay 
68549b5e25fSSatish Balay   PetscFunctionBegin;
6869566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
6873ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
6889566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6899566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
69059689ffbSStefano Zampini 
69159689ffbSStefano Zampini   x_ptr = x;
69259689ffbSStefano Zampini   z_ptr = z;
69349b5e25fSSatish Balay 
69449b5e25fSSatish Balay   aj = a->j;
69549b5e25fSSatish Balay   v  = a->a;
69649b5e25fSSatish Balay   ii = a->i;
69749b5e25fSSatish Balay 
69848a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
69949b5e25fSSatish Balay   work = a->mult_work;
70049b5e25fSSatish Balay 
70149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
7029371c9d4SSatish Balay     n     = ii[1] - ii[0];
7039371c9d4SSatish Balay     ncols = n * bs;
7049371c9d4SSatish Balay     workt = work;
7059371c9d4SSatish Balay     idx   = aj + ii[0];
70698c9bda7SSatish Balay     nonzerorow += (n > 0);
70749b5e25fSSatish Balay 
70849b5e25fSSatish Balay     /* upper triangular part */
70949b5e25fSSatish Balay     for (j = 0; j < n; j++) {
71049b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
71149b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
71249b5e25fSSatish Balay       workt += bs;
71349b5e25fSSatish Balay     }
71449b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
71596b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
71649b5e25fSSatish Balay 
71749b5e25fSSatish Balay     /* strict lower triangular part */
718831a3094SHong Zhang     idx = aj + ii[0];
71959689ffbSStefano Zampini     if (n && *idx == i) {
7209371c9d4SSatish Balay       ncols -= bs;
7219371c9d4SSatish Balay       v += bs2;
7229371c9d4SSatish Balay       idx++;
7239371c9d4SSatish Balay       n--;
724831a3094SHong Zhang     }
72596b9376eSHong Zhang 
72649b5e25fSSatish Balay     if (ncols > 0) {
72749b5e25fSSatish Balay       workt = work;
7289566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
72996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
730831a3094SHong Zhang       for (j = 0; j < n; j++) {
731831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
73249b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
73349b5e25fSSatish Balay         workt += bs;
73449b5e25fSSatish Balay       }
73549b5e25fSSatish Balay     }
7369371c9d4SSatish Balay     x += bs;
7379371c9d4SSatish Balay     v += n * bs2;
7389371c9d4SSatish Balay     z += bs;
7399371c9d4SSatish Balay     ii++;
74049b5e25fSSatish Balay   }
74149b5e25fSSatish Balay 
7429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7439566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
7449566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
7453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
74649b5e25fSSatish Balay }
74749b5e25fSSatish Balay 
748d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
749d71ae5a4SJacob Faibussowitsch {
75049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
751d9ca1df4SBarry Smith   PetscScalar       *z, x1;
752d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
753d9ca1df4SBarry Smith   const MatScalar   *v;
754d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
755d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
75698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
757eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
758b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
759eb1ec7c1SStefano Zampini #else
760eb1ec7c1SStefano Zampini   const int aconj = 0;
761eb1ec7c1SStefano Zampini #endif
76249b5e25fSSatish Balay 
76349b5e25fSSatish Balay   PetscFunctionBegin;
7649566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
7653ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
7669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7679566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
76849b5e25fSSatish Balay   v  = a->a;
76949b5e25fSSatish Balay   xb = x;
77049b5e25fSSatish Balay 
77149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
77249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th row of A */
77349b5e25fSSatish Balay     x1   = xb[0];
77449b5e25fSSatish Balay     ib   = aj + *ai;
775831a3094SHong Zhang     jmin = 0;
77698c9bda7SSatish Balay     nonzerorow += (n > 0);
7773d9ade75SStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
7789371c9d4SSatish Balay       z[i] += *v++ * x[*ib++];
7799371c9d4SSatish Balay       jmin++;
780831a3094SHong Zhang     }
781eb1ec7c1SStefano Zampini     if (aconj) {
782eb1ec7c1SStefano Zampini       for (j = jmin; j < n; j++) {
783eb1ec7c1SStefano Zampini         cval = *ib;
784eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
785eb1ec7c1SStefano Zampini         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
786eb1ec7c1SStefano Zampini       }
787eb1ec7c1SStefano Zampini     } else {
788831a3094SHong Zhang       for (j = jmin; j < n; j++) {
78949b5e25fSSatish Balay         cval = *ib;
79049b5e25fSSatish Balay         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
79149b5e25fSSatish Balay         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
79249b5e25fSSatish Balay       }
793eb1ec7c1SStefano Zampini     }
7949371c9d4SSatish Balay     xb++;
7959371c9d4SSatish Balay     ai++;
79649b5e25fSSatish Balay   }
79749b5e25fSSatish Balay 
7989566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
80049b5e25fSSatish Balay 
8019566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
8023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
80349b5e25fSSatish Balay }
80449b5e25fSSatish Balay 
805d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
806d71ae5a4SJacob Faibussowitsch {
80749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
808d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2;
809d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
810d9ca1df4SBarry Smith   const MatScalar   *v;
811d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
812d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
81398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
81449b5e25fSSatish Balay 
81549b5e25fSSatish Balay   PetscFunctionBegin;
8169566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8173ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
82049b5e25fSSatish Balay 
82149b5e25fSSatish Balay   v  = a->a;
82249b5e25fSSatish Balay   xb = x;
82349b5e25fSSatish Balay 
82449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
82549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8269371c9d4SSatish Balay     x1   = xb[0];
8279371c9d4SSatish Balay     x2   = xb[1];
82849b5e25fSSatish Balay     ib   = aj + *ai;
829831a3094SHong Zhang     jmin = 0;
83098c9bda7SSatish Balay     nonzerorow += (n > 0);
83159689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
83249b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
83349b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
8349371c9d4SSatish Balay       v += 4;
8359371c9d4SSatish Balay       jmin++;
8367fbae186SHong Zhang     }
837444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839831a3094SHong Zhang     for (j = jmin; j < n; j++) {
84049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
84149b5e25fSSatish Balay       cval = ib[j] * 2;
84249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
84349b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
84449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
84549b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
84649b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
84749b5e25fSSatish Balay       v += 4;
84849b5e25fSSatish Balay     }
8499371c9d4SSatish Balay     xb += 2;
8509371c9d4SSatish Balay     ai++;
85149b5e25fSSatish Balay   }
8529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
85449b5e25fSSatish Balay 
8559566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
8563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
85749b5e25fSSatish Balay }
85849b5e25fSSatish Balay 
859d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
860d71ae5a4SJacob Faibussowitsch {
86149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
862d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3;
863d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
864d9ca1df4SBarry Smith   const MatScalar   *v;
865d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
866d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
86798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
86849b5e25fSSatish Balay 
86949b5e25fSSatish Balay   PetscFunctionBegin;
8709566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8713ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8739566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
87449b5e25fSSatish Balay 
87549b5e25fSSatish Balay   v  = a->a;
87649b5e25fSSatish Balay   xb = x;
87749b5e25fSSatish Balay 
87849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
87949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8809371c9d4SSatish Balay     x1   = xb[0];
8819371c9d4SSatish Balay     x2   = xb[1];
8829371c9d4SSatish Balay     x3   = xb[2];
88349b5e25fSSatish Balay     ib   = aj + *ai;
884831a3094SHong Zhang     jmin = 0;
88598c9bda7SSatish Balay     nonzerorow += (n > 0);
88659689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
88749b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
88849b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
88949b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
8909371c9d4SSatish Balay       v += 9;
8919371c9d4SSatish Balay       jmin++;
8927fbae186SHong Zhang     }
893444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
894444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
895831a3094SHong Zhang     for (j = jmin; j < n; j++) {
89649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
89749b5e25fSSatish Balay       cval = ib[j] * 3;
89849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
89949b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
90049b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
90149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
90249b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
90349b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
90449b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
90549b5e25fSSatish Balay       v += 9;
90649b5e25fSSatish Balay     }
9079371c9d4SSatish Balay     xb += 3;
9089371c9d4SSatish Balay     ai++;
90949b5e25fSSatish Balay   }
91049b5e25fSSatish Balay 
9119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
91349b5e25fSSatish Balay 
9149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
9153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
91649b5e25fSSatish Balay }
91749b5e25fSSatish Balay 
918d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
919d71ae5a4SJacob Faibussowitsch {
92049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
921d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4;
922d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
923d9ca1df4SBarry Smith   const MatScalar   *v;
924d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
925d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
92698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
92749b5e25fSSatish Balay 
92849b5e25fSSatish Balay   PetscFunctionBegin;
9299566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
9303ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
9319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9329566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
93349b5e25fSSatish Balay 
93449b5e25fSSatish Balay   v  = a->a;
93549b5e25fSSatish Balay   xb = x;
93649b5e25fSSatish Balay 
93749b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
93849b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9399371c9d4SSatish Balay     x1   = xb[0];
9409371c9d4SSatish Balay     x2   = xb[1];
9419371c9d4SSatish Balay     x3   = xb[2];
9429371c9d4SSatish Balay     x4   = xb[3];
94349b5e25fSSatish Balay     ib   = aj + *ai;
944831a3094SHong Zhang     jmin = 0;
94598c9bda7SSatish Balay     nonzerorow += (n > 0);
94659689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
94749b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
94849b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
94949b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
95049b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
9519371c9d4SSatish Balay       v += 16;
9529371c9d4SSatish Balay       jmin++;
9537fbae186SHong Zhang     }
954444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
955444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956831a3094SHong Zhang     for (j = jmin; j < n; j++) {
95749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
95849b5e25fSSatish Balay       cval = ib[j] * 4;
95949b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
96049b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
96149b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
96249b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
96349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
96449b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
96549b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
96649b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
96749b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
96849b5e25fSSatish Balay       v += 16;
96949b5e25fSSatish Balay     }
9709371c9d4SSatish Balay     xb += 4;
9719371c9d4SSatish Balay     ai++;
97249b5e25fSSatish Balay   }
97349b5e25fSSatish Balay 
9749566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
97649b5e25fSSatish Balay 
9779566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
9783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
97949b5e25fSSatish Balay }
98049b5e25fSSatish Balay 
981d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
982d71ae5a4SJacob Faibussowitsch {
98349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
984d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5;
985d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
986d9ca1df4SBarry Smith   const MatScalar   *v;
987d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
988d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
98998c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
99049b5e25fSSatish Balay 
99149b5e25fSSatish Balay   PetscFunctionBegin;
9929566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
9933ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
9949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9959566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
99649b5e25fSSatish Balay 
99749b5e25fSSatish Balay   v  = a->a;
99849b5e25fSSatish Balay   xb = x;
99949b5e25fSSatish Balay 
100049b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
100149b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10029371c9d4SSatish Balay     x1   = xb[0];
10039371c9d4SSatish Balay     x2   = xb[1];
10049371c9d4SSatish Balay     x3   = xb[2];
10059371c9d4SSatish Balay     x4   = xb[3];
10069371c9d4SSatish Balay     x5   = xb[4];
100749b5e25fSSatish Balay     ib   = aj + *ai;
1008831a3094SHong Zhang     jmin = 0;
100998c9bda7SSatish Balay     nonzerorow += (n > 0);
101059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
101149b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
101249b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
101349b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
101449b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
101549b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
10169371c9d4SSatish Balay       v += 25;
10179371c9d4SSatish Balay       jmin++;
10187fbae186SHong Zhang     }
1019444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1020444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021831a3094SHong Zhang     for (j = jmin; j < n; j++) {
102249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
102349b5e25fSSatish Balay       cval = ib[j] * 5;
102449b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
102549b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
102649b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
102749b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
102849b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
102949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
103049b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
103149b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
103249b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
103349b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
103449b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
103549b5e25fSSatish Balay       v += 25;
103649b5e25fSSatish Balay     }
10379371c9d4SSatish Balay     xb += 5;
10389371c9d4SSatish Balay     ai++;
103949b5e25fSSatish Balay   }
104049b5e25fSSatish Balay 
10419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
104349b5e25fSSatish Balay 
10449566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
10453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
104649b5e25fSSatish Balay }
1047c2916339SPierre Jolivet 
1048d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1049d71ae5a4SJacob Faibussowitsch {
105049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1051d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1052d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1053d9ca1df4SBarry Smith   const MatScalar   *v;
1054d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1055d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
105698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
105749b5e25fSSatish Balay 
105849b5e25fSSatish Balay   PetscFunctionBegin;
10599566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
10603ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
10619566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10629566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
106349b5e25fSSatish Balay 
106449b5e25fSSatish Balay   v  = a->a;
106549b5e25fSSatish Balay   xb = x;
106649b5e25fSSatish Balay 
106749b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
106849b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10699371c9d4SSatish Balay     x1   = xb[0];
10709371c9d4SSatish Balay     x2   = xb[1];
10719371c9d4SSatish Balay     x3   = xb[2];
10729371c9d4SSatish Balay     x4   = xb[3];
10739371c9d4SSatish Balay     x5   = xb[4];
10749371c9d4SSatish Balay     x6   = xb[5];
107549b5e25fSSatish Balay     ib   = aj + *ai;
1076831a3094SHong Zhang     jmin = 0;
107798c9bda7SSatish Balay     nonzerorow += (n > 0);
107859689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
107949b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
108049b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
108149b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
108249b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
108349b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
108449b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
10859371c9d4SSatish Balay       v += 36;
10869371c9d4SSatish Balay       jmin++;
10877fbae186SHong Zhang     }
1088444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1089444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1090831a3094SHong Zhang     for (j = jmin; j < n; j++) {
109149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
109249b5e25fSSatish Balay       cval = ib[j] * 6;
109349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
109449b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
109549b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
109649b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
109749b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
109849b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
109949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
110049b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
110149b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
110249b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
110349b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
110449b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
110549b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
110649b5e25fSSatish Balay       v += 36;
110749b5e25fSSatish Balay     }
11089371c9d4SSatish Balay     xb += 6;
11099371c9d4SSatish Balay     ai++;
111049b5e25fSSatish Balay   }
111149b5e25fSSatish Balay 
11129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
111449b5e25fSSatish Balay 
11159566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
11163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
111749b5e25fSSatish Balay }
111849b5e25fSSatish Balay 
1119d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1120d71ae5a4SJacob Faibussowitsch {
112149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1122d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1123d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1124d9ca1df4SBarry Smith   const MatScalar   *v;
1125d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1126d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
112798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
112849b5e25fSSatish Balay 
112949b5e25fSSatish Balay   PetscFunctionBegin;
11309566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
11313ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
11329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
113449b5e25fSSatish Balay 
113549b5e25fSSatish Balay   v  = a->a;
113649b5e25fSSatish Balay   xb = x;
113749b5e25fSSatish Balay 
113849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
113949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
11409371c9d4SSatish Balay     x1   = xb[0];
11419371c9d4SSatish Balay     x2   = xb[1];
11429371c9d4SSatish Balay     x3   = xb[2];
11439371c9d4SSatish Balay     x4   = xb[3];
11449371c9d4SSatish Balay     x5   = xb[4];
11459371c9d4SSatish Balay     x6   = xb[5];
11469371c9d4SSatish Balay     x7   = xb[6];
114749b5e25fSSatish Balay     ib   = aj + *ai;
1148831a3094SHong Zhang     jmin = 0;
114998c9bda7SSatish Balay     nonzerorow += (n > 0);
115059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
115149b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
115249b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
115349b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
115449b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
115549b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
115649b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
115749b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
11589371c9d4SSatish Balay       v += 49;
11599371c9d4SSatish Balay       jmin++;
11607fbae186SHong Zhang     }
1161444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1162444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1163831a3094SHong Zhang     for (j = jmin; j < n; j++) {
116449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
116549b5e25fSSatish Balay       cval = ib[j] * 7;
116649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
116749b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
116849b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
116949b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
117049b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
117149b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
117249b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
117349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
117449b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
117549b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
117649b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
117749b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
117849b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
117949b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
118049b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
118149b5e25fSSatish Balay       v += 49;
118249b5e25fSSatish Balay     }
11839371c9d4SSatish Balay     xb += 7;
11849371c9d4SSatish Balay     ai++;
118549b5e25fSSatish Balay   }
118649b5e25fSSatish Balay 
11879566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
118949b5e25fSSatish Balay 
11909566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
11913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
119249b5e25fSSatish Balay }
119349b5e25fSSatish Balay 
1194d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1195d71ae5a4SJacob Faibussowitsch {
119649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1197f4259b30SLisandro Dalcin   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1198d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
1199d9ca1df4SBarry Smith   const MatScalar   *v;
1200d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1201d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
120298c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
120349b5e25fSSatish Balay 
120449b5e25fSSatish Balay   PetscFunctionBegin;
12059566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
12063ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
12079371c9d4SSatish Balay   PetscCall(VecGetArrayRead(xx, &x));
12089371c9d4SSatish Balay   x_ptr = x;
12099371c9d4SSatish Balay   PetscCall(VecGetArray(zz, &z));
12109371c9d4SSatish Balay   z_ptr = z;
121149b5e25fSSatish Balay 
121249b5e25fSSatish Balay   aj = a->j;
121349b5e25fSSatish Balay   v  = a->a;
121449b5e25fSSatish Balay   ii = a->i;
121549b5e25fSSatish Balay 
121648a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
121749b5e25fSSatish Balay   work = a->mult_work;
121849b5e25fSSatish Balay 
121949b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
12209371c9d4SSatish Balay     n     = ii[1] - ii[0];
12219371c9d4SSatish Balay     ncols = n * bs;
12229371c9d4SSatish Balay     workt = work;
12239371c9d4SSatish Balay     idx   = aj + ii[0];
122498c9bda7SSatish Balay     nonzerorow += (n > 0);
122549b5e25fSSatish Balay 
122649b5e25fSSatish Balay     /* upper triangular part */
122749b5e25fSSatish Balay     for (j = 0; j < n; j++) {
122849b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
122949b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
123049b5e25fSSatish Balay       workt += bs;
123149b5e25fSSatish Balay     }
123249b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
123396b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
123449b5e25fSSatish Balay 
123549b5e25fSSatish Balay     /* strict lower triangular part */
1236831a3094SHong Zhang     idx = aj + ii[0];
123759689ffbSStefano Zampini     if (n && *idx == i) {
12389371c9d4SSatish Balay       ncols -= bs;
12399371c9d4SSatish Balay       v += bs2;
12409371c9d4SSatish Balay       idx++;
12419371c9d4SSatish Balay       n--;
1242831a3094SHong Zhang     }
124349b5e25fSSatish Balay     if (ncols > 0) {
124449b5e25fSSatish Balay       workt = work;
12459566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
124696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1247831a3094SHong Zhang       for (j = 0; j < n; j++) {
1248831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
124949b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
125049b5e25fSSatish Balay         workt += bs;
125149b5e25fSSatish Balay       }
125249b5e25fSSatish Balay     }
125349b5e25fSSatish Balay 
12549371c9d4SSatish Balay     x += bs;
12559371c9d4SSatish Balay     v += n * bs2;
12569371c9d4SSatish Balay     z += bs;
12579371c9d4SSatish Balay     ii++;
125849b5e25fSSatish Balay   }
125949b5e25fSSatish Balay 
12609566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12619566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
126249b5e25fSSatish Balay 
12639566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
12643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
126549b5e25fSSatish Balay }
126649b5e25fSSatish Balay 
1267d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1268d71ae5a4SJacob Faibussowitsch {
126949b5e25fSSatish Balay   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1270f4df32b1SMatthew Knepley   PetscScalar   oalpha = alpha;
1271c5df96a5SBarry Smith   PetscBLASInt  one    = 1, totalnz;
127249b5e25fSSatish Balay 
127349b5e25fSSatish Balay   PetscFunctionBegin;
12749566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1275792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
12769566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
12773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
127849b5e25fSSatish Balay }
127949b5e25fSSatish Balay 
1280d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1281d71ae5a4SJacob Faibussowitsch {
128249b5e25fSSatish Balay   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1283d9ca1df4SBarry Smith   const MatScalar *v        = a->a;
128449b5e25fSSatish Balay   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1285d9ca1df4SBarry Smith   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1286d9ca1df4SBarry Smith   const PetscInt  *aj = a->j, *col;
128749b5e25fSSatish Balay 
128849b5e25fSSatish Balay   PetscFunctionBegin;
1289c40ae873SPierre Jolivet   if (!a->nz) {
1290c40ae873SPierre Jolivet     *norm = 0.0;
12913ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1292c40ae873SPierre Jolivet   }
129349b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
129449b5e25fSSatish Balay     for (k = 0; k < mbs; k++) {
129559689ffbSStefano Zampini       jmin = a->i[k];
129659689ffbSStefano Zampini       jmax = a->i[k + 1];
1297831a3094SHong Zhang       col  = aj + jmin;
129859689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
129949b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
13009371c9d4SSatish Balay           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
13019371c9d4SSatish Balay           v++;
130249b5e25fSSatish Balay         }
1303831a3094SHong Zhang         jmin++;
1304831a3094SHong Zhang       }
1305831a3094SHong Zhang       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
130649b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
13079371c9d4SSatish Balay           sum_off += PetscRealPart(PetscConj(*v) * (*v));
13089371c9d4SSatish Balay           v++;
130949b5e25fSSatish Balay         }
131049b5e25fSSatish Balay       }
131149b5e25fSSatish Balay     }
13128f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
13139566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
13140b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
13159566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
13160b8dc8d2SHong Zhang     for (i = 0; i < mbs; i++) jl[i] = mbs;
13170b8dc8d2SHong Zhang     il[0] = 0;
131849b5e25fSSatish Balay 
131949b5e25fSSatish Balay     *norm = 0.0;
132049b5e25fSSatish Balay     for (k = 0; k < mbs; k++) { /* k_th block row */
132149b5e25fSSatish Balay       for (j = 0; j < bs; j++) sum[j] = 0.0;
132249b5e25fSSatish Balay       /*-- col sum --*/
132349b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1324bbea24aaSStefano Zampini       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1325831a3094SHong Zhang                   at step k */
132649b5e25fSSatish Balay       while (i < mbs) {
132749b5e25fSSatish Balay         nexti = jl[i]; /* next block row to be added */
132849b5e25fSSatish Balay         ik    = il[i]; /* block index of A(i,k) in the array a */
132949b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
133049b5e25fSSatish Balay           v = a->a + ik * bs2 + j * bs;
133149b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13329371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13339371c9d4SSatish Balay             v++;
133449b5e25fSSatish Balay           }
133549b5e25fSSatish Balay         }
133649b5e25fSSatish Balay         /* update il, jl */
1337831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1338831a3094SHong Zhang         jmax = a->i[i + 1];
133949b5e25fSSatish Balay         if (jmin < jmax) {
134049b5e25fSSatish Balay           il[i] = jmin;
134149b5e25fSSatish Balay           j     = a->j[jmin];
13429371c9d4SSatish Balay           jl[i] = jl[j];
13439371c9d4SSatish Balay           jl[j] = i;
134449b5e25fSSatish Balay         }
134549b5e25fSSatish Balay         i = nexti;
134649b5e25fSSatish Balay       }
134749b5e25fSSatish Balay       /*-- row sum --*/
134859689ffbSStefano Zampini       jmin = a->i[k];
134959689ffbSStefano Zampini       jmax = a->i[k + 1];
135049b5e25fSSatish Balay       for (i = jmin; i < jmax; i++) {
135149b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
135249b5e25fSSatish Balay           v = a->a + i * bs2 + j;
135349b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13549371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13559371c9d4SSatish Balay             v += bs;
135649b5e25fSSatish Balay           }
135749b5e25fSSatish Balay         }
135849b5e25fSSatish Balay       }
135949b5e25fSSatish Balay       /* add k_th block row to il, jl */
1360831a3094SHong Zhang       col = aj + jmin;
136159689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
136249b5e25fSSatish Balay       if (jmin < jmax) {
136349b5e25fSSatish Balay         il[k] = jmin;
13649371c9d4SSatish Balay         j     = a->j[jmin];
13659371c9d4SSatish Balay         jl[k] = jl[j];
13669371c9d4SSatish Balay         jl[j] = k;
136749b5e25fSSatish Balay       }
136849b5e25fSSatish Balay       for (j = 0; j < bs; j++) {
136949b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
137049b5e25fSSatish Balay       }
137149b5e25fSSatish Balay     }
13729566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum, il, jl));
13739566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1374f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
13753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
137649b5e25fSSatish Balay }
137749b5e25fSSatish Balay 
1378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1379d71ae5a4SJacob Faibussowitsch {
138049b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
138149b5e25fSSatish Balay 
138249b5e25fSSatish Balay   PetscFunctionBegin;
138349b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1384d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1385ef511fbeSHong Zhang     *flg = PETSC_FALSE;
13863ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
138749b5e25fSSatish Balay   }
138849b5e25fSSatish Balay 
138949b5e25fSSatish Balay   /* if the a->i are the same */
13909566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
13913ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
139249b5e25fSSatish Balay 
139349b5e25fSSatish Balay   /* if a->j are the same */
13949566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
13953ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
139626fbe8dcSKarl Rupp 
139749b5e25fSSatish Balay   /* if a->a are the same */
13989566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
13993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
140049b5e25fSSatish Balay }
140149b5e25fSSatish Balay 
1402d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1403d71ae5a4SJacob Faibussowitsch {
140449b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1405d9ca1df4SBarry Smith   PetscInt         i, j, k, row, bs, ambs, bs2;
1406d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
140787828ca2SBarry Smith   PetscScalar     *x, zero = 0.0;
1408d9ca1df4SBarry Smith   const MatScalar *aa, *aa_j;
140949b5e25fSSatish Balay 
141049b5e25fSSatish Balay   PetscFunctionBegin;
1411d0f46423SBarry Smith   bs = A->rmap->bs;
1412aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
141382799104SHong Zhang 
141449b5e25fSSatish Balay   aa   = a->a;
14158a0c6efdSHong Zhang   ambs = a->mbs;
14168a0c6efdSHong Zhang 
14178a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
14188a0c6efdSHong Zhang     PetscInt *diag = a->diag;
14198a0c6efdSHong Zhang     aa             = a->a;
14208a0c6efdSHong Zhang     ambs           = a->mbs;
14219566063dSJacob Faibussowitsch     PetscCall(VecGetArray(v, &x));
14228a0c6efdSHong Zhang     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
14239566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(v, &x));
14243ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14258a0c6efdSHong Zhang   }
14268a0c6efdSHong Zhang 
142749b5e25fSSatish Balay   ai  = a->i;
142849b5e25fSSatish Balay   aj  = a->j;
142949b5e25fSSatish Balay   bs2 = a->bs2;
14309566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
14313ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
14329566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
143349b5e25fSSatish Balay   for (i = 0; i < ambs; i++) {
143449b5e25fSSatish Balay     j = ai[i];
143549b5e25fSSatish Balay     if (aj[j] == i) { /* if this is a diagonal element */
143649b5e25fSSatish Balay       row  = i * bs;
143749b5e25fSSatish Balay       aa_j = aa + j * bs2;
143849b5e25fSSatish Balay       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
143949b5e25fSSatish Balay     }
144049b5e25fSSatish Balay   }
14419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
14423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
144349b5e25fSSatish Balay }
144449b5e25fSSatish Balay 
1445d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1446d71ae5a4SJacob Faibussowitsch {
144749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1448d9ca1df4SBarry Smith   PetscScalar        x;
1449d9ca1df4SBarry Smith   const PetscScalar *l, *li, *ri;
145049b5e25fSSatish Balay   MatScalar         *aa, *v;
1451fff8e43fSBarry Smith   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1452fff8e43fSBarry Smith   const PetscInt    *ai, *aj;
1453ace3abfcSBarry Smith   PetscBool          flg;
145449b5e25fSSatish Balay 
145549b5e25fSSatish Balay   PetscFunctionBegin;
1456b3bf805bSHong Zhang   if (ll != rr) {
14579566063dSJacob Faibussowitsch     PetscCall(VecEqual(ll, rr, &flg));
145828b400f6SJacob Faibussowitsch     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1459b3bf805bSHong Zhang   }
14603ba16761SJacob Faibussowitsch   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
146149b5e25fSSatish Balay   ai  = a->i;
146249b5e25fSSatish Balay   aj  = a->j;
146349b5e25fSSatish Balay   aa  = a->a;
1464d0f46423SBarry Smith   m   = A->rmap->N;
1465d0f46423SBarry Smith   bs  = A->rmap->bs;
146649b5e25fSSatish Balay   mbs = a->mbs;
146749b5e25fSSatish Balay   bs2 = a->bs2;
146849b5e25fSSatish Balay 
14699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll, &l));
14709566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll, &lm));
147108401ef6SPierre Jolivet   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
147249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) { /* for each block row */
147349b5e25fSSatish Balay     M  = ai[i + 1] - ai[i];
147449b5e25fSSatish Balay     li = l + i * bs;
147549b5e25fSSatish Balay     v  = aa + bs2 * ai[i];
147649b5e25fSSatish Balay     for (j = 0; j < M; j++) { /* for each block */
147749b5e25fSSatish Balay       ri = l + bs * aj[ai[i] + j];
14785e90f9d9SHong Zhang       for (k = 0; k < bs; k++) {
147949b5e25fSSatish Balay         x = ri[k];
148049b5e25fSSatish Balay         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
148149b5e25fSSatish Balay       }
148249b5e25fSSatish Balay     }
148349b5e25fSSatish Balay   }
14849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll, &l));
14859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
14863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
148749b5e25fSSatish Balay }
148849b5e25fSSatish Balay 
1489d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1490d71ae5a4SJacob Faibussowitsch {
149149b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
149249b5e25fSSatish Balay 
149349b5e25fSSatish Balay   PetscFunctionBegin;
149449b5e25fSSatish Balay   info->block_size   = a->bs2;
1495ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
14966c6c5352SBarry Smith   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
14973966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
149849b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14998e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
15004dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
1501d5f3da31SBarry Smith   if (A->factortype) {
150249b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
150349b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
150449b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
150549b5e25fSSatish Balay   } else {
150649b5e25fSSatish Balay     info->fill_ratio_given  = 0;
150749b5e25fSSatish Balay     info->fill_ratio_needed = 0;
150849b5e25fSSatish Balay     info->factor_mallocs    = 0;
150949b5e25fSSatish Balay   }
15103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
151149b5e25fSSatish Balay }
151249b5e25fSSatish Balay 
1513d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1514d71ae5a4SJacob Faibussowitsch {
151549b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
151649b5e25fSSatish Balay 
151749b5e25fSSatish Balay   PetscFunctionBegin;
15189566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
15193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
152049b5e25fSSatish Balay }
1521dc354874SHong Zhang 
1522d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1523d71ae5a4SJacob Faibussowitsch {
1524dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1525d9ca1df4SBarry Smith   PetscInt         i, j, n, row, col, bs, mbs;
1526d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
1527c3fca9a7SHong Zhang   PetscReal        atmp;
1528d9ca1df4SBarry Smith   const MatScalar *aa;
1529985db425SBarry Smith   PetscScalar     *x;
153013f74950SBarry Smith   PetscInt         ncols, brow, bcol, krow, kcol;
1531dc354874SHong Zhang 
1532dc354874SHong Zhang   PetscFunctionBegin;
153328b400f6SJacob Faibussowitsch   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
153428b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1535d0f46423SBarry Smith   bs  = A->rmap->bs;
1536dc354874SHong Zhang   aa  = a->a;
1537dc354874SHong Zhang   ai  = a->i;
1538dc354874SHong Zhang   aj  = a->j;
153944117c81SHong Zhang   mbs = a->mbs;
1540dc354874SHong Zhang 
15419566063dSJacob Faibussowitsch   PetscCall(VecSet(v, 0.0));
15429566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
15439566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
154408401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
154544117c81SHong Zhang   for (i = 0; i < mbs; i++) {
15469371c9d4SSatish Balay     ncols = ai[1] - ai[0];
15479371c9d4SSatish Balay     ai++;
1548d0f6400bSHong Zhang     brow = bs * i;
154944117c81SHong Zhang     for (j = 0; j < ncols; j++) {
1550d0f6400bSHong Zhang       bcol = bs * (*aj);
155144117c81SHong Zhang       for (kcol = 0; kcol < bs; kcol++) {
1552d0f6400bSHong Zhang         col = bcol + kcol; /* col index */
155344117c81SHong Zhang         for (krow = 0; krow < bs; krow++) {
15549371c9d4SSatish Balay           atmp = PetscAbsScalar(*aa);
15559371c9d4SSatish Balay           aa++;
1556d0f6400bSHong Zhang           row = brow + krow; /* row index */
1557c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1558c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
155944117c81SHong Zhang         }
156044117c81SHong Zhang       }
1561d0f6400bSHong Zhang       aj++;
1562dc354874SHong Zhang     }
1563dc354874SHong Zhang   }
15649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
15653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1566dc354874SHong Zhang }
1567c2916339SPierre Jolivet 
1568d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1569d71ae5a4SJacob Faibussowitsch {
1570c2916339SPierre Jolivet   PetscFunctionBegin;
15719566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15724222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
15733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1574c2916339SPierre Jolivet }
1575c2916339SPierre Jolivet 
157666976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1577d71ae5a4SJacob Faibussowitsch {
1578c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1579c2916339SPierre Jolivet   PetscScalar       *z = c;
1580c2916339SPierre Jolivet   const PetscScalar *xb;
1581c2916339SPierre Jolivet   PetscScalar        x1;
1582c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1583c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
15843c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
1585b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
15863c2e41e1SStefano Zampini #else
15873c2e41e1SStefano Zampini   const int aconj = 0;
15883c2e41e1SStefano Zampini #endif
1589c2916339SPierre Jolivet 
1590c2916339SPierre Jolivet   PetscFunctionBegin;
1591c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15929371c9d4SSatish Balay     n = ii[1] - ii[0];
15939371c9d4SSatish Balay     ii++;
1594c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1595c2916339SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1596c2916339SPierre Jolivet     jj = idx;
1597c2916339SPierre Jolivet     vv = v;
1598c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1599c2916339SPierre Jolivet       idx = jj;
1600c2916339SPierre Jolivet       v   = vv;
1601c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16029371c9d4SSatish Balay         xb = b + (*idx);
16039371c9d4SSatish Balay         x1 = xb[0 + k * bm];
1604c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1;
16053c2e41e1SStefano Zampini         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1606c2916339SPierre Jolivet         v += 1;
1607c2916339SPierre Jolivet         ++idx;
1608c2916339SPierre Jolivet       }
1609c2916339SPierre Jolivet     }
1610c2916339SPierre Jolivet     z += 1;
1611c2916339SPierre Jolivet   }
16123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1613c2916339SPierre Jolivet }
1614c2916339SPierre Jolivet 
161566976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1616d71ae5a4SJacob Faibussowitsch {
1617c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1618c2916339SPierre Jolivet   PetscScalar       *z = c;
1619c2916339SPierre Jolivet   const PetscScalar *xb;
1620c2916339SPierre Jolivet   PetscScalar        x1, x2;
1621c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1622c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1623c2916339SPierre Jolivet 
1624c2916339SPierre Jolivet   PetscFunctionBegin;
1625c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16269371c9d4SSatish Balay     n = ii[1] - ii[0];
16279371c9d4SSatish Balay     ii++;
1628c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1629c2916339SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1630c2916339SPierre Jolivet     jj = idx;
1631c2916339SPierre Jolivet     vv = v;
1632c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1633c2916339SPierre Jolivet       idx = jj;
1634c2916339SPierre Jolivet       v   = vv;
1635c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16369371c9d4SSatish Balay         xb = b + 2 * (*idx);
16379371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16389371c9d4SSatish Balay         x2 = xb[1 + k * bm];
1639c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1640c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1641c2916339SPierre Jolivet         if (*idx != i) {
1642c2916339SPierre Jolivet           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1643c2916339SPierre Jolivet           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1644c2916339SPierre Jolivet         }
1645c2916339SPierre Jolivet         v += 4;
1646c2916339SPierre Jolivet         ++idx;
1647c2916339SPierre Jolivet       }
1648c2916339SPierre Jolivet     }
1649c2916339SPierre Jolivet     z += 2;
1650c2916339SPierre Jolivet   }
16513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1652c2916339SPierre Jolivet }
1653c2916339SPierre Jolivet 
165466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1655d71ae5a4SJacob Faibussowitsch {
1656c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1657c2916339SPierre Jolivet   PetscScalar       *z = c;
1658c2916339SPierre Jolivet   const PetscScalar *xb;
1659c2916339SPierre Jolivet   PetscScalar        x1, x2, x3;
1660c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1661c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1662c2916339SPierre Jolivet 
1663c2916339SPierre Jolivet   PetscFunctionBegin;
1664c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16659371c9d4SSatish Balay     n = ii[1] - ii[0];
16669371c9d4SSatish Balay     ii++;
1667c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1668c2916339SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1669c2916339SPierre Jolivet     jj = idx;
1670c2916339SPierre Jolivet     vv = v;
1671c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1672c2916339SPierre Jolivet       idx = jj;
1673c2916339SPierre Jolivet       v   = vv;
1674c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16759371c9d4SSatish Balay         xb = b + 3 * (*idx);
16769371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16779371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16789371c9d4SSatish Balay         x3 = xb[2 + k * bm];
1679c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1680c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1681c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1682c2916339SPierre Jolivet         if (*idx != i) {
1683c2916339SPierre Jolivet           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1684c2916339SPierre Jolivet           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1685c2916339SPierre Jolivet           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1686c2916339SPierre Jolivet         }
1687c2916339SPierre Jolivet         v += 9;
1688c2916339SPierre Jolivet         ++idx;
1689c2916339SPierre Jolivet       }
1690c2916339SPierre Jolivet     }
1691c2916339SPierre Jolivet     z += 3;
1692c2916339SPierre Jolivet   }
16933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1694c2916339SPierre Jolivet }
1695c2916339SPierre Jolivet 
169666976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1697d71ae5a4SJacob Faibussowitsch {
1698c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1699c2916339SPierre Jolivet   PetscScalar       *z = c;
1700c2916339SPierre Jolivet   const PetscScalar *xb;
1701c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
1702c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1703c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1704c2916339SPierre Jolivet 
1705c2916339SPierre Jolivet   PetscFunctionBegin;
1706c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
17079371c9d4SSatish Balay     n = ii[1] - ii[0];
17089371c9d4SSatish Balay     ii++;
1709c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1710c2916339SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1711c2916339SPierre Jolivet     jj = idx;
1712c2916339SPierre Jolivet     vv = v;
1713c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1714c2916339SPierre Jolivet       idx = jj;
1715c2916339SPierre Jolivet       v   = vv;
1716c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17179371c9d4SSatish Balay         xb = b + 4 * (*idx);
17189371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17199371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17209371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17219371c9d4SSatish Balay         x4 = xb[3 + k * bm];
1722c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1723c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1724c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1725c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1726c2916339SPierre Jolivet         if (*idx != i) {
1727c2916339SPierre Jolivet           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1728c2916339SPierre Jolivet           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1729c2916339SPierre Jolivet           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1730c2916339SPierre Jolivet           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1731c2916339SPierre Jolivet         }
1732c2916339SPierre Jolivet         v += 16;
1733c2916339SPierre Jolivet         ++idx;
1734c2916339SPierre Jolivet       }
1735c2916339SPierre Jolivet     }
1736c2916339SPierre Jolivet     z += 4;
1737c2916339SPierre Jolivet   }
17383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1739c2916339SPierre Jolivet }
1740c2916339SPierre Jolivet 
174166976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1742d71ae5a4SJacob Faibussowitsch {
1743c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1744c2916339SPierre Jolivet   PetscScalar       *z = c;
1745c2916339SPierre Jolivet   const PetscScalar *xb;
1746c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
1747c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1748c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1749c2916339SPierre Jolivet 
1750c2916339SPierre Jolivet   PetscFunctionBegin;
1751c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
17529371c9d4SSatish Balay     n = ii[1] - ii[0];
17539371c9d4SSatish Balay     ii++;
1754c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1755c2916339SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1756c2916339SPierre Jolivet     jj = idx;
1757c2916339SPierre Jolivet     vv = v;
1758c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1759c2916339SPierre Jolivet       idx = jj;
1760c2916339SPierre Jolivet       v   = vv;
1761c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17629371c9d4SSatish Balay         xb = b + 5 * (*idx);
17639371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17649371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17659371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17669371c9d4SSatish Balay         x4 = xb[3 + k * bm];
17679371c9d4SSatish Balay         x5 = xb[4 + k * cm];
1768c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1769c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1770c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1771c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1772c2916339SPierre Jolivet         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1773c2916339SPierre Jolivet         if (*idx != i) {
1774c2916339SPierre Jolivet           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1775c2916339SPierre Jolivet           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1776c2916339SPierre Jolivet           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1777c2916339SPierre Jolivet           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1778c2916339SPierre Jolivet           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1779c2916339SPierre Jolivet         }
1780c2916339SPierre Jolivet         v += 25;
1781c2916339SPierre Jolivet         ++idx;
1782c2916339SPierre Jolivet       }
1783c2916339SPierre Jolivet     }
1784c2916339SPierre Jolivet     z += 5;
1785c2916339SPierre Jolivet   }
17863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1787c2916339SPierre Jolivet }
1788c2916339SPierre Jolivet 
1789d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1790d71ae5a4SJacob Faibussowitsch {
1791c2916339SPierre Jolivet   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1792c2916339SPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1793281439baSStefano Zampini   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1794c2916339SPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1795c2916339SPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1796c2916339SPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
1797f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
1798c2916339SPierre Jolivet   PetscScalar     *c, *b;
1799c2916339SPierre Jolivet   const MatScalar *v;
1800c2916339SPierre Jolivet   const PetscInt  *idx, *ii;
1801c2916339SPierre Jolivet   PetscScalar      _DOne = 1.0;
1802c2916339SPierre Jolivet 
1803c2916339SPierre Jolivet   PetscFunctionBegin;
18043ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
180508401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
180608401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
180708401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1808c2916339SPierre Jolivet   b = bd->v;
18099566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
18109566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
1811c2916339SPierre Jolivet   switch (bs) {
1812d71ae5a4SJacob Faibussowitsch   case 1:
1813d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1814d71ae5a4SJacob Faibussowitsch     break;
1815d71ae5a4SJacob Faibussowitsch   case 2:
1816d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1817d71ae5a4SJacob Faibussowitsch     break;
1818d71ae5a4SJacob Faibussowitsch   case 3:
1819d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1820d71ae5a4SJacob Faibussowitsch     break;
1821d71ae5a4SJacob Faibussowitsch   case 4:
1822d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1823d71ae5a4SJacob Faibussowitsch     break;
1824d71ae5a4SJacob Faibussowitsch   case 5:
1825d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1826d71ae5a4SJacob Faibussowitsch     break;
1827c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
18289566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
18299566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
18309566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
18319566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
1832c2916339SPierre Jolivet     idx = a->j;
1833c2916339SPierre Jolivet     v   = a->a;
1834c2916339SPierre Jolivet     mbs = a->mbs;
1835c2916339SPierre Jolivet     ii  = a->i;
1836c2916339SPierre Jolivet     z   = c;
1837c2916339SPierre Jolivet     for (i = 0; i < mbs; i++) {
18389371c9d4SSatish Balay       n = ii[1] - ii[0];
18399371c9d4SSatish Balay       ii++;
1840c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
1841792fecdfSBarry Smith         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1842792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1843c2916339SPierre Jolivet         v += bs2;
1844c2916339SPierre Jolivet       }
1845c2916339SPierre Jolivet       z += bs;
1846c2916339SPierre Jolivet     }
1847c2916339SPierre Jolivet   }
18489566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
18499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
18503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1851c2916339SPierre Jolivet }
1852