xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
896e086a2SDaniel Kokron #include <immintrin.h>
996e086a2SDaniel Kokron #endif
1096e086a2SDaniel Kokron 
11*9371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
12a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
135d0c19d7SBarry Smith   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
145d0c19d7SBarry Smith   const PetscInt *idx;
15690b6cddSBarry Smith   PetscInt        start, end, *ai, *aj, bs, *nidx2;
16f1af5d2fSBarry Smith   PetscBT         table;
17a3192f15SSatish Balay 
183a40ed3dSBarry Smith   PetscFunctionBegin;
19a3192f15SSatish Balay   m  = a->mbs;
20a3192f15SSatish Balay   ai = a->i;
21a3192f15SSatish Balay   aj = a->j;
22d0f46423SBarry Smith   bs = A->rmap->bs;
23a3192f15SSatish Balay 
2408401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
25a3192f15SSatish Balay 
269566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(m, &table));
279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &nidx));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
29a3192f15SSatish Balay 
30a3192f15SSatish Balay   for (i = 0; i < is_max; i++) {
31a3192f15SSatish Balay     /* Initialise the two local arrays */
32a3192f15SSatish Balay     isz = 0;
339566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(m, table));
34a3192f15SSatish Balay 
35a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
379566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
38a3192f15SSatish Balay 
39a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40a3192f15SSatish Balay     for (j = 0; j < n; ++j) {
41218c64b6SSatish Balay       ival = idx[j] / bs; /* convert the indices into block indices */
4208401ef6SPierre Jolivet       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
4326fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44a3192f15SSatish Balay     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47a3192f15SSatish Balay 
48a3192f15SSatish Balay     k = 0;
49a3192f15SSatish Balay     for (j = 0; j < ov; j++) { /* for each overlap*/
50a3192f15SSatish Balay       n = isz;
51a3192f15SSatish Balay       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52a3192f15SSatish Balay         row   = nidx[k];
53a3192f15SSatish Balay         start = ai[row];
54a3192f15SSatish Balay         end   = ai[row + 1];
55a3192f15SSatish Balay         for (l = start; l < end; l++) {
56a3192f15SSatish Balay           val = aj[l];
5726fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58a3192f15SSatish Balay         }
59a3192f15SSatish Balay       }
60a3192f15SSatish Balay     }
61218c64b6SSatish Balay     /* expand the Index Set */
62218c64b6SSatish Balay     for (j = 0; j < isz; j++) {
6326fbe8dcSKarl Rupp       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64218c64b6SSatish Balay     }
659566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
66a3192f15SSatish Balay   }
679566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table));
689566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
699566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
703a40ed3dSBarry Smith   PetscFunctionReturn(0);
71a3192f15SSatish Balay }
721c351548SSatish Balay 
73*9371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
74736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
75690b6cddSBarry Smith   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
76690b6cddSBarry Smith   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
775d0c19d7SBarry Smith   const PetscInt *irow, *icol;
785d0c19d7SBarry Smith   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
79690b6cddSBarry Smith   PetscInt       *aj = a->j, *ai = a->i;
803f1db9ecSBarry Smith   MatScalar      *mat_a;
81736121d4SSatish Balay   Mat             C;
826041f1b1SToby Isaac   PetscBool       flag;
83736121d4SSatish Balay 
843a40ed3dSBarry Smith   PetscFunctionBegin;
859566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
869566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
879566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
889566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
89736121d4SSatish Balay 
909566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
91736121d4SSatish Balay   ssmap = smap;
929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
93736121d4SSatish Balay   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94736121d4SSatish Balay   /* determine lens of each row */
95736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
96736121d4SSatish Balay     kstart  = ai[irow[i]];
97736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
98736121d4SSatish Balay     lens[i] = 0;
99736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
10026fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
101736121d4SSatish Balay     }
102736121d4SSatish Balay   }
103736121d4SSatish Balay   /* Create and fill new matrix */
104736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
105736121d4SSatish Balay     c = (Mat_SeqBAIJ *)((*B)->data);
106736121d4SSatish Balay 
107aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1089566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
10928b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1109566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
111736121d4SSatish Balay     C = *B;
1123a40ed3dSBarry Smith   } else {
1139566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1149566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1159566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1169566063dSJacob Faibussowitsch     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
117736121d4SSatish Balay   }
118736121d4SSatish Balay   c = (Mat_SeqBAIJ *)(C->data);
119736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
120736121d4SSatish Balay     row      = irow[i];
121736121d4SSatish Balay     kstart   = ai[row];
122736121d4SSatish Balay     kend     = kstart + a->ilen[row];
123736121d4SSatish Balay     mat_i    = c->i[i];
124d29f2997SMatthew Woehlke     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125d29f2997SMatthew Woehlke     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126736121d4SSatish Balay     mat_ilen = c->ilen + i;
127736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
128736121d4SSatish Balay       if ((tcol = ssmap[a->j[k]])) {
129736121d4SSatish Balay         *mat_j++ = tcol - 1;
1309566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
131549d3d68SSatish Balay         mat_a += bs2;
132736121d4SSatish Balay         (*mat_ilen)++;
133736121d4SSatish Balay       }
134736121d4SSatish Balay     }
135736121d4SSatish Balay   }
136cdc6f3adSToby Isaac   /* sort */
137d29f2997SMatthew Woehlke   if (c->j && c->a) {
138cdc6f3adSToby Isaac     MatScalar *work;
1399566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
140cdc6f3adSToby Isaac     for (i = 0; i < nrows; i++) {
141cdc6f3adSToby Isaac       PetscInt ilen;
142cdc6f3adSToby Isaac       mat_i = c->i[i];
143cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
144cdc6f3adSToby Isaac       mat_a = c->a + mat_i * bs2;
145cdc6f3adSToby Isaac       ilen  = c->ilen[i];
1469566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
147cdc6f3adSToby Isaac     }
1489566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
149cdc6f3adSToby Isaac   }
150218c64b6SSatish Balay 
151736121d4SSatish Balay   /* Free work space */
1529566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1539566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1549566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1559566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1569566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
157736121d4SSatish Balay 
1589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
159736121d4SSatish Balay   *B = C;
1603a40ed3dSBarry Smith   PetscFunctionReturn(0);
161736121d4SSatish Balay }
162736121d4SSatish Balay 
163*9371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
164218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
165218c64b6SSatish Balay   IS              is1, is2;
166afebec48SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
1675d0c19d7SBarry Smith   const PetscInt *irow, *icol;
168218c64b6SSatish Balay 
1693a40ed3dSBarry Smith   PetscFunctionBegin;
1709566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1719566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1729566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1739566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
174218c64b6SSatish Balay 
175218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
176218c64b6SSatish Balay    and form the IS with compressed IS */
177f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs, a->nbs);
1789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
1799566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
180218c64b6SSatish Balay   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
181*9371c9d4SSatish Balay   for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); }
1826041f1b1SToby Isaac   count = 0;
1836041f1b1SToby Isaac   for (i = 0; i < nrows; i++) {
184afebec48SHong Zhang     j = irow[i] / bs;
1856041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
186218c64b6SSatish Balay   }
1879566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
188218c64b6SSatish Balay 
1899566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
190218c64b6SSatish Balay   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
191*9371c9d4SSatish Balay   for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); }
1926041f1b1SToby Isaac   count = 0;
1936041f1b1SToby Isaac   for (i = 0; i < ncols; i++) {
194afebec48SHong Zhang     j = icol[i] / bs;
1956041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
1966041f1b1SToby Isaac   }
1979566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
1989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
1999566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2009566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
201218c64b6SSatish Balay 
2029566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
2039566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2049566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
2053a40ed3dSBarry Smith   PetscFunctionReturn(0);
206218c64b6SSatish Balay }
207218c64b6SSatish Balay 
208*9371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) {
20916b64355SHong Zhang   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
2105c39f6d9SHong Zhang   Mat_SubSppt *submatj = c->submatis1;
21116b64355SHong Zhang 
21216b64355SHong Zhang   PetscFunctionBegin;
2139566063dSJacob Faibussowitsch   PetscCall((*submatj->destroy)(C));
2149566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrix_Private(submatj));
21516b64355SHong Zhang   PetscFunctionReturn(0);
21616b64355SHong Zhang }
21716b64355SHong Zhang 
21889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
219*9371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) {
22086e85357SHong Zhang   PetscInt     i;
22186e85357SHong Zhang   Mat          C;
22286e85357SHong Zhang   Mat_SeqBAIJ *c;
22386e85357SHong Zhang   Mat_SubSppt *submatj;
22486e85357SHong Zhang 
22586e85357SHong Zhang   PetscFunctionBegin;
22686e85357SHong Zhang   for (i = 0; i < n; i++) {
22786e85357SHong Zhang     C       = (*mat)[i];
22886e85357SHong Zhang     c       = (Mat_SeqBAIJ *)C->data;
22986e85357SHong Zhang     submatj = c->submatis1;
23086e85357SHong Zhang     if (submatj) {
2317daefbafSJunchao Zhang       if (--((PetscObject)C)->refct <= 0) {
23226cc229bSBarry Smith         PetscCall(PetscFree(C->factorprefix));
2339566063dSJacob Faibussowitsch         PetscCall((*submatj->destroy)(C));
2349566063dSJacob Faibussowitsch         PetscCall(MatDestroySubMatrix_Private(submatj));
2359566063dSJacob Faibussowitsch         PetscCall(PetscFree(C->defaultvectype));
2369566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->rmap));
2379566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->cmap));
2389566063dSJacob Faibussowitsch         PetscCall(PetscHeaderDestroy(&C));
2397daefbafSJunchao Zhang       }
24086e85357SHong Zhang     } else {
2419566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&C));
24286e85357SHong Zhang     }
24386e85357SHong Zhang   }
2447daefbafSJunchao Zhang 
2457daefbafSJunchao Zhang   /* Destroy Dummy submatrices created for reuse */
2469566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
2477daefbafSJunchao Zhang 
2489566063dSJacob Faibussowitsch   PetscCall(PetscFree(*mat));
24986e85357SHong Zhang   PetscFunctionReturn(0);
25086e85357SHong Zhang }
25186e85357SHong Zhang 
252*9371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
253690b6cddSBarry Smith   PetscInt i;
254736121d4SSatish Balay 
2553a40ed3dSBarry Smith   PetscFunctionBegin;
256*9371c9d4SSatish Balay   if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n + 1, B)); }
257736121d4SSatish Balay 
258*9371c9d4SSatish Balay   for (i = 0; i < n; i++) { PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); }
2593a40ed3dSBarry Smith   PetscFunctionReturn(0);
260736121d4SSatish Balay }
261218c64b6SSatish Balay 
2622d61bbb3SSatish Balay /* -------------------------------------------------------*/
2632d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2642d61bbb3SSatish Balay /* -------------------------------------------------------*/
2652d61bbb3SSatish Balay 
266*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) {
2672d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
268d9fead3dSBarry Smith   PetscScalar       *z, sum;
269d9fead3dSBarry Smith   const PetscScalar *x;
270d9fead3dSBarry Smith   const MatScalar   *v;
2717c565772SBarry Smith   PetscInt           mbs, i, n;
2720298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
273ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2742d61bbb3SSatish Balay 
2752d61bbb3SSatish Balay   PetscFunctionBegin;
2769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2779566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &z));
2782d61bbb3SSatish Balay 
27926e093fcSHong Zhang   if (usecprow) {
28026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
28126e093fcSHong Zhang     ii   = a->compressedrow.i;
2827b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
2839566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(z, a->mbs));
28426e093fcSHong Zhang   } else {
28526e093fcSHong Zhang     mbs = a->mbs;
2862d61bbb3SSatish Balay     ii  = a->i;
28726e093fcSHong Zhang   }
2882d61bbb3SSatish Balay 
2892d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
290ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
291ee54c7eeSHong Zhang     v   = a->a + ii[0];
292ee54c7eeSHong Zhang     idx = a->j + ii[0];
293ee54c7eeSHong Zhang     ii++;
294444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
295444d8c10SJed Brown     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2962d61bbb3SSatish Balay     sum = 0.0;
2972162cab8SBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
29826e093fcSHong Zhang     if (usecprow) {
2997b2bb3b9SHong Zhang       z[ridx[i]] = sum;
30026e093fcSHong Zhang     } else {
3012d61bbb3SSatish Balay       z[i] = sum;
3022d61bbb3SSatish Balay     }
30326e093fcSHong Zhang   }
3049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &z));
3069566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
3072d61bbb3SSatish Balay   PetscFunctionReturn(0);
3082d61bbb3SSatish Balay }
3092d61bbb3SSatish Balay 
310*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) {
3112d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
312f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, *zarray;
313d9fead3dSBarry Smith   const PetscScalar *x, *xb;
31487828ca2SBarry Smith   PetscScalar        x1, x2;
315d9fead3dSBarry Smith   const MatScalar   *v;
3167c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
317ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
3182d61bbb3SSatish Balay 
3192d61bbb3SSatish Balay   PetscFunctionBegin;
3209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3222d61bbb3SSatish Balay 
3232d61bbb3SSatish Balay   idx = a->j;
3242d61bbb3SSatish Balay   v   = a->a;
32526e093fcSHong Zhang   if (usecprow) {
32626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
32726e093fcSHong Zhang     ii   = a->compressedrow.i;
3287b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3299566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
33026e093fcSHong Zhang   } else {
33126e093fcSHong Zhang     mbs = a->mbs;
3322d61bbb3SSatish Balay     ii  = a->i;
33326e093fcSHong Zhang     z   = zarray;
33426e093fcSHong Zhang   }
3352d61bbb3SSatish Balay 
3362d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
337*9371c9d4SSatish Balay     n = ii[1] - ii[0];
338*9371c9d4SSatish Balay     ii++;
339*9371c9d4SSatish Balay     sum1 = 0.0;
340*9371c9d4SSatish Balay     sum2 = 0.0;
341444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
342444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3432d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
344*9371c9d4SSatish Balay       xb = x + 2 * (*idx++);
345*9371c9d4SSatish Balay       x1 = xb[0];
346*9371c9d4SSatish Balay       x2 = xb[1];
3472d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
3482d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
3492d61bbb3SSatish Balay       v += 4;
3502d61bbb3SSatish Balay     }
3517b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2 * ridx[i];
352*9371c9d4SSatish Balay     z[0] = sum1;
353*9371c9d4SSatish Balay     z[1] = sum2;
35426e093fcSHong Zhang     if (!usecprow) z += 2;
3552d61bbb3SSatish Balay   }
3569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
3592d61bbb3SSatish Balay   PetscFunctionReturn(0);
3602d61bbb3SSatish Balay }
3612d61bbb3SSatish Balay 
362*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) {
3632d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
364f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
365d9fead3dSBarry Smith   const PetscScalar *x, *xb;
366d9fead3dSBarry Smith   const MatScalar   *v;
3677c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
368ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
36926e093fcSHong Zhang 
370b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
371fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb)
372fee21e36SBarry Smith #endif
373fee21e36SBarry Smith 
3742d61bbb3SSatish Balay   PetscFunctionBegin;
3759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3772d61bbb3SSatish Balay 
3782d61bbb3SSatish Balay   idx = a->j;
3792d61bbb3SSatish Balay   v   = a->a;
38026e093fcSHong Zhang   if (usecprow) {
38126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
38226e093fcSHong Zhang     ii   = a->compressedrow.i;
3837b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3849566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
38526e093fcSHong Zhang   } else {
38626e093fcSHong Zhang     mbs = a->mbs;
3872d61bbb3SSatish Balay     ii  = a->i;
38826e093fcSHong Zhang     z   = zarray;
38926e093fcSHong Zhang   }
3902d61bbb3SSatish Balay 
3912d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
392*9371c9d4SSatish Balay     n = ii[1] - ii[0];
393*9371c9d4SSatish Balay     ii++;
394*9371c9d4SSatish Balay     sum1 = 0.0;
395*9371c9d4SSatish Balay     sum2 = 0.0;
396*9371c9d4SSatish Balay     sum3 = 0.0;
397444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
398444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3992d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
40026fbe8dcSKarl Rupp       xb = x + 3 * (*idx++);
40126fbe8dcSKarl Rupp       x1 = xb[0];
40226fbe8dcSKarl Rupp       x2 = xb[1];
40326fbe8dcSKarl Rupp       x3 = xb[2];
40426fbe8dcSKarl Rupp 
4052d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
4062d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
4072d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
4082d61bbb3SSatish Balay       v += 9;
4092d61bbb3SSatish Balay     }
4107b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3 * ridx[i];
411*9371c9d4SSatish Balay     z[0] = sum1;
412*9371c9d4SSatish Balay     z[1] = sum2;
413*9371c9d4SSatish Balay     z[2] = sum3;
41426e093fcSHong Zhang     if (!usecprow) z += 3;
4152d61bbb3SSatish Balay   }
4169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
4192d61bbb3SSatish Balay   PetscFunctionReturn(0);
4202d61bbb3SSatish Balay }
4212d61bbb3SSatish Balay 
422*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) {
4232d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
424f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
425d9fead3dSBarry Smith   const PetscScalar *x, *xb;
426d9fead3dSBarry Smith   const MatScalar   *v;
4277c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
428ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4292d61bbb3SSatish Balay 
4302d61bbb3SSatish Balay   PetscFunctionBegin;
4319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4332d61bbb3SSatish Balay 
4342d61bbb3SSatish Balay   idx = a->j;
4352d61bbb3SSatish Balay   v   = a->a;
43626e093fcSHong Zhang   if (usecprow) {
43726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
43826e093fcSHong Zhang     ii   = a->compressedrow.i;
4397b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
4409566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
44126e093fcSHong Zhang   } else {
44226e093fcSHong Zhang     mbs = a->mbs;
4432d61bbb3SSatish Balay     ii  = a->i;
44426e093fcSHong Zhang     z   = zarray;
44526e093fcSHong Zhang   }
4462d61bbb3SSatish Balay 
4472d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
44826fbe8dcSKarl Rupp     n = ii[1] - ii[0];
44926fbe8dcSKarl Rupp     ii++;
45026fbe8dcSKarl Rupp     sum1 = 0.0;
45126fbe8dcSKarl Rupp     sum2 = 0.0;
45226fbe8dcSKarl Rupp     sum3 = 0.0;
45326fbe8dcSKarl Rupp     sum4 = 0.0;
45426fbe8dcSKarl Rupp 
455444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
456444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4572d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
4582d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
459*9371c9d4SSatish Balay       x1 = xb[0];
460*9371c9d4SSatish Balay       x2 = xb[1];
461*9371c9d4SSatish Balay       x3 = xb[2];
462*9371c9d4SSatish Balay       x4 = xb[3];
4632d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
4642d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
4652d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
4662d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
4672d61bbb3SSatish Balay       v += 16;
4682d61bbb3SSatish Balay     }
4697b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4 * ridx[i];
470*9371c9d4SSatish Balay     z[0] = sum1;
471*9371c9d4SSatish Balay     z[1] = sum2;
472*9371c9d4SSatish Balay     z[2] = sum3;
473*9371c9d4SSatish Balay     z[3] = sum4;
47426e093fcSHong Zhang     if (!usecprow) z += 4;
4752d61bbb3SSatish Balay   }
4769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
4792d61bbb3SSatish Balay   PetscFunctionReturn(0);
4802d61bbb3SSatish Balay }
4812d61bbb3SSatish Balay 
482*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) {
4832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
484f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
485d9fead3dSBarry Smith   const PetscScalar *xb, *x;
486d9fead3dSBarry Smith   const MatScalar   *v;
4870298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
4887c565772SBarry Smith   PetscInt           mbs, i, j, n;
489ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4902d61bbb3SSatish Balay 
491433994e6SBarry Smith   PetscFunctionBegin;
4929566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4942d61bbb3SSatish Balay 
4952d61bbb3SSatish Balay   idx = a->j;
4962d61bbb3SSatish Balay   v   = a->a;
49726e093fcSHong Zhang   if (usecprow) {
49826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
49926e093fcSHong Zhang     ii   = a->compressedrow.i;
5007b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5019566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
50226e093fcSHong Zhang   } else {
50326e093fcSHong Zhang     mbs = a->mbs;
5042d61bbb3SSatish Balay     ii  = a->i;
50526e093fcSHong Zhang     z   = zarray;
50626e093fcSHong Zhang   }
5072d61bbb3SSatish Balay 
5082d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
509*9371c9d4SSatish Balay     n = ii[1] - ii[0];
510*9371c9d4SSatish Balay     ii++;
511*9371c9d4SSatish Balay     sum1 = 0.0;
512*9371c9d4SSatish Balay     sum2 = 0.0;
513*9371c9d4SSatish Balay     sum3 = 0.0;
514*9371c9d4SSatish Balay     sum4 = 0.0;
515*9371c9d4SSatish Balay     sum5 = 0.0;
516444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
517444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
5182d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
5192d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
520*9371c9d4SSatish Balay       x1 = xb[0];
521*9371c9d4SSatish Balay       x2 = xb[1];
522*9371c9d4SSatish Balay       x3 = xb[2];
523*9371c9d4SSatish Balay       x4 = xb[3];
524*9371c9d4SSatish Balay       x5 = xb[4];
5252d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
5262d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
5272d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
5282d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
5292d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
5302d61bbb3SSatish Balay       v += 25;
5312d61bbb3SSatish Balay     }
5327b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5 * ridx[i];
533*9371c9d4SSatish Balay     z[0] = sum1;
534*9371c9d4SSatish Balay     z[1] = sum2;
535*9371c9d4SSatish Balay     z[2] = sum3;
536*9371c9d4SSatish Balay     z[3] = sum4;
537*9371c9d4SSatish Balay     z[4] = sum5;
53826e093fcSHong Zhang     if (!usecprow) z += 5;
5392d61bbb3SSatish Balay   }
5409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
5429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
5432d61bbb3SSatish Balay   PetscFunctionReturn(0);
5442d61bbb3SSatish Balay }
5452d61bbb3SSatish Balay 
546*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) {
54715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
548f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
549d9fead3dSBarry Smith   const PetscScalar *x, *xb;
55026e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
551d9fead3dSBarry Smith   const MatScalar   *v;
5527c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
553ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
55415091d37SBarry Smith 
555433994e6SBarry Smith   PetscFunctionBegin;
5569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
55815091d37SBarry Smith 
55915091d37SBarry Smith   idx = a->j;
56015091d37SBarry Smith   v   = a->a;
56126e093fcSHong Zhang   if (usecprow) {
56226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
56326e093fcSHong Zhang     ii   = a->compressedrow.i;
5647b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5659566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
56626e093fcSHong Zhang   } else {
56726e093fcSHong Zhang     mbs = a->mbs;
56815091d37SBarry Smith     ii  = a->i;
56926e093fcSHong Zhang     z   = zarray;
57026e093fcSHong Zhang   }
57115091d37SBarry Smith 
57215091d37SBarry Smith   for (i = 0; i < mbs; i++) {
57326fbe8dcSKarl Rupp     n = ii[1] - ii[0];
57426fbe8dcSKarl Rupp     ii++;
57526fbe8dcSKarl Rupp     sum1 = 0.0;
57626fbe8dcSKarl Rupp     sum2 = 0.0;
57726fbe8dcSKarl Rupp     sum3 = 0.0;
57826fbe8dcSKarl Rupp     sum4 = 0.0;
57926fbe8dcSKarl Rupp     sum5 = 0.0;
58026fbe8dcSKarl Rupp     sum6 = 0.0;
58126fbe8dcSKarl Rupp 
582444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
583444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
58415091d37SBarry Smith     for (j = 0; j < n; j++) {
58515091d37SBarry Smith       xb = x + 6 * (*idx++);
586*9371c9d4SSatish Balay       x1 = xb[0];
587*9371c9d4SSatish Balay       x2 = xb[1];
588*9371c9d4SSatish Balay       x3 = xb[2];
589*9371c9d4SSatish Balay       x4 = xb[3];
590*9371c9d4SSatish Balay       x5 = xb[4];
591*9371c9d4SSatish Balay       x6 = xb[5];
59215091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
59315091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
59415091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
59515091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
59615091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
59715091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
59815091d37SBarry Smith       v += 36;
59915091d37SBarry Smith     }
6007b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6 * ridx[i];
601*9371c9d4SSatish Balay     z[0] = sum1;
602*9371c9d4SSatish Balay     z[1] = sum2;
603*9371c9d4SSatish Balay     z[2] = sum3;
604*9371c9d4SSatish Balay     z[3] = sum4;
605*9371c9d4SSatish Balay     z[4] = sum5;
606*9371c9d4SSatish Balay     z[5] = sum6;
60726e093fcSHong Zhang     if (!usecprow) z += 6;
60815091d37SBarry Smith   }
60915091d37SBarry Smith 
6109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
61315091d37SBarry Smith   PetscFunctionReturn(0);
61415091d37SBarry Smith }
6158ab949d8SShri Abhyankar 
616*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) {
6172d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
618f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
619d9fead3dSBarry Smith   const PetscScalar *x, *xb;
62026e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
621d9fead3dSBarry Smith   const MatScalar   *v;
6227c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
623ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
6242d61bbb3SSatish Balay 
625433994e6SBarry Smith   PetscFunctionBegin;
6269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
6282d61bbb3SSatish Balay 
6292d61bbb3SSatish Balay   idx = a->j;
6302d61bbb3SSatish Balay   v   = a->a;
63126e093fcSHong Zhang   if (usecprow) {
63226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
63326e093fcSHong Zhang     ii   = a->compressedrow.i;
6347b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
6359566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
63626e093fcSHong Zhang   } else {
63726e093fcSHong Zhang     mbs = a->mbs;
6382d61bbb3SSatish Balay     ii  = a->i;
63926e093fcSHong Zhang     z   = zarray;
64026e093fcSHong Zhang   }
6412d61bbb3SSatish Balay 
6422d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
64326fbe8dcSKarl Rupp     n = ii[1] - ii[0];
64426fbe8dcSKarl Rupp     ii++;
64526fbe8dcSKarl Rupp     sum1 = 0.0;
64626fbe8dcSKarl Rupp     sum2 = 0.0;
64726fbe8dcSKarl Rupp     sum3 = 0.0;
64826fbe8dcSKarl Rupp     sum4 = 0.0;
64926fbe8dcSKarl Rupp     sum5 = 0.0;
65026fbe8dcSKarl Rupp     sum6 = 0.0;
65126fbe8dcSKarl Rupp     sum7 = 0.0;
65226fbe8dcSKarl Rupp 
653444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
654444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6552d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
6562d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
657*9371c9d4SSatish Balay       x1 = xb[0];
658*9371c9d4SSatish Balay       x2 = xb[1];
659*9371c9d4SSatish Balay       x3 = xb[2];
660*9371c9d4SSatish Balay       x4 = xb[3];
661*9371c9d4SSatish Balay       x5 = xb[4];
662*9371c9d4SSatish Balay       x6 = xb[5];
663*9371c9d4SSatish Balay       x7 = xb[6];
6642d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
6652d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
6662d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
6672d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
6682d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
6692d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
6702d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
6712d61bbb3SSatish Balay       v += 49;
6722d61bbb3SSatish Balay     }
6737b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7 * ridx[i];
674*9371c9d4SSatish Balay     z[0] = sum1;
675*9371c9d4SSatish Balay     z[1] = sum2;
676*9371c9d4SSatish Balay     z[2] = sum3;
677*9371c9d4SSatish Balay     z[3] = sum4;
678*9371c9d4SSatish Balay     z[4] = sum5;
679*9371c9d4SSatish Balay     z[5] = sum6;
680*9371c9d4SSatish Balay     z[6] = sum7;
68126e093fcSHong Zhang     if (!usecprow) z += 7;
6822d61bbb3SSatish Balay   }
6832d61bbb3SSatish Balay 
6849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
6872d61bbb3SSatish Balay   PetscFunctionReturn(0);
6882d61bbb3SSatish Balay }
6892d61bbb3SSatish Balay 
6905f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
691*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) {
69296e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
693f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
69496e086a2SDaniel Kokron   const PetscScalar *x, *xb;
69596e086a2SDaniel Kokron   const MatScalar   *v;
69696e086a2SDaniel Kokron   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
69796e086a2SDaniel Kokron   const PetscInt    *idx, *ii, *ridx = NULL;
698ce68d72fSJed Brown   PetscInt           k;
69996e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
70096e086a2SDaniel Kokron 
70196e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
702ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
70396e086a2SDaniel Kokron   __m256d z0, z1, z2;
70496e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
70596e086a2SDaniel Kokron 
70696e086a2SDaniel Kokron   PetscFunctionBegin;
7079566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7089566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
70996e086a2SDaniel Kokron 
71096e086a2SDaniel Kokron   idx = a->j;
71196e086a2SDaniel Kokron   v   = a->a;
71296e086a2SDaniel Kokron   if (usecprow) {
71396e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
71496e086a2SDaniel Kokron     ii   = a->compressedrow.i;
71596e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
7169566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
71796e086a2SDaniel Kokron   } else {
71896e086a2SDaniel Kokron     mbs = a->mbs;
71996e086a2SDaniel Kokron     ii  = a->i;
72096e086a2SDaniel Kokron     z   = zarray;
72196e086a2SDaniel Kokron   }
72296e086a2SDaniel Kokron 
72396e086a2SDaniel Kokron   if (!a->mult_work) {
72496e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
7259566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
72696e086a2SDaniel Kokron   }
72796e086a2SDaniel Kokron 
72896e086a2SDaniel Kokron   work = a->mult_work;
72996e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
730*9371c9d4SSatish Balay     n = ii[1] - ii[0];
731*9371c9d4SSatish Balay     ii++;
73296e086a2SDaniel Kokron     workt = work;
73396e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
73496e086a2SDaniel Kokron       xb = x + bs * (*idx++);
73596e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
73696e086a2SDaniel Kokron       workt += bs;
73796e086a2SDaniel Kokron     }
73896e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
73996e086a2SDaniel Kokron 
740*9371c9d4SSatish Balay     z0 = _mm256_setzero_pd();
741*9371c9d4SSatish Balay     z1 = _mm256_setzero_pd();
742*9371c9d4SSatish Balay     z2 = _mm256_setzero_pd();
74396e086a2SDaniel Kokron 
74496e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
745c05b70c4SSatish Balay       /* first column of a */
74696e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
747*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
748*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
749*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
750*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
751*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
752*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
75396e086a2SDaniel Kokron 
754c05b70c4SSatish Balay       /* second column of a */
75596e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
756*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
757*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
758*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
759*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
760*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
761*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
76296e086a2SDaniel Kokron 
763c05b70c4SSatish Balay       /* third column of a */
76496e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
765*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
766*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
767*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
768*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
769*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
770*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
77196e086a2SDaniel Kokron 
772c05b70c4SSatish Balay       /* fourth column of a */
77396e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
774*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
775*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
776*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
777*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
778*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
779*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
78096e086a2SDaniel Kokron 
781c05b70c4SSatish Balay       /* fifth column of a */
78296e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
783*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
784*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
785*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
786*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
787*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
788*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
78996e086a2SDaniel Kokron 
790c05b70c4SSatish Balay       /* sixth column of a */
79196e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
792*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
793*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
794*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
795*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
796*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
797*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
79896e086a2SDaniel Kokron 
799c05b70c4SSatish Balay       /* seventh column of a */
80096e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
801*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
802*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
803*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
804*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
805*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
806*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
80796e086a2SDaniel Kokron 
8086aad120cSJose E. Roman       /* eighth column of a */
80996e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
810*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
811*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
812*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
813*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
814*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
815*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
81696e086a2SDaniel Kokron 
817c05b70c4SSatish Balay       /* ninth column of a */
81896e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
819*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
820*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
821*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
822*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
823*9371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
824*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
82596e086a2SDaniel Kokron     }
82696e086a2SDaniel Kokron 
827*9371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
828*9371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
829*9371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
83096e086a2SDaniel Kokron 
83196e086a2SDaniel Kokron     v += n * bs2;
83296e086a2SDaniel Kokron     if (!usecprow) z += bs;
83396e086a2SDaniel Kokron   }
8349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
8369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
83796e086a2SDaniel Kokron   PetscFunctionReturn(0);
83896e086a2SDaniel Kokron }
83996e086a2SDaniel Kokron #endif
84096e086a2SDaniel Kokron 
841*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) {
842ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
843f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
844ebada01fSBarry Smith   const PetscScalar *x, *xb;
845ebada01fSBarry Smith   PetscScalar       *zarray, xv;
846ebada01fSBarry Smith   const MatScalar   *v;
847ebada01fSBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
848ebada01fSBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
849ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
850ebada01fSBarry Smith 
851ebada01fSBarry Smith   PetscFunctionBegin;
8529566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
854ebada01fSBarry Smith 
855ebada01fSBarry Smith   v = a->a;
856ebada01fSBarry Smith   if (usecprow) {
857ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
858ebada01fSBarry Smith     ii   = a->compressedrow.i;
859ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
8609566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
861ebada01fSBarry Smith   } else {
862ebada01fSBarry Smith     mbs = a->mbs;
863ebada01fSBarry Smith     ii  = a->i;
864ebada01fSBarry Smith     z   = zarray;
865ebada01fSBarry Smith   }
866ebada01fSBarry Smith 
867ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
868ebada01fSBarry Smith     n     = ii[i + 1] - ii[i];
869ebada01fSBarry Smith     idx   = ij + ii[i];
870*9371c9d4SSatish Balay     sum1  = 0.0;
871*9371c9d4SSatish Balay     sum2  = 0.0;
872*9371c9d4SSatish Balay     sum3  = 0.0;
873*9371c9d4SSatish Balay     sum4  = 0.0;
874*9371c9d4SSatish Balay     sum5  = 0.0;
875*9371c9d4SSatish Balay     sum6  = 0.0;
876*9371c9d4SSatish Balay     sum7  = 0.0;
877*9371c9d4SSatish Balay     sum8  = 0.0;
878*9371c9d4SSatish Balay     sum9  = 0.0;
879*9371c9d4SSatish Balay     sum10 = 0.0;
880*9371c9d4SSatish Balay     sum11 = 0.0;
881ebada01fSBarry Smith 
882ebada01fSBarry Smith     for (j = 0; j < n; j++) {
883ebada01fSBarry Smith       xb = x + 11 * (idx[j]);
884ebada01fSBarry Smith 
885ebada01fSBarry Smith       for (k = 0; k < 11; k++) {
886ebada01fSBarry Smith         xv = xb[k];
887ebada01fSBarry Smith         sum1 += v[0] * xv;
888ebada01fSBarry Smith         sum2 += v[1] * xv;
889ebada01fSBarry Smith         sum3 += v[2] * xv;
890ebada01fSBarry Smith         sum4 += v[3] * xv;
891ebada01fSBarry Smith         sum5 += v[4] * xv;
892ebada01fSBarry Smith         sum6 += v[5] * xv;
893ebada01fSBarry Smith         sum7 += v[6] * xv;
894ebada01fSBarry Smith         sum8 += v[7] * xv;
895ebada01fSBarry Smith         sum9 += v[8] * xv;
896ebada01fSBarry Smith         sum10 += v[9] * xv;
897ebada01fSBarry Smith         sum11 += v[10] * xv;
898ebada01fSBarry Smith         v += 11;
899ebada01fSBarry Smith       }
900ebada01fSBarry Smith     }
901ebada01fSBarry Smith     if (usecprow) z = zarray + 11 * ridx[i];
902*9371c9d4SSatish Balay     z[0]  = sum1;
903*9371c9d4SSatish Balay     z[1]  = sum2;
904*9371c9d4SSatish Balay     z[2]  = sum3;
905*9371c9d4SSatish Balay     z[3]  = sum4;
906*9371c9d4SSatish Balay     z[4]  = sum5;
907*9371c9d4SSatish Balay     z[5]  = sum6;
908*9371c9d4SSatish Balay     z[6]  = sum7;
909*9371c9d4SSatish Balay     z[7]  = sum8;
910*9371c9d4SSatish Balay     z[8]  = sum9;
911*9371c9d4SSatish Balay     z[9]  = sum10;
912*9371c9d4SSatish Balay     z[10] = sum11;
913ebada01fSBarry Smith 
914ebada01fSBarry Smith     if (!usecprow) z += 11;
915ebada01fSBarry Smith   }
916ebada01fSBarry Smith 
9179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
9199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
920ebada01fSBarry Smith   PetscFunctionReturn(0);
921ebada01fSBarry Smith }
922ebada01fSBarry Smith 
9236679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
924*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) {
9256679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
9266679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
9276679dcc1SBarry Smith   const PetscScalar *x, *xb;
9286679dcc1SBarry Smith   PetscScalar       *zarray, xv;
9296679dcc1SBarry Smith   const MatScalar   *v;
9306679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
9316679dcc1SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
9326679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
9336679dcc1SBarry Smith 
9346679dcc1SBarry Smith   PetscFunctionBegin;
9359566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
9376679dcc1SBarry Smith 
9386679dcc1SBarry Smith   v = a->a;
9396679dcc1SBarry Smith   if (usecprow) {
9406679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
9416679dcc1SBarry Smith     ii   = a->compressedrow.i;
9426679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
9439566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
9446679dcc1SBarry Smith   } else {
9456679dcc1SBarry Smith     mbs = a->mbs;
9466679dcc1SBarry Smith     ii  = a->i;
9476679dcc1SBarry Smith     z   = zarray;
9486679dcc1SBarry Smith   }
9496679dcc1SBarry Smith 
9506679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
9516679dcc1SBarry Smith     n     = ii[i + 1] - ii[i];
9526679dcc1SBarry Smith     idx   = ij + ii[i];
953*9371c9d4SSatish Balay     sum1  = 0.0;
954*9371c9d4SSatish Balay     sum2  = 0.0;
955*9371c9d4SSatish Balay     sum3  = 0.0;
956*9371c9d4SSatish Balay     sum4  = 0.0;
957*9371c9d4SSatish Balay     sum5  = 0.0;
958*9371c9d4SSatish Balay     sum6  = 0.0;
959*9371c9d4SSatish Balay     sum7  = 0.0;
960*9371c9d4SSatish Balay     sum8  = 0.0;
961*9371c9d4SSatish Balay     sum9  = 0.0;
962*9371c9d4SSatish Balay     sum10 = 0.0;
963*9371c9d4SSatish Balay     sum11 = 0.0;
964*9371c9d4SSatish Balay     sum12 = 0.0;
9656679dcc1SBarry Smith 
9666679dcc1SBarry Smith     for (j = 0; j < n; j++) {
9676679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
9686679dcc1SBarry Smith 
9696679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
9706679dcc1SBarry Smith         xv = xb[k];
9716679dcc1SBarry Smith         sum1 += v[0] * xv;
9726679dcc1SBarry Smith         sum2 += v[1] * xv;
9736679dcc1SBarry Smith         sum3 += v[2] * xv;
9746679dcc1SBarry Smith         sum4 += v[3] * xv;
9756679dcc1SBarry Smith         sum5 += v[4] * xv;
9766679dcc1SBarry Smith         sum6 += v[5] * xv;
9776679dcc1SBarry Smith         sum7 += v[6] * xv;
9786679dcc1SBarry Smith         sum8 += v[7] * xv;
9796679dcc1SBarry Smith         sum9 += v[8] * xv;
9806679dcc1SBarry Smith         sum10 += v[9] * xv;
9816679dcc1SBarry Smith         sum11 += v[10] * xv;
9826679dcc1SBarry Smith         sum12 += v[11] * xv;
9836679dcc1SBarry Smith         v += 12;
9846679dcc1SBarry Smith       }
9856679dcc1SBarry Smith     }
9866679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
987*9371c9d4SSatish Balay     z[0]  = sum1;
988*9371c9d4SSatish Balay     z[1]  = sum2;
989*9371c9d4SSatish Balay     z[2]  = sum3;
990*9371c9d4SSatish Balay     z[3]  = sum4;
991*9371c9d4SSatish Balay     z[4]  = sum5;
992*9371c9d4SSatish Balay     z[5]  = sum6;
993*9371c9d4SSatish Balay     z[6]  = sum7;
994*9371c9d4SSatish Balay     z[7]  = sum8;
995*9371c9d4SSatish Balay     z[8]  = sum9;
996*9371c9d4SSatish Balay     z[9]  = sum10;
997*9371c9d4SSatish Balay     z[10] = sum11;
998*9371c9d4SSatish Balay     z[11] = sum12;
9996679dcc1SBarry Smith     if (!usecprow) z += 12;
10006679dcc1SBarry Smith   }
10019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
10039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10046679dcc1SBarry Smith   PetscFunctionReturn(0);
10056679dcc1SBarry Smith }
10066679dcc1SBarry Smith 
1007*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) {
10086679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
10096679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
10106679dcc1SBarry Smith   const PetscScalar *x, *xb;
10116679dcc1SBarry Smith   PetscScalar       *zarray, *yarray, xv;
10126679dcc1SBarry Smith   const MatScalar   *v;
10136679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
10146679dcc1SBarry Smith   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
10156679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
10166679dcc1SBarry Smith 
10176679dcc1SBarry Smith   PetscFunctionBegin;
10189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
10206679dcc1SBarry Smith 
10216679dcc1SBarry Smith   v = a->a;
10226679dcc1SBarry Smith   if (usecprow) {
1023*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); }
10246679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
10256679dcc1SBarry Smith     ii   = a->compressedrow.i;
10266679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
10276679dcc1SBarry Smith   } else {
10286679dcc1SBarry Smith     ii = a->i;
10296679dcc1SBarry Smith     y  = yarray;
10306679dcc1SBarry Smith     z  = zarray;
10316679dcc1SBarry Smith   }
10326679dcc1SBarry Smith 
10336679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
10346679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
10356679dcc1SBarry Smith     idx = ij + ii[i];
10366679dcc1SBarry Smith 
10376679dcc1SBarry Smith     if (usecprow) {
10386679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
10396679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
10406679dcc1SBarry Smith     }
1041*9371c9d4SSatish Balay     sum1  = y[0];
1042*9371c9d4SSatish Balay     sum2  = y[1];
1043*9371c9d4SSatish Balay     sum3  = y[2];
1044*9371c9d4SSatish Balay     sum4  = y[3];
1045*9371c9d4SSatish Balay     sum5  = y[4];
1046*9371c9d4SSatish Balay     sum6  = y[5];
1047*9371c9d4SSatish Balay     sum7  = y[6];
1048*9371c9d4SSatish Balay     sum8  = y[7];
1049*9371c9d4SSatish Balay     sum9  = y[8];
1050*9371c9d4SSatish Balay     sum10 = y[9];
1051*9371c9d4SSatish Balay     sum11 = y[10];
1052*9371c9d4SSatish Balay     sum12 = y[11];
10536679dcc1SBarry Smith 
10546679dcc1SBarry Smith     for (j = 0; j < n; j++) {
10556679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
10566679dcc1SBarry Smith 
10576679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
10586679dcc1SBarry Smith         xv = xb[k];
10596679dcc1SBarry Smith         sum1 += v[0] * xv;
10606679dcc1SBarry Smith         sum2 += v[1] * xv;
10616679dcc1SBarry Smith         sum3 += v[2] * xv;
10626679dcc1SBarry Smith         sum4 += v[3] * xv;
10636679dcc1SBarry Smith         sum5 += v[4] * xv;
10646679dcc1SBarry Smith         sum6 += v[5] * xv;
10656679dcc1SBarry Smith         sum7 += v[6] * xv;
10666679dcc1SBarry Smith         sum8 += v[7] * xv;
10676679dcc1SBarry Smith         sum9 += v[8] * xv;
10686679dcc1SBarry Smith         sum10 += v[9] * xv;
10696679dcc1SBarry Smith         sum11 += v[10] * xv;
10706679dcc1SBarry Smith         sum12 += v[11] * xv;
10716679dcc1SBarry Smith         v += 12;
10726679dcc1SBarry Smith       }
10736679dcc1SBarry Smith     }
10746679dcc1SBarry Smith 
1075*9371c9d4SSatish Balay     z[0]  = sum1;
1076*9371c9d4SSatish Balay     z[1]  = sum2;
1077*9371c9d4SSatish Balay     z[2]  = sum3;
1078*9371c9d4SSatish Balay     z[3]  = sum4;
1079*9371c9d4SSatish Balay     z[4]  = sum5;
1080*9371c9d4SSatish Balay     z[5]  = sum6;
1081*9371c9d4SSatish Balay     z[6]  = sum7;
1082*9371c9d4SSatish Balay     z[7]  = sum8;
1083*9371c9d4SSatish Balay     z[8]  = sum9;
1084*9371c9d4SSatish Balay     z[9]  = sum10;
1085*9371c9d4SSatish Balay     z[10] = sum11;
1086*9371c9d4SSatish Balay     z[11] = sum12;
10876679dcc1SBarry Smith     if (!usecprow) {
10886679dcc1SBarry Smith       y += 12;
10896679dcc1SBarry Smith       z += 12;
10906679dcc1SBarry Smith     }
10916679dcc1SBarry Smith   }
10929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
10949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10956679dcc1SBarry Smith   PetscFunctionReturn(0);
10966679dcc1SBarry Smith }
10976679dcc1SBarry Smith 
10986679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1099*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) {
11006679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
11016679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
11026679dcc1SBarry Smith   const PetscScalar *x, *xb;
11036679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray;
11046679dcc1SBarry Smith   const MatScalar   *v;
11056679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
11066679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
11076679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
11086679dcc1SBarry Smith 
11096679dcc1SBarry Smith   PetscFunctionBegin;
11109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
11126679dcc1SBarry Smith 
11136679dcc1SBarry Smith   v = a->a;
11146679dcc1SBarry Smith   if (usecprow) {
11156679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
11166679dcc1SBarry Smith     ii   = a->compressedrow.i;
11176679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
11189566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
11196679dcc1SBarry Smith   } else {
11206679dcc1SBarry Smith     mbs = a->mbs;
11216679dcc1SBarry Smith     ii  = a->i;
11226679dcc1SBarry Smith     z   = zarray;
11236679dcc1SBarry Smith   }
11246679dcc1SBarry Smith 
11256679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
11266679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
11276679dcc1SBarry Smith     idx = ij + ii[i];
11286679dcc1SBarry Smith 
11296679dcc1SBarry Smith     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
11306679dcc1SBarry Smith     for (j = 0; j < n; j++) {
11316679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
1132*9371c9d4SSatish Balay       x1 = xb[0];
1133*9371c9d4SSatish Balay       x2 = xb[1];
1134*9371c9d4SSatish Balay       x3 = xb[2];
1135*9371c9d4SSatish Balay       x4 = xb[3];
11366679dcc1SBarry Smith 
11376679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11386679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11396679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11406679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11416679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11426679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11436679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11446679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11456679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11466679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11476679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11486679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11496679dcc1SBarry Smith       v += 48;
11506679dcc1SBarry Smith 
1151*9371c9d4SSatish Balay       x1 = xb[4];
1152*9371c9d4SSatish Balay       x2 = xb[5];
1153*9371c9d4SSatish Balay       x3 = xb[6];
1154*9371c9d4SSatish Balay       x4 = xb[7];
11556679dcc1SBarry Smith 
11566679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11576679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11586679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11596679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11606679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11616679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11626679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11636679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11646679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11656679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11666679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11676679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11686679dcc1SBarry Smith       v += 48;
11696679dcc1SBarry Smith 
1170*9371c9d4SSatish Balay       x1 = xb[8];
1171*9371c9d4SSatish Balay       x2 = xb[9];
1172*9371c9d4SSatish Balay       x3 = xb[10];
1173*9371c9d4SSatish Balay       x4 = xb[11];
11746679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11756679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11766679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11776679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11786679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11796679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11806679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11816679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11826679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11836679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11846679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11856679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11866679dcc1SBarry Smith       v += 48;
11876679dcc1SBarry Smith     }
11886679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
1189*9371c9d4SSatish Balay     z[0]  = sum1;
1190*9371c9d4SSatish Balay     z[1]  = sum2;
1191*9371c9d4SSatish Balay     z[2]  = sum3;
1192*9371c9d4SSatish Balay     z[3]  = sum4;
1193*9371c9d4SSatish Balay     z[4]  = sum5;
1194*9371c9d4SSatish Balay     z[5]  = sum6;
1195*9371c9d4SSatish Balay     z[6]  = sum7;
1196*9371c9d4SSatish Balay     z[7]  = sum8;
1197*9371c9d4SSatish Balay     z[8]  = sum9;
1198*9371c9d4SSatish Balay     z[9]  = sum10;
1199*9371c9d4SSatish Balay     z[10] = sum11;
1200*9371c9d4SSatish Balay     z[11] = sum12;
12016679dcc1SBarry Smith     if (!usecprow) z += 12;
12026679dcc1SBarry Smith   }
12039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
12059566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
12066679dcc1SBarry Smith   PetscFunctionReturn(0);
12076679dcc1SBarry Smith }
12086679dcc1SBarry Smith 
12096679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1210*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) {
12116679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
12126679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
12136679dcc1SBarry Smith   const PetscScalar *x, *xb;
12146679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
12156679dcc1SBarry Smith   const MatScalar   *v;
12166679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
12176679dcc1SBarry Smith   PetscInt           mbs      = a->mbs, i, j, n;
12186679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
12196679dcc1SBarry Smith 
12206679dcc1SBarry Smith   PetscFunctionBegin;
12219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
12229566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
12236679dcc1SBarry Smith 
12246679dcc1SBarry Smith   v = a->a;
12256679dcc1SBarry Smith   if (usecprow) {
1226*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); }
12276679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
12286679dcc1SBarry Smith     ii   = a->compressedrow.i;
12296679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
12306679dcc1SBarry Smith   } else {
12316679dcc1SBarry Smith     ii = a->i;
12326679dcc1SBarry Smith     y  = yarray;
12336679dcc1SBarry Smith     z  = zarray;
12346679dcc1SBarry Smith   }
12356679dcc1SBarry Smith 
12366679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
12376679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
12386679dcc1SBarry Smith     idx = ij + ii[i];
12396679dcc1SBarry Smith 
12406679dcc1SBarry Smith     if (usecprow) {
12416679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
12426679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
12436679dcc1SBarry Smith     }
1244*9371c9d4SSatish Balay     sum1  = y[0];
1245*9371c9d4SSatish Balay     sum2  = y[1];
1246*9371c9d4SSatish Balay     sum3  = y[2];
1247*9371c9d4SSatish Balay     sum4  = y[3];
1248*9371c9d4SSatish Balay     sum5  = y[4];
1249*9371c9d4SSatish Balay     sum6  = y[5];
1250*9371c9d4SSatish Balay     sum7  = y[6];
1251*9371c9d4SSatish Balay     sum8  = y[7];
1252*9371c9d4SSatish Balay     sum9  = y[8];
1253*9371c9d4SSatish Balay     sum10 = y[9];
1254*9371c9d4SSatish Balay     sum11 = y[10];
1255*9371c9d4SSatish Balay     sum12 = y[11];
12566679dcc1SBarry Smith 
12576679dcc1SBarry Smith     for (j = 0; j < n; j++) {
12586679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
1259*9371c9d4SSatish Balay       x1 = xb[0];
1260*9371c9d4SSatish Balay       x2 = xb[1];
1261*9371c9d4SSatish Balay       x3 = xb[2];
1262*9371c9d4SSatish Balay       x4 = xb[3];
12636679dcc1SBarry Smith 
12646679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12656679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12666679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12676679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12686679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12696679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12706679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12716679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12726679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12736679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12746679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12756679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12766679dcc1SBarry Smith       v += 48;
12776679dcc1SBarry Smith 
1278*9371c9d4SSatish Balay       x1 = xb[4];
1279*9371c9d4SSatish Balay       x2 = xb[5];
1280*9371c9d4SSatish Balay       x3 = xb[6];
1281*9371c9d4SSatish Balay       x4 = xb[7];
12826679dcc1SBarry Smith 
12836679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12846679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12856679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12866679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12876679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12886679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12896679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12906679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12916679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12926679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12936679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12946679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12956679dcc1SBarry Smith       v += 48;
12966679dcc1SBarry Smith 
1297*9371c9d4SSatish Balay       x1 = xb[8];
1298*9371c9d4SSatish Balay       x2 = xb[9];
1299*9371c9d4SSatish Balay       x3 = xb[10];
1300*9371c9d4SSatish Balay       x4 = xb[11];
13016679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13026679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13036679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13046679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13056679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13066679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13076679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13086679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13096679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13106679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13116679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13126679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13136679dcc1SBarry Smith       v += 48;
13146679dcc1SBarry Smith     }
1315*9371c9d4SSatish Balay     z[0]  = sum1;
1316*9371c9d4SSatish Balay     z[1]  = sum2;
1317*9371c9d4SSatish Balay     z[2]  = sum3;
1318*9371c9d4SSatish Balay     z[3]  = sum4;
1319*9371c9d4SSatish Balay     z[4]  = sum5;
1320*9371c9d4SSatish Balay     z[5]  = sum6;
1321*9371c9d4SSatish Balay     z[6]  = sum7;
1322*9371c9d4SSatish Balay     z[7]  = sum8;
1323*9371c9d4SSatish Balay     z[8]  = sum9;
1324*9371c9d4SSatish Balay     z[9]  = sum10;
1325*9371c9d4SSatish Balay     z[10] = sum11;
1326*9371c9d4SSatish Balay     z[11] = sum12;
13276679dcc1SBarry Smith     if (!usecprow) {
13286679dcc1SBarry Smith       y += 12;
13296679dcc1SBarry Smith       z += 12;
13306679dcc1SBarry Smith     }
13316679dcc1SBarry Smith   }
13329566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
13339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
13349566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
13356679dcc1SBarry Smith   PetscFunctionReturn(0);
13366679dcc1SBarry Smith }
13376679dcc1SBarry Smith 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_12_AVX2 - Computes zz = A * xx with a block size of 12 using
  256-bit fused multiply-add intrinsics.

  The twelve results of a block row are held in three 4-wide accumulators
  (rows 0-3, 4-7 and 8-11). For each column c of a block, xx's entry is
  broadcast and multiplied against the twelve values starting at v + 12 * c.

  When compressed-row storage is in use, zz is zeroed up front and only the
  block rows listed in compressedrow.rindex are stored afterwards.
*/
PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) {
  Mat_SeqBAIJ       *baij   = (Mat_SeqBAIJ *)A->data;
  const PetscInt     bs = 12, bs2 = 144;
  const PetscBool    cprow  = baij->compressedrow.use;
  const MatScalar   *v      = baij->a, *col;
  const PetscInt    *colidx = baij->j, *rowptr, *rowmap = NULL;
  const PetscScalar *x, *xblk;
  PetscScalar       *zarray, *zrow;
  PetscInt           nrows, i, j, c, nblk;
  __m256d            acc0, acc1, acc2, bcast;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (cprow) {
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    rowmap = baij->compressedrow.rindex;
    PetscCall(PetscArrayzero(zarray, bs * baij->mbs));
  } else {
    nrows  = baij->mbs;
    rowptr = baij->i;
  }

  for (i = 0; i < nrows; i++) {
    acc0 = _mm256_setzero_pd();
    acc1 = _mm256_setzero_pd();
    acc2 = _mm256_setzero_pd();

    nblk = rowptr[i + 1] - rowptr[i];
    for (j = 0; j < nblk; j++) {
      /* colidx runs continuously over the block column indices of all rows */
      xblk = x + bs * (*colidx++);

      /* one broadcast and three fused multiply-adds per block column */
      for (c = 0; c < bs; c++) {
        col   = v + bs * c;
        bcast = _mm256_set1_pd(xblk[c]);
        acc0  = _mm256_fmadd_pd(_mm256_loadu_pd(col), bcast, acc0);
        acc1  = _mm256_fmadd_pd(_mm256_loadu_pd(col + 4), bcast, acc1);
        acc2  = _mm256_fmadd_pd(_mm256_loadu_pd(col + 8), bcast, acc2);
      }
      v += bs2;
    }

    /* destination block row: looked up through rindex, or simply the i-th one */
    zrow = cprow ? zarray + bs * rowmap[i] : zarray + bs * i;
    _mm256_storeu_pd(zrow, acc0);
    _mm256_storeu_pd(zrow + 4, acc1);
    _mm256_storeu_pd(zrow + 8, acc2);
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * baij->nz * bs2 - bs * baij->nonzerorowcnt));
  PetscFunctionReturn(0);
}
#endif
15006679dcc1SBarry Smith 
15018ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1502832cc040SShri Abhyankar /* Default MatMult for block size 15 */
1503*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) {
15048ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1505f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
15068ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
150753ef36baSBarry Smith   PetscScalar       *zarray, xv;
15088ab949d8SShri Abhyankar   const MatScalar   *v;
15098ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
15107c565772SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1511ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
15128ab949d8SShri Abhyankar 
15138ab949d8SShri Abhyankar   PetscFunctionBegin;
15149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
15159566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
15168ab949d8SShri Abhyankar 
15178ab949d8SShri Abhyankar   v = a->a;
15188ab949d8SShri Abhyankar   if (usecprow) {
15198ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15208ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15218ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15229566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
15238ab949d8SShri Abhyankar   } else {
15248ab949d8SShri Abhyankar     mbs = a->mbs;
15258ab949d8SShri Abhyankar     ii  = a->i;
15268ab949d8SShri Abhyankar     z   = zarray;
15278ab949d8SShri Abhyankar   }
15288ab949d8SShri Abhyankar 
15298ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
15308ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
15318ab949d8SShri Abhyankar     idx   = ij + ii[i];
1532*9371c9d4SSatish Balay     sum1  = 0.0;
1533*9371c9d4SSatish Balay     sum2  = 0.0;
1534*9371c9d4SSatish Balay     sum3  = 0.0;
1535*9371c9d4SSatish Balay     sum4  = 0.0;
1536*9371c9d4SSatish Balay     sum5  = 0.0;
1537*9371c9d4SSatish Balay     sum6  = 0.0;
1538*9371c9d4SSatish Balay     sum7  = 0.0;
1539*9371c9d4SSatish Balay     sum8  = 0.0;
1540*9371c9d4SSatish Balay     sum9  = 0.0;
1541*9371c9d4SSatish Balay     sum10 = 0.0;
1542*9371c9d4SSatish Balay     sum11 = 0.0;
1543*9371c9d4SSatish Balay     sum12 = 0.0;
1544*9371c9d4SSatish Balay     sum13 = 0.0;
1545*9371c9d4SSatish Balay     sum14 = 0.0;
1546*9371c9d4SSatish Balay     sum15 = 0.0;
15478ab949d8SShri Abhyankar 
15488ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
15498ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
15508ab949d8SShri Abhyankar 
15518ab949d8SShri Abhyankar       for (k = 0; k < 15; k++) {
155253ef36baSBarry Smith         xv = xb[k];
155353ef36baSBarry Smith         sum1 += v[0] * xv;
155453ef36baSBarry Smith         sum2 += v[1] * xv;
155553ef36baSBarry Smith         sum3 += v[2] * xv;
155653ef36baSBarry Smith         sum4 += v[3] * xv;
155753ef36baSBarry Smith         sum5 += v[4] * xv;
155853ef36baSBarry Smith         sum6 += v[5] * xv;
155953ef36baSBarry Smith         sum7 += v[6] * xv;
156053ef36baSBarry Smith         sum8 += v[7] * xv;
156153ef36baSBarry Smith         sum9 += v[8] * xv;
156253ef36baSBarry Smith         sum10 += v[9] * xv;
156353ef36baSBarry Smith         sum11 += v[10] * xv;
156453ef36baSBarry Smith         sum12 += v[11] * xv;
156553ef36baSBarry Smith         sum13 += v[12] * xv;
156653ef36baSBarry Smith         sum14 += v[13] * xv;
156753ef36baSBarry Smith         sum15 += v[14] * xv;
15688ab949d8SShri Abhyankar         v += 15;
15698ab949d8SShri Abhyankar       }
15708ab949d8SShri Abhyankar     }
15718ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
1572*9371c9d4SSatish Balay     z[0]  = sum1;
1573*9371c9d4SSatish Balay     z[1]  = sum2;
1574*9371c9d4SSatish Balay     z[2]  = sum3;
1575*9371c9d4SSatish Balay     z[3]  = sum4;
1576*9371c9d4SSatish Balay     z[4]  = sum5;
1577*9371c9d4SSatish Balay     z[5]  = sum6;
1578*9371c9d4SSatish Balay     z[6]  = sum7;
1579*9371c9d4SSatish Balay     z[7]  = sum8;
1580*9371c9d4SSatish Balay     z[8]  = sum9;
1581*9371c9d4SSatish Balay     z[9]  = sum10;
1582*9371c9d4SSatish Balay     z[10] = sum11;
1583*9371c9d4SSatish Balay     z[11] = sum12;
1584*9371c9d4SSatish Balay     z[12] = sum13;
1585*9371c9d4SSatish Balay     z[13] = sum14;
1586*9371c9d4SSatish Balay     z[14] = sum15;
15878ab949d8SShri Abhyankar 
15888ab949d8SShri Abhyankar     if (!usecprow) z += 15;
15898ab949d8SShri Abhyankar   }
15908ab949d8SShri Abhyankar 
15919566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
15929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
15939566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
15948ab949d8SShri Abhyankar   PetscFunctionReturn(0);
15958ab949d8SShri Abhyankar }
15968ab949d8SShri Abhyankar 
15978ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1598*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) {
15998ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1600f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
16018ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
16020b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, *zarray;
16038ab949d8SShri Abhyankar   const MatScalar   *v;
16048ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
16057c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1606ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
16078ab949d8SShri Abhyankar 
16088ab949d8SShri Abhyankar   PetscFunctionBegin;
16099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
16109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
16118ab949d8SShri Abhyankar 
16128ab949d8SShri Abhyankar   v = a->a;
16138ab949d8SShri Abhyankar   if (usecprow) {
16148ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
16158ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
16168ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
16179566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
16188ab949d8SShri Abhyankar   } else {
16198ab949d8SShri Abhyankar     mbs = a->mbs;
16208ab949d8SShri Abhyankar     ii  = a->i;
16218ab949d8SShri Abhyankar     z   = zarray;
16228ab949d8SShri Abhyankar   }
16238ab949d8SShri Abhyankar 
16248ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
16258ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
16268ab949d8SShri Abhyankar     idx   = ij + ii[i];
1627*9371c9d4SSatish Balay     sum1  = 0.0;
1628*9371c9d4SSatish Balay     sum2  = 0.0;
1629*9371c9d4SSatish Balay     sum3  = 0.0;
1630*9371c9d4SSatish Balay     sum4  = 0.0;
1631*9371c9d4SSatish Balay     sum5  = 0.0;
1632*9371c9d4SSatish Balay     sum6  = 0.0;
1633*9371c9d4SSatish Balay     sum7  = 0.0;
1634*9371c9d4SSatish Balay     sum8  = 0.0;
1635*9371c9d4SSatish Balay     sum9  = 0.0;
1636*9371c9d4SSatish Balay     sum10 = 0.0;
1637*9371c9d4SSatish Balay     sum11 = 0.0;
1638*9371c9d4SSatish Balay     sum12 = 0.0;
1639*9371c9d4SSatish Balay     sum13 = 0.0;
1640*9371c9d4SSatish Balay     sum14 = 0.0;
1641*9371c9d4SSatish Balay     sum15 = 0.0;
16428ab949d8SShri Abhyankar 
16438ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
16448ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
1645*9371c9d4SSatish Balay       x1 = xb[0];
1646*9371c9d4SSatish Balay       x2 = xb[1];
1647*9371c9d4SSatish Balay       x3 = xb[2];
1648*9371c9d4SSatish Balay       x4 = xb[3];
16498ab949d8SShri Abhyankar 
16508ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16518ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16528ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16538ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16548ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16558ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16568ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16578ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16588ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16598ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16608ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16618ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16628ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16638ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16648ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16658ab949d8SShri Abhyankar 
16668ab949d8SShri Abhyankar       v += 60;
16678ab949d8SShri Abhyankar 
1668*9371c9d4SSatish Balay       x1 = xb[4];
1669*9371c9d4SSatish Balay       x2 = xb[5];
1670*9371c9d4SSatish Balay       x3 = xb[6];
1671*9371c9d4SSatish Balay       x4 = xb[7];
16728ab949d8SShri Abhyankar 
16738ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16748ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16758ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16768ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16778ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16788ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16798ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16808ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16818ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16828ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16838ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16848ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16858ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16868ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16878ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16888ab949d8SShri Abhyankar       v += 60;
16898ab949d8SShri Abhyankar 
1690*9371c9d4SSatish Balay       x1 = xb[8];
1691*9371c9d4SSatish Balay       x2 = xb[9];
1692*9371c9d4SSatish Balay       x3 = xb[10];
1693*9371c9d4SSatish Balay       x4 = xb[11];
16940b8f6341SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16950b8f6341SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16960b8f6341SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16970b8f6341SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16980b8f6341SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16990b8f6341SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17000b8f6341SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17010b8f6341SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17020b8f6341SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17030b8f6341SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17040b8f6341SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17050b8f6341SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17060b8f6341SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17070b8f6341SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17080b8f6341SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17090b8f6341SShri Abhyankar       v += 60;
17100b8f6341SShri Abhyankar 
1711*9371c9d4SSatish Balay       x1 = xb[12];
1712*9371c9d4SSatish Balay       x2 = xb[13];
1713*9371c9d4SSatish Balay       x3 = xb[14];
17148ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
17158ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
17168ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
17178ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
17188ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
17198ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
17208ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
17218ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
17228ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
17238ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
17248ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
17258ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
17268ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
17278ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
17288ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
17298ab949d8SShri Abhyankar       v += 45;
17308ab949d8SShri Abhyankar     }
17318ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
1732*9371c9d4SSatish Balay     z[0]  = sum1;
1733*9371c9d4SSatish Balay     z[1]  = sum2;
1734*9371c9d4SSatish Balay     z[2]  = sum3;
1735*9371c9d4SSatish Balay     z[3]  = sum4;
1736*9371c9d4SSatish Balay     z[4]  = sum5;
1737*9371c9d4SSatish Balay     z[5]  = sum6;
1738*9371c9d4SSatish Balay     z[6]  = sum7;
1739*9371c9d4SSatish Balay     z[7]  = sum8;
1740*9371c9d4SSatish Balay     z[8]  = sum9;
1741*9371c9d4SSatish Balay     z[9]  = sum10;
1742*9371c9d4SSatish Balay     z[10] = sum11;
1743*9371c9d4SSatish Balay     z[11] = sum12;
1744*9371c9d4SSatish Balay     z[12] = sum13;
1745*9371c9d4SSatish Balay     z[13] = sum14;
1746*9371c9d4SSatish Balay     z[14] = sum15;
17478ab949d8SShri Abhyankar 
17488ab949d8SShri Abhyankar     if (!usecprow) z += 15;
17498ab949d8SShri Abhyankar   }
17508ab949d8SShri Abhyankar 
17519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
17529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
17539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
17548ab949d8SShri Abhyankar   PetscFunctionReturn(0);
17558ab949d8SShri Abhyankar }
17568ab949d8SShri Abhyankar 
17578ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1758*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) {
17598ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1760f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
17618ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
17620b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
17638ab949d8SShri Abhyankar   const MatScalar   *v;
17648ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
17657c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1766ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
17678ab949d8SShri Abhyankar 
17688ab949d8SShri Abhyankar   PetscFunctionBegin;
17699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
17709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
17718ab949d8SShri Abhyankar 
17728ab949d8SShri Abhyankar   v = a->a;
17738ab949d8SShri Abhyankar   if (usecprow) {
17748ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
17758ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
17768ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
17779566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
17788ab949d8SShri Abhyankar   } else {
17798ab949d8SShri Abhyankar     mbs = a->mbs;
17808ab949d8SShri Abhyankar     ii  = a->i;
17818ab949d8SShri Abhyankar     z   = zarray;
17828ab949d8SShri Abhyankar   }
17838ab949d8SShri Abhyankar 
17848ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
17858ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
17868ab949d8SShri Abhyankar     idx   = ij + ii[i];
1787*9371c9d4SSatish Balay     sum1  = 0.0;
1788*9371c9d4SSatish Balay     sum2  = 0.0;
1789*9371c9d4SSatish Balay     sum3  = 0.0;
1790*9371c9d4SSatish Balay     sum4  = 0.0;
1791*9371c9d4SSatish Balay     sum5  = 0.0;
1792*9371c9d4SSatish Balay     sum6  = 0.0;
1793*9371c9d4SSatish Balay     sum7  = 0.0;
1794*9371c9d4SSatish Balay     sum8  = 0.0;
1795*9371c9d4SSatish Balay     sum9  = 0.0;
1796*9371c9d4SSatish Balay     sum10 = 0.0;
1797*9371c9d4SSatish Balay     sum11 = 0.0;
1798*9371c9d4SSatish Balay     sum12 = 0.0;
1799*9371c9d4SSatish Balay     sum13 = 0.0;
1800*9371c9d4SSatish Balay     sum14 = 0.0;
1801*9371c9d4SSatish Balay     sum15 = 0.0;
18028ab949d8SShri Abhyankar 
18038ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
18048ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
1805*9371c9d4SSatish Balay       x1 = xb[0];
1806*9371c9d4SSatish Balay       x2 = xb[1];
1807*9371c9d4SSatish Balay       x3 = xb[2];
1808*9371c9d4SSatish Balay       x4 = xb[3];
1809*9371c9d4SSatish Balay       x5 = xb[4];
1810*9371c9d4SSatish Balay       x6 = xb[5];
1811*9371c9d4SSatish Balay       x7 = xb[6];
18120b8f6341SShri Abhyankar       x8 = xb[7];
18138ab949d8SShri Abhyankar 
18148ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
18158ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
18168ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
18178ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
18188ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
18198ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
18208ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
18218ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
18228ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
18238ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
18248ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
18258ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
18268ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
18278ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
18288ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
18298ab949d8SShri Abhyankar       v += 120;
18308ab949d8SShri Abhyankar 
1831*9371c9d4SSatish Balay       x1 = xb[8];
1832*9371c9d4SSatish Balay       x2 = xb[9];
1833*9371c9d4SSatish Balay       x3 = xb[10];
1834*9371c9d4SSatish Balay       x4 = xb[11];
1835*9371c9d4SSatish Balay       x5 = xb[12];
1836*9371c9d4SSatish Balay       x6 = xb[13];
1837*9371c9d4SSatish Balay       x7 = xb[14];
18380b8f6341SShri Abhyankar 
18398ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
18408ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
18418ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
18428ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
18438ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
18448ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
18458ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
18468ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
18478ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
18488ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
18498ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
18508ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
18518ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
18528ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
18538ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
18548ab949d8SShri Abhyankar       v += 105;
18558ab949d8SShri Abhyankar     }
18568ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
1857*9371c9d4SSatish Balay     z[0]  = sum1;
1858*9371c9d4SSatish Balay     z[1]  = sum2;
1859*9371c9d4SSatish Balay     z[2]  = sum3;
1860*9371c9d4SSatish Balay     z[3]  = sum4;
1861*9371c9d4SSatish Balay     z[4]  = sum5;
1862*9371c9d4SSatish Balay     z[5]  = sum6;
1863*9371c9d4SSatish Balay     z[6]  = sum7;
1864*9371c9d4SSatish Balay     z[7]  = sum8;
1865*9371c9d4SSatish Balay     z[8]  = sum9;
1866*9371c9d4SSatish Balay     z[9]  = sum10;
1867*9371c9d4SSatish Balay     z[10] = sum11;
1868*9371c9d4SSatish Balay     z[11] = sum12;
1869*9371c9d4SSatish Balay     z[12] = sum13;
1870*9371c9d4SSatish Balay     z[13] = sum14;
1871*9371c9d4SSatish Balay     z[14] = sum15;
18728ab949d8SShri Abhyankar 
18738ab949d8SShri Abhyankar     if (!usecprow) z += 15;
18748ab949d8SShri Abhyankar   }
18758ab949d8SShri Abhyankar 
18769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
18779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
18789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
18798ab949d8SShri Abhyankar   PetscFunctionReturn(0);
18808ab949d8SShri Abhyankar }
18818ab949d8SShri Abhyankar 
18828ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1883*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
18848ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1885f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
18868ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
18878ab949d8SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
18888ab949d8SShri Abhyankar   const MatScalar   *v;
18898ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
18907c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1891ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
18928ab949d8SShri Abhyankar 
18938ab949d8SShri Abhyankar   PetscFunctionBegin;
18949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
18959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
18968ab949d8SShri Abhyankar 
18978ab949d8SShri Abhyankar   v = a->a;
18988ab949d8SShri Abhyankar   if (usecprow) {
18998ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
19008ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
19018ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
19029566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
19038ab949d8SShri Abhyankar   } else {
19048ab949d8SShri Abhyankar     mbs = a->mbs;
19058ab949d8SShri Abhyankar     ii  = a->i;
19068ab949d8SShri Abhyankar     z   = zarray;
19078ab949d8SShri Abhyankar   }
19088ab949d8SShri Abhyankar 
19098ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
19108ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
19118ab949d8SShri Abhyankar     idx   = ij + ii[i];
1912*9371c9d4SSatish Balay     sum1  = 0.0;
1913*9371c9d4SSatish Balay     sum2  = 0.0;
1914*9371c9d4SSatish Balay     sum3  = 0.0;
1915*9371c9d4SSatish Balay     sum4  = 0.0;
1916*9371c9d4SSatish Balay     sum5  = 0.0;
1917*9371c9d4SSatish Balay     sum6  = 0.0;
1918*9371c9d4SSatish Balay     sum7  = 0.0;
1919*9371c9d4SSatish Balay     sum8  = 0.0;
1920*9371c9d4SSatish Balay     sum9  = 0.0;
1921*9371c9d4SSatish Balay     sum10 = 0.0;
1922*9371c9d4SSatish Balay     sum11 = 0.0;
1923*9371c9d4SSatish Balay     sum12 = 0.0;
1924*9371c9d4SSatish Balay     sum13 = 0.0;
1925*9371c9d4SSatish Balay     sum14 = 0.0;
1926*9371c9d4SSatish Balay     sum15 = 0.0;
19278ab949d8SShri Abhyankar 
19288ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
19298ab949d8SShri Abhyankar       xb  = x + 15 * (idx[j]);
1930*9371c9d4SSatish Balay       x1  = xb[0];
1931*9371c9d4SSatish Balay       x2  = xb[1];
1932*9371c9d4SSatish Balay       x3  = xb[2];
1933*9371c9d4SSatish Balay       x4  = xb[3];
1934*9371c9d4SSatish Balay       x5  = xb[4];
1935*9371c9d4SSatish Balay       x6  = xb[5];
1936*9371c9d4SSatish Balay       x7  = xb[6];
1937*9371c9d4SSatish Balay       x8  = xb[7];
1938*9371c9d4SSatish Balay       x9  = xb[8];
1939*9371c9d4SSatish Balay       x10 = xb[9];
1940*9371c9d4SSatish Balay       x11 = xb[10];
1941*9371c9d4SSatish Balay       x12 = xb[11];
1942*9371c9d4SSatish Balay       x13 = xb[12];
1943*9371c9d4SSatish Balay       x14 = xb[13];
1944*9371c9d4SSatish Balay       x15 = xb[14];
19458ab949d8SShri Abhyankar 
19468ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
19478ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
19488ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
19498ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
19508ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
19518ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
19528ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
19538ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
19548ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
19558ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
19568ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
19578ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
19588ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
19598ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
19608ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
19618ab949d8SShri Abhyankar       v += 225;
19628ab949d8SShri Abhyankar     }
19638ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
1964*9371c9d4SSatish Balay     z[0]  = sum1;
1965*9371c9d4SSatish Balay     z[1]  = sum2;
1966*9371c9d4SSatish Balay     z[2]  = sum3;
1967*9371c9d4SSatish Balay     z[3]  = sum4;
1968*9371c9d4SSatish Balay     z[4]  = sum5;
1969*9371c9d4SSatish Balay     z[5]  = sum6;
1970*9371c9d4SSatish Balay     z[6]  = sum7;
1971*9371c9d4SSatish Balay     z[7]  = sum8;
1972*9371c9d4SSatish Balay     z[8]  = sum9;
1973*9371c9d4SSatish Balay     z[9]  = sum10;
1974*9371c9d4SSatish Balay     z[10] = sum11;
1975*9371c9d4SSatish Balay     z[11] = sum12;
1976*9371c9d4SSatish Balay     z[12] = sum13;
1977*9371c9d4SSatish Balay     z[13] = sum14;
1978*9371c9d4SSatish Balay     z[14] = sum15;
19798ab949d8SShri Abhyankar 
19808ab949d8SShri Abhyankar     if (!usecprow) z += 15;
19818ab949d8SShri Abhyankar   }
19828ab949d8SShri Abhyankar 
19839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
19849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
19859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
19868ab949d8SShri Abhyankar   PetscFunctionReturn(0);
19878ab949d8SShri Abhyankar }
19888ab949d8SShri Abhyankar 
19893f1db9ecSBarry Smith /*
19903f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
19913f1db9ecSBarry Smith */
1992*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
19932d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1994f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
1995d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1996d9ca1df4SBarry Smith   const MatScalar   *v;
1997d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1998d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
1999d9ca1df4SBarry Smith   PetscInt           ncols, k;
2000ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20012d61bbb3SSatish Balay 
20022d61bbb3SSatish Balay   PetscFunctionBegin;
20039566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
20052d61bbb3SSatish Balay 
20062d61bbb3SSatish Balay   idx = a->j;
20072d61bbb3SSatish Balay   v   = a->a;
200826e093fcSHong Zhang   if (usecprow) {
200926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
201026e093fcSHong Zhang     ii   = a->compressedrow.i;
20117b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
20129566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
201326e093fcSHong Zhang   } else {
201426e093fcSHong Zhang     mbs = a->mbs;
20152d61bbb3SSatish Balay     ii  = a->i;
201626e093fcSHong Zhang     z   = zarray;
201726e093fcSHong Zhang   }
2018218c64b6SSatish Balay 
20192d61bbb3SSatish Balay   if (!a->mult_work) {
2020d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
20219566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
20222d61bbb3SSatish Balay   }
20232d61bbb3SSatish Balay   work = a->mult_work;
20242d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2025*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2026*9371c9d4SSatish Balay     ii++;
20272d61bbb3SSatish Balay     ncols = n * bs;
20282d61bbb3SSatish Balay     workt = work;
20292d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
20302d61bbb3SSatish Balay       xb = x + bs * (*idx++);
20312d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
20322d61bbb3SSatish Balay       workt += bs;
20332d61bbb3SSatish Balay     }
20347b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
203596b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
20362d61bbb3SSatish Balay     v += n * bs2;
203726e093fcSHong Zhang     if (!usecprow) z += bs;
20382d61bbb3SSatish Balay   }
20399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20419566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
20422d61bbb3SSatish Balay   PetscFunctionReturn(0);
20432d61bbb3SSatish Balay }
20442d61bbb3SSatish Balay 
2045*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
20462d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2047122f12eaSBarry Smith   const PetscScalar *x;
2048122f12eaSBarry Smith   PetscScalar       *y, *z, sum;
2049122f12eaSBarry Smith   const MatScalar   *v;
20507c565772SBarry Smith   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2051122f12eaSBarry Smith   const PetscInt    *idx, *ii;
2052ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20532d61bbb3SSatish Balay 
20542d61bbb3SSatish Balay   PetscFunctionBegin;
20559566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
20572d61bbb3SSatish Balay 
20582d61bbb3SSatish Balay   idx = a->j;
20592d61bbb3SSatish Balay   v   = a->a;
206026e093fcSHong Zhang   if (usecprow) {
2061*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(z, y, mbs)); }
206226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
206326e093fcSHong Zhang     ii   = a->compressedrow.i;
20647b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
206526e093fcSHong Zhang   } else {
20662d61bbb3SSatish Balay     ii = a->i;
206726e093fcSHong Zhang   }
20682d61bbb3SSatish Balay 
20692d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2070122f12eaSBarry Smith     n = ii[1] - ii[0];
2071122f12eaSBarry Smith     ii++;
207226e093fcSHong Zhang     if (!usecprow) {
2073122f12eaSBarry Smith       sum = y[i];
2074122f12eaSBarry Smith     } else {
2075122f12eaSBarry Smith       sum = y[ridx[i]];
2076122f12eaSBarry Smith     }
2077444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2078444d8c10SJed Brown     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2079122f12eaSBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
2080122f12eaSBarry Smith     v += n;
2081122f12eaSBarry Smith     idx += n;
2082122f12eaSBarry Smith     if (usecprow) {
2083122f12eaSBarry Smith       z[ridx[i]] = sum;
2084122f12eaSBarry Smith     } else {
2085122f12eaSBarry Smith       z[i] = sum;
208626e093fcSHong Zhang     }
20872d61bbb3SSatish Balay   }
20889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
20909566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
20912d61bbb3SSatish Balay   PetscFunctionReturn(0);
20922d61bbb3SSatish Balay }
20932d61bbb3SSatish Balay 
2094*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
20952d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2096f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2097d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
209826e093fcSHong Zhang   PetscScalar        x1, x2, *yarray, *zarray;
2099d9ca1df4SBarry Smith   const MatScalar   *v;
2100d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, j;
2101d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2102ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21032d61bbb3SSatish Balay 
21042d61bbb3SSatish Balay   PetscFunctionBegin;
21059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21069566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21072d61bbb3SSatish Balay 
21082d61bbb3SSatish Balay   idx = a->j;
21092d61bbb3SSatish Balay   v   = a->a;
211026e093fcSHong Zhang   if (usecprow) {
2111*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); }
211226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
211326e093fcSHong Zhang     ii   = a->compressedrow.i;
21147b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
211526e093fcSHong Zhang   } else {
21162d61bbb3SSatish Balay     ii = a->i;
211726e093fcSHong Zhang     y  = yarray;
211826e093fcSHong Zhang     z  = zarray;
211926e093fcSHong Zhang   }
21202d61bbb3SSatish Balay 
21212d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2122*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2123*9371c9d4SSatish Balay     ii++;
212426e093fcSHong Zhang     if (usecprow) {
21257b2bb3b9SHong Zhang       z = zarray + 2 * ridx[i];
21267b2bb3b9SHong Zhang       y = yarray + 2 * ridx[i];
212726e093fcSHong Zhang     }
2128*9371c9d4SSatish Balay     sum1 = y[0];
2129*9371c9d4SSatish Balay     sum2 = y[1];
2130444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2131444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21322d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
213326fbe8dcSKarl Rupp       xb = x + 2 * (*idx++);
213426fbe8dcSKarl Rupp       x1 = xb[0];
213526fbe8dcSKarl Rupp       x2 = xb[1];
213626fbe8dcSKarl Rupp 
21372d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
21382d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
21392d61bbb3SSatish Balay       v += 4;
21402d61bbb3SSatish Balay     }
2141*9371c9d4SSatish Balay     z[0] = sum1;
2142*9371c9d4SSatish Balay     z[1] = sum2;
214326e093fcSHong Zhang     if (!usecprow) {
2144*9371c9d4SSatish Balay       z += 2;
2145*9371c9d4SSatish Balay       y += 2;
21462d61bbb3SSatish Balay     }
214726e093fcSHong Zhang   }
21489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
21509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * a->nz));
21512d61bbb3SSatish Balay   PetscFunctionReturn(0);
21522d61bbb3SSatish Balay }
21532d61bbb3SSatish Balay 
2154*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
21552d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2156f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2157d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2158d9ca1df4SBarry Smith   const MatScalar   *v;
2159d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2160d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2161ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21622d61bbb3SSatish Balay 
21632d61bbb3SSatish Balay   PetscFunctionBegin;
21649566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21662d61bbb3SSatish Balay 
21672d61bbb3SSatish Balay   idx = a->j;
21682d61bbb3SSatish Balay   v   = a->a;
216926e093fcSHong Zhang   if (usecprow) {
2170*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); }
217126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
217226e093fcSHong Zhang     ii   = a->compressedrow.i;
21737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
217426e093fcSHong Zhang   } else {
21752d61bbb3SSatish Balay     ii = a->i;
217626e093fcSHong Zhang     y  = yarray;
217726e093fcSHong Zhang     z  = zarray;
217826e093fcSHong Zhang   }
21792d61bbb3SSatish Balay 
21802d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2181*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2182*9371c9d4SSatish Balay     ii++;
218326e093fcSHong Zhang     if (usecprow) {
21847b2bb3b9SHong Zhang       z = zarray + 3 * ridx[i];
21857b2bb3b9SHong Zhang       y = yarray + 3 * ridx[i];
218626e093fcSHong Zhang     }
2187*9371c9d4SSatish Balay     sum1 = y[0];
2188*9371c9d4SSatish Balay     sum2 = y[1];
2189*9371c9d4SSatish Balay     sum3 = y[2];
2190444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2191444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21922d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
2193*9371c9d4SSatish Balay       xb = x + 3 * (*idx++);
2194*9371c9d4SSatish Balay       x1 = xb[0];
2195*9371c9d4SSatish Balay       x2 = xb[1];
2196*9371c9d4SSatish Balay       x3 = xb[2];
21972d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
21982d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
21992d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
22002d61bbb3SSatish Balay       v += 9;
22012d61bbb3SSatish Balay     }
2202*9371c9d4SSatish Balay     z[0] = sum1;
2203*9371c9d4SSatish Balay     z[1] = sum2;
2204*9371c9d4SSatish Balay     z[2] = sum3;
220526e093fcSHong Zhang     if (!usecprow) {
2206*9371c9d4SSatish Balay       z += 3;
2207*9371c9d4SSatish Balay       y += 3;
22082d61bbb3SSatish Balay     }
220926e093fcSHong Zhang   }
22109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz));
22132d61bbb3SSatish Balay   PetscFunctionReturn(0);
22142d61bbb3SSatish Balay }
22152d61bbb3SSatish Balay 
2216*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
22172d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2218f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2219d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2220d9ca1df4SBarry Smith   const MatScalar   *v;
2221d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2222d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2223ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22242d61bbb3SSatish Balay 
22252d61bbb3SSatish Balay   PetscFunctionBegin;
22269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22282d61bbb3SSatish Balay 
22292d61bbb3SSatish Balay   idx = a->j;
22302d61bbb3SSatish Balay   v   = a->a;
223126e093fcSHong Zhang   if (usecprow) {
2232*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); }
223326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
223426e093fcSHong Zhang     ii   = a->compressedrow.i;
22357b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
223626e093fcSHong Zhang   } else {
22372d61bbb3SSatish Balay     ii = a->i;
223826e093fcSHong Zhang     y  = yarray;
223926e093fcSHong Zhang     z  = zarray;
224026e093fcSHong Zhang   }
22412d61bbb3SSatish Balay 
22422d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2243*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2244*9371c9d4SSatish Balay     ii++;
224526e093fcSHong Zhang     if (usecprow) {
22467b2bb3b9SHong Zhang       z = zarray + 4 * ridx[i];
22477b2bb3b9SHong Zhang       y = yarray + 4 * ridx[i];
224826e093fcSHong Zhang     }
2249*9371c9d4SSatish Balay     sum1 = y[0];
2250*9371c9d4SSatish Balay     sum2 = y[1];
2251*9371c9d4SSatish Balay     sum3 = y[2];
2252*9371c9d4SSatish Balay     sum4 = y[3];
2253444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2254444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22552d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22562d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
2257*9371c9d4SSatish Balay       x1 = xb[0];
2258*9371c9d4SSatish Balay       x2 = xb[1];
2259*9371c9d4SSatish Balay       x3 = xb[2];
2260*9371c9d4SSatish Balay       x4 = xb[3];
22612d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
22622d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
22632d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
22642d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
22652d61bbb3SSatish Balay       v += 16;
22662d61bbb3SSatish Balay     }
2267*9371c9d4SSatish Balay     z[0] = sum1;
2268*9371c9d4SSatish Balay     z[1] = sum2;
2269*9371c9d4SSatish Balay     z[2] = sum3;
2270*9371c9d4SSatish Balay     z[3] = sum4;
227126e093fcSHong Zhang     if (!usecprow) {
2272*9371c9d4SSatish Balay       z += 4;
2273*9371c9d4SSatish Balay       y += 4;
22742d61bbb3SSatish Balay     }
227526e093fcSHong Zhang   }
22769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz));
22792d61bbb3SSatish Balay   PetscFunctionReturn(0);
22802d61bbb3SSatish Balay }
22812d61bbb3SSatish Balay 
2282*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
22832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2284f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2285d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
228626e093fcSHong Zhang   PetscScalar       *yarray, *zarray;
2287d9ca1df4SBarry Smith   const MatScalar   *v;
2288d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2289d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2290ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22912d61bbb3SSatish Balay 
22922d61bbb3SSatish Balay   PetscFunctionBegin;
22939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22952d61bbb3SSatish Balay 
22962d61bbb3SSatish Balay   idx = a->j;
22972d61bbb3SSatish Balay   v   = a->a;
229826e093fcSHong Zhang   if (usecprow) {
2299*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); }
230026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
230126e093fcSHong Zhang     ii   = a->compressedrow.i;
23027b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
230326e093fcSHong Zhang   } else {
23042d61bbb3SSatish Balay     ii = a->i;
230526e093fcSHong Zhang     y  = yarray;
230626e093fcSHong Zhang     z  = zarray;
230726e093fcSHong Zhang   }
23082d61bbb3SSatish Balay 
23092d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2310*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2311*9371c9d4SSatish Balay     ii++;
231226e093fcSHong Zhang     if (usecprow) {
23137b2bb3b9SHong Zhang       z = zarray + 5 * ridx[i];
23147b2bb3b9SHong Zhang       y = yarray + 5 * ridx[i];
231526e093fcSHong Zhang     }
2316*9371c9d4SSatish Balay     sum1 = y[0];
2317*9371c9d4SSatish Balay     sum2 = y[1];
2318*9371c9d4SSatish Balay     sum3 = y[2];
2319*9371c9d4SSatish Balay     sum4 = y[3];
2320*9371c9d4SSatish Balay     sum5 = y[4];
2321444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2322444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
23232d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
23242d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
2325*9371c9d4SSatish Balay       x1 = xb[0];
2326*9371c9d4SSatish Balay       x2 = xb[1];
2327*9371c9d4SSatish Balay       x3 = xb[2];
2328*9371c9d4SSatish Balay       x4 = xb[3];
2329*9371c9d4SSatish Balay       x5 = xb[4];
23302d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
23312d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
23322d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
23332d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
23342d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
23352d61bbb3SSatish Balay       v += 25;
23362d61bbb3SSatish Balay     }
2337*9371c9d4SSatish Balay     z[0] = sum1;
2338*9371c9d4SSatish Balay     z[1] = sum2;
2339*9371c9d4SSatish Balay     z[2] = sum3;
2340*9371c9d4SSatish Balay     z[3] = sum4;
2341*9371c9d4SSatish Balay     z[4] = sum5;
234226e093fcSHong Zhang     if (!usecprow) {
2343*9371c9d4SSatish Balay       z += 5;
2344*9371c9d4SSatish Balay       y += 5;
23452d61bbb3SSatish Balay     }
234626e093fcSHong Zhang   }
23479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz));
23502d61bbb3SSatish Balay   PetscFunctionReturn(0);
23512d61bbb3SSatish Balay }
2352c2916339SPierre Jolivet 
2353*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
235415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2355f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2356d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
235726e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2358d9ca1df4SBarry Smith   const MatScalar   *v;
2359d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2360d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2361ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
236215091d37SBarry Smith 
236315091d37SBarry Smith   PetscFunctionBegin;
23649566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
236615091d37SBarry Smith 
236715091d37SBarry Smith   idx = a->j;
236815091d37SBarry Smith   v   = a->a;
236926e093fcSHong Zhang   if (usecprow) {
2370*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); }
237126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
237226e093fcSHong Zhang     ii   = a->compressedrow.i;
23737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
237426e093fcSHong Zhang   } else {
237515091d37SBarry Smith     ii = a->i;
237626e093fcSHong Zhang     y  = yarray;
237726e093fcSHong Zhang     z  = zarray;
237826e093fcSHong Zhang   }
237915091d37SBarry Smith 
238015091d37SBarry Smith   for (i = 0; i < mbs; i++) {
2381*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2382*9371c9d4SSatish Balay     ii++;
238326e093fcSHong Zhang     if (usecprow) {
23847b2bb3b9SHong Zhang       z = zarray + 6 * ridx[i];
23857b2bb3b9SHong Zhang       y = yarray + 6 * ridx[i];
238626e093fcSHong Zhang     }
2387*9371c9d4SSatish Balay     sum1 = y[0];
2388*9371c9d4SSatish Balay     sum2 = y[1];
2389*9371c9d4SSatish Balay     sum3 = y[2];
2390*9371c9d4SSatish Balay     sum4 = y[3];
2391*9371c9d4SSatish Balay     sum5 = y[4];
2392*9371c9d4SSatish Balay     sum6 = y[5];
2393444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2394444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
239515091d37SBarry Smith     for (j = 0; j < n; j++) {
23963b95cb0eSSatish Balay       xb = x + 6 * (*idx++);
2397*9371c9d4SSatish Balay       x1 = xb[0];
2398*9371c9d4SSatish Balay       x2 = xb[1];
2399*9371c9d4SSatish Balay       x3 = xb[2];
2400*9371c9d4SSatish Balay       x4 = xb[3];
2401*9371c9d4SSatish Balay       x5 = xb[4];
2402*9371c9d4SSatish Balay       x6 = xb[5];
240315091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
240415091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
240515091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
240615091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
240715091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
240815091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
240915091d37SBarry Smith       v += 36;
241015091d37SBarry Smith     }
2411*9371c9d4SSatish Balay     z[0] = sum1;
2412*9371c9d4SSatish Balay     z[1] = sum2;
2413*9371c9d4SSatish Balay     z[2] = sum3;
2414*9371c9d4SSatish Balay     z[3] = sum4;
2415*9371c9d4SSatish Balay     z[4] = sum5;
2416*9371c9d4SSatish Balay     z[5] = sum6;
241726e093fcSHong Zhang     if (!usecprow) {
2418*9371c9d4SSatish Balay       z += 6;
2419*9371c9d4SSatish Balay       y += 6;
242015091d37SBarry Smith     }
242126e093fcSHong Zhang   }
24229566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
24239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
24249566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz));
242515091d37SBarry Smith   PetscFunctionReturn(0);
242615091d37SBarry Smith }
24272d61bbb3SSatish Balay 
2428*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
24292d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2430f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2431d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
243226e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2433d9ca1df4SBarry Smith   const MatScalar   *v;
2434d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2435d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2436ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
24372d61bbb3SSatish Balay 
24382d61bbb3SSatish Balay   PetscFunctionBegin;
24399566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
24409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
24412d61bbb3SSatish Balay 
24422d61bbb3SSatish Balay   idx = a->j;
24432d61bbb3SSatish Balay   v   = a->a;
244426e093fcSHong Zhang   if (usecprow) {
2445*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); }
244626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
244726e093fcSHong Zhang     ii   = a->compressedrow.i;
24487b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
244926e093fcSHong Zhang   } else {
24502d61bbb3SSatish Balay     ii = a->i;
245126e093fcSHong Zhang     y  = yarray;
245226e093fcSHong Zhang     z  = zarray;
245326e093fcSHong Zhang   }
24542d61bbb3SSatish Balay 
24552d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2456*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2457*9371c9d4SSatish Balay     ii++;
245826e093fcSHong Zhang     if (usecprow) {
24597b2bb3b9SHong Zhang       z = zarray + 7 * ridx[i];
24607b2bb3b9SHong Zhang       y = yarray + 7 * ridx[i];
246126e093fcSHong Zhang     }
2462*9371c9d4SSatish Balay     sum1 = y[0];
2463*9371c9d4SSatish Balay     sum2 = y[1];
2464*9371c9d4SSatish Balay     sum3 = y[2];
2465*9371c9d4SSatish Balay     sum4 = y[3];
2466*9371c9d4SSatish Balay     sum5 = y[4];
2467*9371c9d4SSatish Balay     sum6 = y[5];
2468*9371c9d4SSatish Balay     sum7 = y[6];
2469444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2470444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
24712d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
24722d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
2473*9371c9d4SSatish Balay       x1 = xb[0];
2474*9371c9d4SSatish Balay       x2 = xb[1];
2475*9371c9d4SSatish Balay       x3 = xb[2];
2476*9371c9d4SSatish Balay       x4 = xb[3];
2477*9371c9d4SSatish Balay       x5 = xb[4];
2478*9371c9d4SSatish Balay       x6 = xb[5];
2479*9371c9d4SSatish Balay       x7 = xb[6];
24802d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
24812d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
24822d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
24832d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
24842d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
24852d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
24862d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
24872d61bbb3SSatish Balay       v += 49;
24882d61bbb3SSatish Balay     }
2489*9371c9d4SSatish Balay     z[0] = sum1;
2490*9371c9d4SSatish Balay     z[1] = sum2;
2491*9371c9d4SSatish Balay     z[2] = sum3;
2492*9371c9d4SSatish Balay     z[3] = sum4;
2493*9371c9d4SSatish Balay     z[4] = sum5;
2494*9371c9d4SSatish Balay     z[5] = sum6;
2495*9371c9d4SSatish Balay     z[6] = sum7;
249626e093fcSHong Zhang     if (!usecprow) {
2497*9371c9d4SSatish Balay       z += 7;
2498*9371c9d4SSatish Balay       y += 7;
24992d61bbb3SSatish Balay     }
250026e093fcSHong Zhang   }
25019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
25029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
25039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz));
25042d61bbb3SSatish Balay   PetscFunctionReturn(0);
25052d61bbb3SSatish Balay }
2506218c64b6SSatish Balay 
25075f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2508*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) {
250996e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2510f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
251196e086a2SDaniel Kokron   const PetscScalar *x, *xb;
251296e086a2SDaniel Kokron   const MatScalar   *v;
25136679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
2514ce68d72fSJed Brown   PetscInt           k;
251596e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
25166679dcc1SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
251796e086a2SDaniel Kokron 
251896e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
2519ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
252096e086a2SDaniel Kokron   __m256d z0, z1, z2;
252196e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
252296e086a2SDaniel Kokron 
252396e086a2SDaniel Kokron   PetscFunctionBegin;
25249566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
25259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
25269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
252796e086a2SDaniel Kokron 
252896e086a2SDaniel Kokron   idx = a->j;
252996e086a2SDaniel Kokron   v   = a->a;
253096e086a2SDaniel Kokron   if (usecprow) {
253196e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
253296e086a2SDaniel Kokron     ii   = a->compressedrow.i;
253396e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
253496e086a2SDaniel Kokron   } else {
253596e086a2SDaniel Kokron     mbs = a->mbs;
253696e086a2SDaniel Kokron     ii  = a->i;
253796e086a2SDaniel Kokron     z   = zarray;
253896e086a2SDaniel Kokron   }
253996e086a2SDaniel Kokron 
254096e086a2SDaniel Kokron   if (!a->mult_work) {
254196e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
25429566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
254396e086a2SDaniel Kokron   }
254496e086a2SDaniel Kokron 
254596e086a2SDaniel Kokron   work = a->mult_work;
254696e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
2547*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2548*9371c9d4SSatish Balay     ii++;
254996e086a2SDaniel Kokron     workt = work;
255096e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
255196e086a2SDaniel Kokron       xb = x + bs * (*idx++);
255296e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
255396e086a2SDaniel Kokron       workt += bs;
255496e086a2SDaniel Kokron     }
255596e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
255696e086a2SDaniel Kokron 
2557*9371c9d4SSatish Balay     z0 = _mm256_loadu_pd(&z[0]);
2558*9371c9d4SSatish Balay     z1 = _mm256_loadu_pd(&z[4]);
2559*9371c9d4SSatish Balay     z2 = _mm256_set1_pd(z[8]);
256096e086a2SDaniel Kokron 
256196e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
2562c05b70c4SSatish Balay       /* first column of a */
256396e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
2564*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
2565*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
2566*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2567*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
2568*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2569*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
257096e086a2SDaniel Kokron 
2571c05b70c4SSatish Balay       /* second column of a */
257296e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2573*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2574*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
2575*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2576*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
2577*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2578*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
257996e086a2SDaniel Kokron 
2580c05b70c4SSatish Balay       /* third column of a */
258196e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2582*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2583*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
2584*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2585*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
2586*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2587*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
258896e086a2SDaniel Kokron 
2589c05b70c4SSatish Balay       /* fourth column of a */
259096e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2591*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2592*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
2593*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2594*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
2595*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2596*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
259796e086a2SDaniel Kokron 
2598c05b70c4SSatish Balay       /* fifth column of a */
259996e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2600*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2601*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
2602*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2603*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
2604*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2605*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
260696e086a2SDaniel Kokron 
2607c05b70c4SSatish Balay       /* sixth column of a */
260896e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2609*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2610*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
2611*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2612*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
2613*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2614*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
261596e086a2SDaniel Kokron 
2616c05b70c4SSatish Balay       /* seventh column of a */
261796e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2618*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2619*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
2620*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2621*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
2622*9371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2623*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
262496e086a2SDaniel Kokron 
26256aad120cSJose E. Roman       /* eighth column of a */
262696e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2627*9371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2628*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
2629*9371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2630*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
2631*9371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2632*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
263396e086a2SDaniel Kokron 
2634c05b70c4SSatish Balay       /* ninth column of a */
263596e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2636*9371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2637*9371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
2638*9371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2639*9371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
2640*9371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2641*9371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
264296e086a2SDaniel Kokron     }
264396e086a2SDaniel Kokron 
2644*9371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
2645*9371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
2646*9371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
264796e086a2SDaniel Kokron 
264896e086a2SDaniel Kokron     v += n * bs2;
264996e086a2SDaniel Kokron     if (!usecprow) z += bs;
265096e086a2SDaniel Kokron   }
26519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
26529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
26539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(162.0 * a->nz));
265496e086a2SDaniel Kokron   PetscFunctionReturn(0);
265596e086a2SDaniel Kokron }
265696e086a2SDaniel Kokron #endif
265796e086a2SDaniel Kokron 
2658*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
2659ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2660f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2661ebada01fSBarry Smith   const PetscScalar *x, *xb;
2662ebada01fSBarry Smith   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2663ebada01fSBarry Smith   const MatScalar   *v;
2664ebada01fSBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2665ebada01fSBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2666ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2667ebada01fSBarry Smith 
2668ebada01fSBarry Smith   PetscFunctionBegin;
26699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
26709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2671ebada01fSBarry Smith 
2672ebada01fSBarry Smith   idx = a->j;
2673ebada01fSBarry Smith   v   = a->a;
2674ebada01fSBarry Smith   if (usecprow) {
2675*9371c9d4SSatish Balay     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); }
2676ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
2677ebada01fSBarry Smith     ii   = a->compressedrow.i;
2678ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
2679ebada01fSBarry Smith   } else {
2680ebada01fSBarry Smith     ii = a->i;
2681ebada01fSBarry Smith     y  = yarray;
2682ebada01fSBarry Smith     z  = zarray;
2683ebada01fSBarry Smith   }
2684ebada01fSBarry Smith 
2685ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
2686*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2687*9371c9d4SSatish Balay     ii++;
2688ebada01fSBarry Smith     if (usecprow) {
2689ebada01fSBarry Smith       z = zarray + 11 * ridx[i];
2690ebada01fSBarry Smith       y = yarray + 11 * ridx[i];
2691ebada01fSBarry Smith     }
2692*9371c9d4SSatish Balay     sum1  = y[0];
2693*9371c9d4SSatish Balay     sum2  = y[1];
2694*9371c9d4SSatish Balay     sum3  = y[2];
2695*9371c9d4SSatish Balay     sum4  = y[3];
2696*9371c9d4SSatish Balay     sum5  = y[4];
2697*9371c9d4SSatish Balay     sum6  = y[5];
2698*9371c9d4SSatish Balay     sum7  = y[6];
2699*9371c9d4SSatish Balay     sum8  = y[7];
2700*9371c9d4SSatish Balay     sum9  = y[8];
2701*9371c9d4SSatish Balay     sum10 = y[9];
2702*9371c9d4SSatish Balay     sum11 = y[10];
2703ebada01fSBarry Smith     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2704ebada01fSBarry Smith     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2705ebada01fSBarry Smith     for (j = 0; j < n; j++) {
2706ebada01fSBarry Smith       xb  = x + 11 * (*idx++);
2707*9371c9d4SSatish Balay       x1  = xb[0];
2708*9371c9d4SSatish Balay       x2  = xb[1];
2709*9371c9d4SSatish Balay       x3  = xb[2];
2710*9371c9d4SSatish Balay       x4  = xb[3];
2711*9371c9d4SSatish Balay       x5  = xb[4];
2712*9371c9d4SSatish Balay       x6  = xb[5];
2713*9371c9d4SSatish Balay       x7  = xb[6];
2714*9371c9d4SSatish Balay       x8  = xb[7];
2715*9371c9d4SSatish Balay       x9  = xb[8];
2716*9371c9d4SSatish Balay       x10 = xb[9];
2717*9371c9d4SSatish Balay       x11 = xb[10];
2718ebada01fSBarry Smith       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2719ebada01fSBarry Smith       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2720ebada01fSBarry Smith       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2721ebada01fSBarry Smith       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2722ebada01fSBarry Smith       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2723ebada01fSBarry Smith       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2724ebada01fSBarry Smith       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2725ebada01fSBarry Smith       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2726ebada01fSBarry Smith       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2727ebada01fSBarry Smith       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2728ebada01fSBarry Smith       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2729ebada01fSBarry Smith       v += 121;
2730ebada01fSBarry Smith     }
2731*9371c9d4SSatish Balay     z[0]  = sum1;
2732*9371c9d4SSatish Balay     z[1]  = sum2;
2733*9371c9d4SSatish Balay     z[2]  = sum3;
2734*9371c9d4SSatish Balay     z[3]  = sum4;
2735*9371c9d4SSatish Balay     z[4]  = sum5;
2736*9371c9d4SSatish Balay     z[5]  = sum6;
2737*9371c9d4SSatish Balay     z[6]  = sum7;
2738*9371c9d4SSatish Balay     z[7]  = sum8;
2739*9371c9d4SSatish Balay     z[8]  = sum9;
2740*9371c9d4SSatish Balay     z[9]  = sum10;
2741*9371c9d4SSatish Balay     z[10] = sum11;
2742ebada01fSBarry Smith     if (!usecprow) {
2743*9371c9d4SSatish Balay       z += 11;
2744*9371c9d4SSatish Balay       y += 11;
2745ebada01fSBarry Smith     }
2746ebada01fSBarry Smith   }
27479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
27489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
27499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz));
2750ebada01fSBarry Smith   PetscFunctionReturn(0);
2751ebada01fSBarry Smith }
2752ebada01fSBarry Smith 
2753*9371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
27542d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2755f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2756d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2757d9ca1df4SBarry Smith   const MatScalar   *v;
2758d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2759d9ca1df4SBarry Smith   PetscInt           ncols, k;
2760d9ca1df4SBarry Smith   const PetscInt    *ridx     = NULL, *idx, *ii;
2761ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2762218c64b6SSatish Balay 
27632d61bbb3SSatish Balay   PetscFunctionBegin;
27649566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
27659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
27669566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
27672d61bbb3SSatish Balay 
27682d61bbb3SSatish Balay   idx = a->j;
27692d61bbb3SSatish Balay   v   = a->a;
277026e093fcSHong Zhang   if (usecprow) {
277126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
277226e093fcSHong Zhang     ii   = a->compressedrow.i;
27737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
277426e093fcSHong Zhang   } else {
277526e093fcSHong Zhang     mbs = a->mbs;
27762d61bbb3SSatish Balay     ii  = a->i;
277726e093fcSHong Zhang     z   = zarray;
277826e093fcSHong Zhang   }
27792d61bbb3SSatish Balay 
27802d61bbb3SSatish Balay   if (!a->mult_work) {
2781d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
27829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
27832d61bbb3SSatish Balay   }
27842d61bbb3SSatish Balay   work = a->mult_work;
27852d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2786*9371c9d4SSatish Balay     n = ii[1] - ii[0];
2787*9371c9d4SSatish Balay     ii++;
27882d61bbb3SSatish Balay     ncols = n * bs;
27892d61bbb3SSatish Balay     workt = work;
27902d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
27912d61bbb3SSatish Balay       xb = x + bs * (*idx++);
27922d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
27932d61bbb3SSatish Balay       workt += bs;
27942d61bbb3SSatish Balay     }
27957b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
279696b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
27972d61bbb3SSatish Balay     v += n * bs2;
279826fbe8dcSKarl Rupp     if (!usecprow) z += bs;
279926e093fcSHong Zhang   }
28009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
28019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
28029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
28032d61bbb3SSatish Balay   PetscFunctionReturn(0);
28042d61bbb3SSatish Balay }
28052d61bbb3SSatish Balay 
2806*9371c9d4SSatish Balay PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2807547795f9SHong Zhang   PetscScalar zero = 0.0;
2808547795f9SHong Zhang 
2809547795f9SHong Zhang   PetscFunctionBegin;
28109566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28119566063dSJacob Faibussowitsch   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812547795f9SHong Zhang   PetscFunctionReturn(0);
2813547795f9SHong Zhang }
2814547795f9SHong Zhang 
2815*9371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
28163447b6efSHong Zhang   PetscScalar zero = 0.0;
28172d61bbb3SSatish Balay 
28182d61bbb3SSatish Balay   PetscFunctionBegin;
28199566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28209566063dSJacob Faibussowitsch   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
28212d61bbb3SSatish Balay   PetscFunctionReturn(0);
28222d61bbb3SSatish Balay }
28232d61bbb3SSatish Balay 
2824*9371c9d4SSatish Balay PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2825547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2826b8c08b77SHong Zhang   PetscScalar       *z, x1, x2, x3, x4, x5;
2827d9ca1df4SBarry Smith   const PetscScalar *x, *xb = NULL;
2828d9ca1df4SBarry Smith   const MatScalar   *v;
2829b8c08b77SHong Zhang   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2830d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2831547795f9SHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2832ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
2833547795f9SHong Zhang 
2834547795f9SHong Zhang   PetscFunctionBegin;
28359566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
28369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28379566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
2838547795f9SHong Zhang 
2839547795f9SHong Zhang   idx = a->j;
2840547795f9SHong Zhang   v   = a->a;
2841547795f9SHong Zhang   if (usecprow) {
2842547795f9SHong Zhang     mbs  = cprow.nrows;
2843547795f9SHong Zhang     ii   = cprow.i;
2844547795f9SHong Zhang     ridx = cprow.rindex;
2845547795f9SHong Zhang   } else {
2846547795f9SHong Zhang     mbs = a->mbs;
2847547795f9SHong Zhang     ii  = a->i;
2848547795f9SHong Zhang     xb  = x;
2849547795f9SHong Zhang   }
2850547795f9SHong Zhang 
2851547795f9SHong Zhang   switch (bs) {
2852547795f9SHong Zhang   case 1:
2853547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2854547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2855547795f9SHong Zhang       x1 = xb[0];
2856547795f9SHong Zhang       ib = idx + ii[0];
2857*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
2858*9371c9d4SSatish Balay       ii++;
2859547795f9SHong Zhang       for (j = 0; j < n; j++) {
2860547795f9SHong Zhang         rval = ib[j];
2861547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2862547795f9SHong Zhang         v++;
2863547795f9SHong Zhang       }
2864547795f9SHong Zhang       if (!usecprow) xb++;
2865547795f9SHong Zhang     }
2866547795f9SHong Zhang     break;
2867547795f9SHong Zhang   case 2:
2868547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2869547795f9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
2870*9371c9d4SSatish Balay       x1 = xb[0];
2871*9371c9d4SSatish Balay       x2 = xb[1];
2872547795f9SHong Zhang       ib = idx + ii[0];
2873*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
2874*9371c9d4SSatish Balay       ii++;
2875547795f9SHong Zhang       for (j = 0; j < n; j++) {
2876547795f9SHong Zhang         rval = ib[j] * 2;
2877547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878547795f9SHong Zhang         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879547795f9SHong Zhang         v += 4;
2880547795f9SHong Zhang       }
2881547795f9SHong Zhang       if (!usecprow) xb += 2;
2882547795f9SHong Zhang     }
2883547795f9SHong Zhang     break;
2884547795f9SHong Zhang   case 3:
2885547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2886547795f9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
2887*9371c9d4SSatish Balay       x1 = xb[0];
2888*9371c9d4SSatish Balay       x2 = xb[1];
2889*9371c9d4SSatish Balay       x3 = xb[2];
2890547795f9SHong Zhang       ib = idx + ii[0];
2891*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
2892*9371c9d4SSatish Balay       ii++;
2893547795f9SHong Zhang       for (j = 0; j < n; j++) {
2894547795f9SHong Zhang         rval = ib[j] * 3;
2895547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896547795f9SHong Zhang         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897547795f9SHong Zhang         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898547795f9SHong Zhang         v += 9;
2899547795f9SHong Zhang       }
2900547795f9SHong Zhang       if (!usecprow) xb += 3;
2901547795f9SHong Zhang     }
2902547795f9SHong Zhang     break;
2903547795f9SHong Zhang   case 4:
2904547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2905547795f9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
2906*9371c9d4SSatish Balay       x1 = xb[0];
2907*9371c9d4SSatish Balay       x2 = xb[1];
2908*9371c9d4SSatish Balay       x3 = xb[2];
2909*9371c9d4SSatish Balay       x4 = xb[3];
2910547795f9SHong Zhang       ib = idx + ii[0];
2911*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
2912*9371c9d4SSatish Balay       ii++;
2913547795f9SHong Zhang       for (j = 0; j < n; j++) {
2914547795f9SHong Zhang         rval = ib[j] * 4;
2915547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916547795f9SHong Zhang         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917547795f9SHong Zhang         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918547795f9SHong Zhang         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919547795f9SHong Zhang         v += 16;
2920547795f9SHong Zhang       }
2921547795f9SHong Zhang       if (!usecprow) xb += 4;
2922547795f9SHong Zhang     }
2923547795f9SHong Zhang     break;
2924547795f9SHong Zhang   case 5:
2925547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2926547795f9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
2927*9371c9d4SSatish Balay       x1 = xb[0];
2928*9371c9d4SSatish Balay       x2 = xb[1];
2929*9371c9d4SSatish Balay       x3 = xb[2];
2930*9371c9d4SSatish Balay       x4 = xb[3];
2931*9371c9d4SSatish Balay       x5 = xb[4];
2932547795f9SHong Zhang       ib = idx + ii[0];
2933*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
2934*9371c9d4SSatish Balay       ii++;
2935547795f9SHong Zhang       for (j = 0; j < n; j++) {
2936547795f9SHong Zhang         rval = ib[j] * 5;
2937547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938547795f9SHong Zhang         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939547795f9SHong Zhang         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940547795f9SHong Zhang         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941547795f9SHong Zhang         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942547795f9SHong Zhang         v += 25;
2943547795f9SHong Zhang       }
2944547795f9SHong Zhang       if (!usecprow) xb += 5;
2945547795f9SHong Zhang     }
2946547795f9SHong Zhang     break;
2947*9371c9d4SSatish Balay   default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2948968ae2c8SSatish Balay #if 0
2949968ae2c8SSatish Balay     {
2950b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
2951b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
2952d9ca1df4SBarry Smith       const PetscScalar *xtmp;
2953547795f9SHong Zhang       if (!a->mult_work) {
2954547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
29559566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2956547795f9SHong Zhang       }
2957547795f9SHong Zhang       work = a->mult_work;
2958547795f9SHong Zhang       xtmp = x;
2959547795f9SHong Zhang       for (i=0; i<mbs; i++) {
2960547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
2961547795f9SHong Zhang         ncols = n*bs;
29629566063dSJacob Faibussowitsch         PetscCall(PetscArrayzero(work,ncols));
296326fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
296496b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2965547795f9SHong Zhang         v += n*bs2;
2966547795f9SHong Zhang         if (!usecprow) xtmp += bs;
2967547795f9SHong Zhang         workt = work;
2968547795f9SHong Zhang         for (j=0; j<n; j++) {
2969547795f9SHong Zhang           zb = z + bs*(*idx++);
2970547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2971547795f9SHong Zhang           workt += bs;
2972547795f9SHong Zhang         }
2973547795f9SHong Zhang       }
2974547795f9SHong Zhang     }
2975968ae2c8SSatish Balay #endif
2976547795f9SHong Zhang   }
29779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
29789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
29799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2980547795f9SHong Zhang   PetscFunctionReturn(0);
2981547795f9SHong Zhang }
2982547795f9SHong Zhang 
2983*9371c9d4SSatish Balay PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
29842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2985d9ca1df4SBarry Smith   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2986f4259b30SLisandro Dalcin   const PetscScalar *x, *xb = NULL;
2987d9ca1df4SBarry Smith   const MatScalar   *v;
2988d9ca1df4SBarry Smith   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2989d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
29903447b6efSHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2991ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
29922d61bbb3SSatish Balay 
29932d61bbb3SSatish Balay   PetscFunctionBegin;
29949566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
29959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
29969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
29972d61bbb3SSatish Balay 
29982d61bbb3SSatish Balay   idx = a->j;
29992d61bbb3SSatish Balay   v   = a->a;
30003447b6efSHong Zhang   if (usecprow) {
30013447b6efSHong Zhang     mbs  = cprow.nrows;
30023447b6efSHong Zhang     ii   = cprow.i;
30037b2bb3b9SHong Zhang     ridx = cprow.rindex;
30043447b6efSHong Zhang   } else {
30053447b6efSHong Zhang     mbs = a->mbs;
30062d61bbb3SSatish Balay     ii  = a->i;
3007f1af5d2fSBarry Smith     xb  = x;
30083447b6efSHong Zhang   }
30092d61bbb3SSatish Balay 
30102d61bbb3SSatish Balay   switch (bs) {
30112d61bbb3SSatish Balay   case 1:
30122d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30137b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
3014f1af5d2fSBarry Smith       x1 = xb[0];
30153447b6efSHong Zhang       ib = idx + ii[0];
3016*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
3017*9371c9d4SSatish Balay       ii++;
30182d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30192d61bbb3SSatish Balay         rval = ib[j];
3020f1af5d2fSBarry Smith         z[rval] += *v * x1;
3021f1af5d2fSBarry Smith         v++;
30222d61bbb3SSatish Balay       }
30233447b6efSHong Zhang       if (!usecprow) xb++;
30242d61bbb3SSatish Balay     }
30252d61bbb3SSatish Balay     break;
30262d61bbb3SSatish Balay   case 2:
30272d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30287b2bb3b9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
3029*9371c9d4SSatish Balay       x1 = xb[0];
3030*9371c9d4SSatish Balay       x2 = xb[1];
30313447b6efSHong Zhang       ib = idx + ii[0];
3032*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
3033*9371c9d4SSatish Balay       ii++;
30342d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30352d61bbb3SSatish Balay         rval = ib[j] * 2;
30362d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2;
30372d61bbb3SSatish Balay         z[rval++] += v[2] * x1 + v[3] * x2;
30382d61bbb3SSatish Balay         v += 4;
30392d61bbb3SSatish Balay       }
30403447b6efSHong Zhang       if (!usecprow) xb += 2;
30412d61bbb3SSatish Balay     }
30422d61bbb3SSatish Balay     break;
30432d61bbb3SSatish Balay   case 3:
30442d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30457b2bb3b9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
3046*9371c9d4SSatish Balay       x1 = xb[0];
3047*9371c9d4SSatish Balay       x2 = xb[1];
3048*9371c9d4SSatish Balay       x3 = xb[2];
30493447b6efSHong Zhang       ib = idx + ii[0];
3050*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
3051*9371c9d4SSatish Balay       ii++;
30522d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30532d61bbb3SSatish Balay         rval = ib[j] * 3;
30542d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
30552d61bbb3SSatish Balay         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
30562d61bbb3SSatish Balay         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
30572d61bbb3SSatish Balay         v += 9;
30582d61bbb3SSatish Balay       }
30593447b6efSHong Zhang       if (!usecprow) xb += 3;
30602d61bbb3SSatish Balay     }
30612d61bbb3SSatish Balay     break;
30622d61bbb3SSatish Balay   case 4:
30632d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30647b2bb3b9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
3065*9371c9d4SSatish Balay       x1 = xb[0];
3066*9371c9d4SSatish Balay       x2 = xb[1];
3067*9371c9d4SSatish Balay       x3 = xb[2];
3068*9371c9d4SSatish Balay       x4 = xb[3];
30693447b6efSHong Zhang       ib = idx + ii[0];
3070*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
3071*9371c9d4SSatish Balay       ii++;
30722d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30732d61bbb3SSatish Balay         rval = ib[j] * 4;
30742d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
30752d61bbb3SSatish Balay         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
30762d61bbb3SSatish Balay         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
30772d61bbb3SSatish Balay         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
30782d61bbb3SSatish Balay         v += 16;
30792d61bbb3SSatish Balay       }
30803447b6efSHong Zhang       if (!usecprow) xb += 4;
30812d61bbb3SSatish Balay     }
30822d61bbb3SSatish Balay     break;
30832d61bbb3SSatish Balay   case 5:
30842d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30857b2bb3b9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
3086*9371c9d4SSatish Balay       x1 = xb[0];
3087*9371c9d4SSatish Balay       x2 = xb[1];
3088*9371c9d4SSatish Balay       x3 = xb[2];
3089*9371c9d4SSatish Balay       x4 = xb[3];
3090*9371c9d4SSatish Balay       x5 = xb[4];
30913447b6efSHong Zhang       ib = idx + ii[0];
3092*9371c9d4SSatish Balay       n  = ii[1] - ii[0];
3093*9371c9d4SSatish Balay       ii++;
30942d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30952d61bbb3SSatish Balay         rval = ib[j] * 5;
30962d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
30972d61bbb3SSatish Balay         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
30982d61bbb3SSatish Balay         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
30992d61bbb3SSatish Balay         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
31002d61bbb3SSatish Balay         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
31012d61bbb3SSatish Balay         v += 25;
31022d61bbb3SSatish Balay       }
31033447b6efSHong Zhang       if (!usecprow) xb += 5;
31042d61bbb3SSatish Balay     }
31052d61bbb3SSatish Balay     break;
3106f1af5d2fSBarry Smith   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3107690b6cddSBarry Smith     PetscInt           ncols, k;
3108d9ca1df4SBarry Smith     PetscScalar       *work, *workt;
3109d9ca1df4SBarry Smith     const PetscScalar *xtmp;
31102d61bbb3SSatish Balay     if (!a->mult_work) {
3111d0f46423SBarry Smith       k = PetscMax(A->rmap->n, A->cmap->n);
31129566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
31132d61bbb3SSatish Balay     }
31142d61bbb3SSatish Balay     work = a->mult_work;
31153447b6efSHong Zhang     xtmp = x;
31162d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
3117*9371c9d4SSatish Balay       n = ii[1] - ii[0];
3118*9371c9d4SSatish Balay       ii++;
31192d61bbb3SSatish Balay       ncols = n * bs;
31209566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(work, ncols));
312126fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs * ridx[i];
312296b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
31232d61bbb3SSatish Balay       v += n * bs2;
31243447b6efSHong Zhang       if (!usecprow) xtmp += bs;
31252d61bbb3SSatish Balay       workt = work;
31262d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31272d61bbb3SSatish Balay         zb = z + bs * (*idx++);
31282d61bbb3SSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
31292d61bbb3SSatish Balay         workt += bs;
31302d61bbb3SSatish Balay       }
31312d61bbb3SSatish Balay     }
31322d61bbb3SSatish Balay   }
31332d61bbb3SSatish Balay   }
31349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
31359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
31369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
31372d61bbb3SSatish Balay   PetscFunctionReturn(0);
31382d61bbb3SSatish Balay }
31392d61bbb3SSatish Balay 
3140*9371c9d4SSatish Balay PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) {
31412d61bbb3SSatish Balay   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3142690b6cddSBarry Smith   PetscInt     totalnz = a->bs2 * a->nz;
3143f4df32b1SMatthew Knepley   PetscScalar  oalpha  = alpha;
3144c5df96a5SBarry Smith   PetscBLASInt one     = 1, tnz;
31452d61bbb3SSatish Balay 
31462d61bbb3SSatish Balay   PetscFunctionBegin;
31479566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3148792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
31499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
31502d61bbb3SSatish Balay   PetscFunctionReturn(0);
31512d61bbb3SSatish Balay }
31522d61bbb3SSatish Balay 
3153*9371c9d4SSatish Balay PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) {
31542d61bbb3SSatish Balay   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
31553f1db9ecSBarry Smith   MatScalar   *v   = a->a;
3156329f5518SBarry Smith   PetscReal    sum = 0.0;
3157d0f46423SBarry Smith   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
31582d61bbb3SSatish Balay 
31592d61bbb3SSatish Balay   PetscFunctionBegin;
31602d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
3161570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
3162570b7f6dSBarry Smith     PetscBLASInt one = 1, cnt = bs2 * nz;
3163792fecdfSBarry Smith     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3164570b7f6dSBarry Smith #else
31652d61bbb3SSatish Balay     for (i = 0; i < bs2 * nz; i++) {
3166*9371c9d4SSatish Balay       sum += PetscRealPart(PetscConj(*v) * (*v));
3167*9371c9d4SSatish Balay       v++;
31682d61bbb3SSatish Balay     }
3169570b7f6dSBarry Smith #endif
31708f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
31719566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
31728a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
31738a62d963SHong Zhang     PetscReal *tmp;
31748a62d963SHong Zhang     PetscInt  *bcol = a->j;
31759566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
31768a62d963SHong Zhang     for (i = 0; i < nz; i++) {
31778a62d963SHong Zhang       for (j = 0; j < bs; j++) {
31788a62d963SHong Zhang         k1 = bs * (*bcol) + j; /* column index */
31798a62d963SHong Zhang         for (k = 0; k < bs; k++) {
3180*9371c9d4SSatish Balay           tmp[k1] += PetscAbsScalar(*v);
3181*9371c9d4SSatish Balay           v++;
31828a62d963SHong Zhang         }
31838a62d963SHong Zhang       }
31848a62d963SHong Zhang       bcol++;
31858a62d963SHong Zhang     }
31868a62d963SHong Zhang     *norm = 0.0;
3187d0f46423SBarry Smith     for (j = 0; j < A->cmap->n; j++) {
31888a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
31898a62d963SHong Zhang     }
31909566063dSJacob Faibussowitsch     PetscCall(PetscFree(tmp));
31919566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3192596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
3193596552b5SBarry Smith     *norm = 0.0;
3194596552b5SBarry Smith     for (k = 0; k < bs; k++) {
319574f84c7bSSatish Balay       for (j = 0; j < a->mbs; j++) {
3196596552b5SBarry Smith         v   = a->a + bs2 * a->i[j] + k;
3197596552b5SBarry Smith         sum = 0.0;
3198596552b5SBarry Smith         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
31990e90e235SBarry Smith           for (k1 = 0; k1 < bs; k1++) {
3200596552b5SBarry Smith             sum += PetscAbsScalar(*v);
3201596552b5SBarry Smith             v += bs;
32022d61bbb3SSatish Balay           }
32030e90e235SBarry Smith         }
3204596552b5SBarry Smith         if (sum > *norm) *norm = sum;
3205596552b5SBarry Smith       }
3206596552b5SBarry Smith     }
32079566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3208e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
32092d61bbb3SSatish Balay   PetscFunctionReturn(0);
32102d61bbb3SSatish Balay }
32112d61bbb3SSatish Balay 
3212*9371c9d4SSatish Balay PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) {
32132d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
32142d61bbb3SSatish Balay 
32152d61bbb3SSatish Balay   PetscFunctionBegin;
32162d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3217d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3218273d9f13SBarry Smith     *flg = PETSC_FALSE;
3219273d9f13SBarry Smith     PetscFunctionReturn(0);
32202d61bbb3SSatish Balay   }
32212d61bbb3SSatish Balay 
32222d61bbb3SSatish Balay   /* if the a->i are the same */
32239566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
322426fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
32252d61bbb3SSatish Balay 
32262d61bbb3SSatish Balay   /* if a->j are the same */
32279566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
322826fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
322926fbe8dcSKarl Rupp 
32302d61bbb3SSatish Balay   /* if a->a are the same */
32319566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
32322d61bbb3SSatish Balay   PetscFunctionReturn(0);
32332d61bbb3SSatish Balay }
32342d61bbb3SSatish Balay 
3235*9371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) {
32362d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3237690b6cddSBarry Smith   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
323887828ca2SBarry Smith   PetscScalar *x, zero = 0.0;
32393f1db9ecSBarry Smith   MatScalar   *aa, *aa_j;
32402d61bbb3SSatish Balay 
32412d61bbb3SSatish Balay   PetscFunctionBegin;
324228b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3243d0f46423SBarry Smith   bs   = A->rmap->bs;
32442d61bbb3SSatish Balay   aa   = a->a;
32452d61bbb3SSatish Balay   ai   = a->i;
32462d61bbb3SSatish Balay   aj   = a->j;
32472d61bbb3SSatish Balay   ambs = a->mbs;
32482d61bbb3SSatish Balay   bs2  = a->bs2;
32492d61bbb3SSatish Balay 
32509566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
32519566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
32529566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
325308401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
32542d61bbb3SSatish Balay   for (i = 0; i < ambs; i++) {
32552d61bbb3SSatish Balay     for (j = ai[i]; j < ai[i + 1]; j++) {
32562d61bbb3SSatish Balay       if (aj[j] == i) {
32572d61bbb3SSatish Balay         row  = i * bs;
32582d61bbb3SSatish Balay         aa_j = aa + j * bs2;
32592d61bbb3SSatish Balay         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
32602d61bbb3SSatish Balay         break;
32612d61bbb3SSatish Balay       }
32622d61bbb3SSatish Balay     }
32632d61bbb3SSatish Balay   }
32649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
32652d61bbb3SSatish Balay   PetscFunctionReturn(0);
32662d61bbb3SSatish Balay }
32672d61bbb3SSatish Balay 
3268*9371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) {
32692d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
327053ef36baSBarry Smith   const PetscScalar *l, *r, *li, *ri;
327153ef36baSBarry Smith   PetscScalar        x;
32723f1db9ecSBarry Smith   MatScalar         *aa, *v;
327353ef36baSBarry Smith   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
327453ef36baSBarry Smith   const PetscInt    *ai, *aj;
32752d61bbb3SSatish Balay 
32762d61bbb3SSatish Balay   PetscFunctionBegin;
32772d61bbb3SSatish Balay   ai  = a->i;
32782d61bbb3SSatish Balay   aj  = a->j;
32792d61bbb3SSatish Balay   aa  = a->a;
3280d0f46423SBarry Smith   m   = A->rmap->n;
3281d0f46423SBarry Smith   n   = A->cmap->n;
3282d0f46423SBarry Smith   bs  = A->rmap->bs;
32832d61bbb3SSatish Balay   mbs = a->mbs;
32842d61bbb3SSatish Balay   bs2 = a->bs2;
32852d61bbb3SSatish Balay   if (ll) {
32869566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(ll, &l));
32879566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(ll, &lm));
328808401ef6SPierre Jolivet     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
32892d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
32902d61bbb3SSatish Balay       M  = ai[i + 1] - ai[i];
32912d61bbb3SSatish Balay       li = l + i * bs;
32922d61bbb3SSatish Balay       v  = aa + bs2 * ai[i];
32932d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
3294*9371c9d4SSatish Balay         for (k = 0; k < bs2; k++) { (*v++) *= li[k % bs]; }
32952d61bbb3SSatish Balay       }
32962d61bbb3SSatish Balay     }
32979566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(ll, &l));
32989566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
32992d61bbb3SSatish Balay   }
33002d61bbb3SSatish Balay 
33012d61bbb3SSatish Balay   if (rr) {
33029566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(rr, &r));
33039566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(rr, &rn));
330408401ef6SPierre Jolivet     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
33052d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
330653ef36baSBarry Smith       iai = ai[i];
330753ef36baSBarry Smith       M   = ai[i + 1] - iai;
330853ef36baSBarry Smith       v   = aa + bs2 * iai;
33092d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
331053ef36baSBarry Smith         ri = r + bs * aj[iai + j];
33112d61bbb3SSatish Balay         for (k = 0; k < bs; k++) {
33122d61bbb3SSatish Balay           x = ri[k];
331353ef36baSBarry Smith           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
331453ef36baSBarry Smith           v += bs;
33152d61bbb3SSatish Balay         }
33162d61bbb3SSatish Balay       }
33172d61bbb3SSatish Balay     }
33189566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(rr, &r));
33199566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33202d61bbb3SSatish Balay   }
33212d61bbb3SSatish Balay   PetscFunctionReturn(0);
33222d61bbb3SSatish Balay }
33232d61bbb3SSatish Balay 
3324*9371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
33252d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33262d61bbb3SSatish Balay 
33272d61bbb3SSatish Balay   PetscFunctionBegin;
33282d61bbb3SSatish Balay   info->block_size   = a->bs2;
3329ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz;
33302d61bbb3SSatish Balay   info->nz_used      = a->bs2 * a->nz;
33313966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
33322d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
33338e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
33347adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
3335d5f3da31SBarry Smith   if (A->factortype) {
33362d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
33372d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
33382d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
33392d61bbb3SSatish Balay   } else {
33402d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
33412d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
33422d61bbb3SSatish Balay     info->factor_mallocs    = 0;
33432d61bbb3SSatish Balay   }
33442d61bbb3SSatish Balay   PetscFunctionReturn(0);
33452d61bbb3SSatish Balay }
33462d61bbb3SSatish Balay 
3347*9371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) {
33482d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33492d61bbb3SSatish Balay 
33502d61bbb3SSatish Balay   PetscFunctionBegin;
33519566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
33522d61bbb3SSatish Balay   PetscFunctionReturn(0);
33532d61bbb3SSatish Balay }
3354a001520aSPierre Jolivet 
3355*9371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
3356a001520aSPierre Jolivet   PetscFunctionBegin;
33579566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
33584222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3359a001520aSPierre Jolivet   PetscFunctionReturn(0);
3360a001520aSPierre Jolivet }
3361a001520aSPierre Jolivet 
3362*9371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
336374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3364f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1;
3365bcf10a7aSPierre Jolivet   const PetscScalar *xb;
336674eeabc5SPierre Jolivet   PetscScalar        x1;
336774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
336874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
336974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
337074eeabc5SPierre Jolivet 
337174eeabc5SPierre Jolivet   PetscFunctionBegin;
337274eeabc5SPierre Jolivet   idx = a->j;
337374eeabc5SPierre Jolivet   v   = a->a;
337474eeabc5SPierre Jolivet   if (usecprow) {
337574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
337674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
337774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
337874eeabc5SPierre Jolivet   } else {
337974eeabc5SPierre Jolivet     mbs = a->mbs;
338074eeabc5SPierre Jolivet     ii  = a->i;
338174eeabc5SPierre Jolivet     z   = c;
338274eeabc5SPierre Jolivet   }
338374eeabc5SPierre Jolivet 
338474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
3385*9371c9d4SSatish Balay     n = ii[1] - ii[0];
3386*9371c9d4SSatish Balay     ii++;
338774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
338874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
338974eeabc5SPierre Jolivet     if (usecprow) z = c + ridx[i];
339074eeabc5SPierre Jolivet     jj = idx;
339174eeabc5SPierre Jolivet     vv = v;
339274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
339374eeabc5SPierre Jolivet       idx  = jj;
339474eeabc5SPierre Jolivet       v    = vv;
339574eeabc5SPierre Jolivet       sum1 = 0.0;
339674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
3397*9371c9d4SSatish Balay         xb = b + (*idx++);
3398*9371c9d4SSatish Balay         x1 = xb[0 + k * bm];
339974eeabc5SPierre Jolivet         sum1 += v[0] * x1;
340074eeabc5SPierre Jolivet         v += 1;
340174eeabc5SPierre Jolivet       }
3402feb237baSPierre Jolivet       z[0 + k * cm] = sum1;
340374eeabc5SPierre Jolivet     }
340474eeabc5SPierre Jolivet     if (!usecprow) z += 1;
340574eeabc5SPierre Jolivet   }
340674eeabc5SPierre Jolivet   PetscFunctionReturn(0);
340774eeabc5SPierre Jolivet }
340874eeabc5SPierre Jolivet 
3409*9371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
34104b7054f4SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3411f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2;
3412bcf10a7aSPierre Jolivet   const PetscScalar *xb;
34134b7054f4SPierre Jolivet   PetscScalar        x1, x2;
34144b7054f4SPierre Jolivet   const MatScalar   *v, *vv;
34154b7054f4SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
34164b7054f4SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
34174b7054f4SPierre Jolivet 
34184b7054f4SPierre Jolivet   PetscFunctionBegin;
34194b7054f4SPierre Jolivet   idx = a->j;
34204b7054f4SPierre Jolivet   v   = a->a;
34214b7054f4SPierre Jolivet   if (usecprow) {
34224b7054f4SPierre Jolivet     mbs  = a->compressedrow.nrows;
34234b7054f4SPierre Jolivet     ii   = a->compressedrow.i;
34244b7054f4SPierre Jolivet     ridx = a->compressedrow.rindex;
34254b7054f4SPierre Jolivet   } else {
34264b7054f4SPierre Jolivet     mbs = a->mbs;
34274b7054f4SPierre Jolivet     ii  = a->i;
34284b7054f4SPierre Jolivet     z   = c;
34294b7054f4SPierre Jolivet   }
34304b7054f4SPierre Jolivet 
34314b7054f4SPierre Jolivet   for (i = 0; i < mbs; i++) {
3432*9371c9d4SSatish Balay     n = ii[1] - ii[0];
3433*9371c9d4SSatish Balay     ii++;
34344b7054f4SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
34354b7054f4SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
34364b7054f4SPierre Jolivet     if (usecprow) z = c + 2 * ridx[i];
34374b7054f4SPierre Jolivet     jj = idx;
34384b7054f4SPierre Jolivet     vv = v;
34394b7054f4SPierre Jolivet     for (k = 0; k < cn; k++) {
34404b7054f4SPierre Jolivet       idx  = jj;
34414b7054f4SPierre Jolivet       v    = vv;
3442*9371c9d4SSatish Balay       sum1 = 0.0;
3443*9371c9d4SSatish Balay       sum2 = 0.0;
34444b7054f4SPierre Jolivet       for (j = 0; j < n; j++) {
3445*9371c9d4SSatish Balay         xb = b + 2 * (*idx++);
3446*9371c9d4SSatish Balay         x1 = xb[0 + k * bm];
3447*9371c9d4SSatish Balay         x2 = xb[1 + k * bm];
34484b7054f4SPierre Jolivet         sum1 += v[0] * x1 + v[2] * x2;
34494b7054f4SPierre Jolivet         sum2 += v[1] * x1 + v[3] * x2;
34504b7054f4SPierre Jolivet         v += 4;
34514b7054f4SPierre Jolivet       }
3452*9371c9d4SSatish Balay       z[0 + k * cm] = sum1;
3453*9371c9d4SSatish Balay       z[1 + k * cm] = sum2;
34544b7054f4SPierre Jolivet     }
34554b7054f4SPierre Jolivet     if (!usecprow) z += 2;
34564b7054f4SPierre Jolivet   }
34574b7054f4SPierre Jolivet   PetscFunctionReturn(0);
34584b7054f4SPierre Jolivet }
34594b7054f4SPierre Jolivet 
3460*9371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
346174eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3462f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3;
3463bcf10a7aSPierre Jolivet   const PetscScalar *xb;
346474eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3;
346574eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
346674eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
346774eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
346874eeabc5SPierre Jolivet 
346974eeabc5SPierre Jolivet   PetscFunctionBegin;
347074eeabc5SPierre Jolivet   idx = a->j;
347174eeabc5SPierre Jolivet   v   = a->a;
347274eeabc5SPierre Jolivet   if (usecprow) {
347374eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
347474eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
347574eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
347674eeabc5SPierre Jolivet   } else {
347774eeabc5SPierre Jolivet     mbs = a->mbs;
347874eeabc5SPierre Jolivet     ii  = a->i;
347974eeabc5SPierre Jolivet     z   = c;
348074eeabc5SPierre Jolivet   }
348174eeabc5SPierre Jolivet 
348274eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
3483*9371c9d4SSatish Balay     n = ii[1] - ii[0];
3484*9371c9d4SSatish Balay     ii++;
348574eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
348674eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
348774eeabc5SPierre Jolivet     if (usecprow) z = c + 3 * ridx[i];
348874eeabc5SPierre Jolivet     jj = idx;
348974eeabc5SPierre Jolivet     vv = v;
349074eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
349174eeabc5SPierre Jolivet       idx  = jj;
349274eeabc5SPierre Jolivet       v    = vv;
3493*9371c9d4SSatish Balay       sum1 = 0.0;
3494*9371c9d4SSatish Balay       sum2 = 0.0;
3495*9371c9d4SSatish Balay       sum3 = 0.0;
349674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
3497*9371c9d4SSatish Balay         xb = b + 3 * (*idx++);
3498*9371c9d4SSatish Balay         x1 = xb[0 + k * bm];
3499*9371c9d4SSatish Balay         x2 = xb[1 + k * bm];
3500*9371c9d4SSatish Balay         x3 = xb[2 + k * bm];
350174eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
350274eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
350374eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
350474eeabc5SPierre Jolivet         v += 9;
350574eeabc5SPierre Jolivet       }
3506*9371c9d4SSatish Balay       z[0 + k * cm] = sum1;
3507*9371c9d4SSatish Balay       z[1 + k * cm] = sum2;
3508*9371c9d4SSatish Balay       z[2 + k * cm] = sum3;
350974eeabc5SPierre Jolivet     }
351074eeabc5SPierre Jolivet     if (!usecprow) z += 3;
351174eeabc5SPierre Jolivet   }
351274eeabc5SPierre Jolivet   PetscFunctionReturn(0);
351374eeabc5SPierre Jolivet }
351474eeabc5SPierre Jolivet 
3515*9371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
351674eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3517f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3518bcf10a7aSPierre Jolivet   const PetscScalar *xb;
351974eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
352074eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
352174eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
352274eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
352374eeabc5SPierre Jolivet 
352474eeabc5SPierre Jolivet   PetscFunctionBegin;
352574eeabc5SPierre Jolivet   idx = a->j;
352674eeabc5SPierre Jolivet   v   = a->a;
352774eeabc5SPierre Jolivet   if (usecprow) {
352874eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
352974eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
353074eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
353174eeabc5SPierre Jolivet   } else {
353274eeabc5SPierre Jolivet     mbs = a->mbs;
353374eeabc5SPierre Jolivet     ii  = a->i;
353474eeabc5SPierre Jolivet     z   = c;
353574eeabc5SPierre Jolivet   }
353674eeabc5SPierre Jolivet 
353774eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
3538*9371c9d4SSatish Balay     n = ii[1] - ii[0];
3539*9371c9d4SSatish Balay     ii++;
354074eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
354174eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
354274eeabc5SPierre Jolivet     if (usecprow) z = c + 4 * ridx[i];
354374eeabc5SPierre Jolivet     jj = idx;
354474eeabc5SPierre Jolivet     vv = v;
354574eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
354674eeabc5SPierre Jolivet       idx  = jj;
354774eeabc5SPierre Jolivet       v    = vv;
3548*9371c9d4SSatish Balay       sum1 = 0.0;
3549*9371c9d4SSatish Balay       sum2 = 0.0;
3550*9371c9d4SSatish Balay       sum3 = 0.0;
3551*9371c9d4SSatish Balay       sum4 = 0.0;
355274eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
3553*9371c9d4SSatish Balay         xb = b + 4 * (*idx++);
3554*9371c9d4SSatish Balay         x1 = xb[0 + k * bm];
3555*9371c9d4SSatish Balay         x2 = xb[1 + k * bm];
3556*9371c9d4SSatish Balay         x3 = xb[2 + k * bm];
3557*9371c9d4SSatish Balay         x4 = xb[3 + k * bm];
355874eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
355974eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
356074eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
356174eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
356274eeabc5SPierre Jolivet         v += 16;
356374eeabc5SPierre Jolivet       }
3564*9371c9d4SSatish Balay       z[0 + k * cm] = sum1;
3565*9371c9d4SSatish Balay       z[1 + k * cm] = sum2;
3566*9371c9d4SSatish Balay       z[2 + k * cm] = sum3;
3567*9371c9d4SSatish Balay       z[3 + k * cm] = sum4;
356874eeabc5SPierre Jolivet     }
356974eeabc5SPierre Jolivet     if (!usecprow) z += 4;
357074eeabc5SPierre Jolivet   }
357174eeabc5SPierre Jolivet   PetscFunctionReturn(0);
357274eeabc5SPierre Jolivet }
357374eeabc5SPierre Jolivet 
3574*9371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
357574eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3576f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3577bcf10a7aSPierre Jolivet   const PetscScalar *xb;
357874eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
357974eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
358074eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
358174eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
358274eeabc5SPierre Jolivet 
358374eeabc5SPierre Jolivet   PetscFunctionBegin;
358474eeabc5SPierre Jolivet   idx = a->j;
358574eeabc5SPierre Jolivet   v   = a->a;
358674eeabc5SPierre Jolivet   if (usecprow) {
358774eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
358874eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
358974eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
359074eeabc5SPierre Jolivet   } else {
359174eeabc5SPierre Jolivet     mbs = a->mbs;
359274eeabc5SPierre Jolivet     ii  = a->i;
359374eeabc5SPierre Jolivet     z   = c;
359474eeabc5SPierre Jolivet   }
359574eeabc5SPierre Jolivet 
359674eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
3597*9371c9d4SSatish Balay     n = ii[1] - ii[0];
3598*9371c9d4SSatish Balay     ii++;
359974eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
360074eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
360174eeabc5SPierre Jolivet     if (usecprow) z = c + 5 * ridx[i];
360274eeabc5SPierre Jolivet     jj = idx;
360374eeabc5SPierre Jolivet     vv = v;
360474eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
360574eeabc5SPierre Jolivet       idx  = jj;
360674eeabc5SPierre Jolivet       v    = vv;
3607*9371c9d4SSatish Balay       sum1 = 0.0;
3608*9371c9d4SSatish Balay       sum2 = 0.0;
3609*9371c9d4SSatish Balay       sum3 = 0.0;
3610*9371c9d4SSatish Balay       sum4 = 0.0;
3611*9371c9d4SSatish Balay       sum5 = 0.0;
361274eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
3613*9371c9d4SSatish Balay         xb = b + 5 * (*idx++);
3614*9371c9d4SSatish Balay         x1 = xb[0 + k * bm];
3615*9371c9d4SSatish Balay         x2 = xb[1 + k * bm];
3616*9371c9d4SSatish Balay         x3 = xb[2 + k * bm];
3617*9371c9d4SSatish Balay         x4 = xb[3 + k * bm];
3618*9371c9d4SSatish Balay         x5 = xb[4 + k * bm];
361974eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
362074eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
362174eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
362274eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
362374eeabc5SPierre Jolivet         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
362474eeabc5SPierre Jolivet         v += 25;
362574eeabc5SPierre Jolivet       }
3626*9371c9d4SSatish Balay       z[0 + k * cm] = sum1;
3627*9371c9d4SSatish Balay       z[1 + k * cm] = sum2;
3628*9371c9d4SSatish Balay       z[2 + k * cm] = sum3;
3629*9371c9d4SSatish Balay       z[3 + k * cm] = sum4;
3630*9371c9d4SSatish Balay       z[4 + k * cm] = sum5;
363174eeabc5SPierre Jolivet     }
363274eeabc5SPierre Jolivet     if (!usecprow) z += 5;
363374eeabc5SPierre Jolivet   }
363474eeabc5SPierre Jolivet   PetscFunctionReturn(0);
363574eeabc5SPierre Jolivet }
363674eeabc5SPierre Jolivet 
3637*9371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
3638a001520aSPierre Jolivet   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3639a001520aSPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3640910cf402Sprj-   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3641bcf10a7aSPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3642a001520aSPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3643a001520aSPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
3644f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
3645a001520aSPierre Jolivet   PetscScalar     *c, *b;
3646a001520aSPierre Jolivet   const MatScalar *v;
3647a001520aSPierre Jolivet   const PetscInt  *idx, *ii, *ridx = NULL;
36484b7054f4SPierre Jolivet   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3649a001520aSPierre Jolivet   PetscBool        usecprow = a->compressedrow.use;
3650a001520aSPierre Jolivet 
3651a001520aSPierre Jolivet   PetscFunctionBegin;
3652a001520aSPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
365308401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
365408401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
365508401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3656a001520aSPierre Jolivet   b = bd->v;
3657*9371c9d4SSatish Balay   if (a->nonzerorowcnt != A->rmap->n) { PetscCall(MatZeroEntries(C)); }
36589566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
365974eeabc5SPierre Jolivet   switch (bs) {
3660*9371c9d4SSatish Balay   case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
3661*9371c9d4SSatish Balay   case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
3662*9371c9d4SSatish Balay   case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
3663*9371c9d4SSatish Balay   case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
3664*9371c9d4SSatish Balay   case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
366574eeabc5SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
36669566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
36679566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
36689566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
36699566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
3670a001520aSPierre Jolivet     idx = a->j;
3671a001520aSPierre Jolivet     v   = a->a;
3672a001520aSPierre Jolivet     if (usecprow) {
3673a001520aSPierre Jolivet       mbs  = a->compressedrow.nrows;
3674a001520aSPierre Jolivet       ii   = a->compressedrow.i;
3675a001520aSPierre Jolivet       ridx = a->compressedrow.rindex;
3676a001520aSPierre Jolivet     } else {
3677a001520aSPierre Jolivet       mbs = a->mbs;
3678a001520aSPierre Jolivet       ii  = a->i;
3679a001520aSPierre Jolivet       z   = c;
3680a001520aSPierre Jolivet     }
3681a001520aSPierre Jolivet     for (i = 0; i < mbs; i++) {
3682*9371c9d4SSatish Balay       n = ii[1] - ii[0];
3683*9371c9d4SSatish Balay       ii++;
3684a001520aSPierre Jolivet       if (usecprow) z = c + bs * ridx[i];
36854b7054f4SPierre Jolivet       if (n) {
3686792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
36874b7054f4SPierre Jolivet         v += bs2;
36884b7054f4SPierre Jolivet       }
36894b7054f4SPierre Jolivet       for (j = 1; j < n; j++) {
3690792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3691a001520aSPierre Jolivet         v += bs2;
3692a001520aSPierre Jolivet       }
3693a001520aSPierre Jolivet       if (!usecprow) z += bs;
3694a001520aSPierre Jolivet     }
36954b7054f4SPierre Jolivet   }
36969566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
36979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3698a001520aSPierre Jolivet   PetscFunctionReturn(0);
3699a001520aSPierre Jolivet }
3700