xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
896e086a2SDaniel Kokron #include <immintrin.h>
996e086a2SDaniel Kokron #endif
1096e086a2SDaniel Kokron 
119371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
12a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
135d0c19d7SBarry Smith   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
145d0c19d7SBarry Smith   const PetscInt *idx;
15690b6cddSBarry Smith   PetscInt        start, end, *ai, *aj, bs, *nidx2;
16f1af5d2fSBarry Smith   PetscBT         table;
17a3192f15SSatish Balay 
183a40ed3dSBarry Smith   PetscFunctionBegin;
19a3192f15SSatish Balay   m  = a->mbs;
20a3192f15SSatish Balay   ai = a->i;
21a3192f15SSatish Balay   aj = a->j;
22d0f46423SBarry Smith   bs = A->rmap->bs;
23a3192f15SSatish Balay 
2408401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
25a3192f15SSatish Balay 
269566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(m, &table));
279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &nidx));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
29a3192f15SSatish Balay 
30a3192f15SSatish Balay   for (i = 0; i < is_max; i++) {
31a3192f15SSatish Balay     /* Initialise the two local arrays */
32a3192f15SSatish Balay     isz = 0;
339566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(m, table));
34a3192f15SSatish Balay 
35a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
379566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
38a3192f15SSatish Balay 
39a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40a3192f15SSatish Balay     for (j = 0; j < n; ++j) {
41218c64b6SSatish Balay       ival = idx[j] / bs; /* convert the indices into block indices */
4208401ef6SPierre Jolivet       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
4326fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44a3192f15SSatish Balay     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47a3192f15SSatish Balay 
48a3192f15SSatish Balay     k = 0;
49a3192f15SSatish Balay     for (j = 0; j < ov; j++) { /* for each overlap*/
50a3192f15SSatish Balay       n = isz;
51a3192f15SSatish Balay       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52a3192f15SSatish Balay         row   = nidx[k];
53a3192f15SSatish Balay         start = ai[row];
54a3192f15SSatish Balay         end   = ai[row + 1];
55a3192f15SSatish Balay         for (l = start; l < end; l++) {
56a3192f15SSatish Balay           val = aj[l];
5726fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58a3192f15SSatish Balay         }
59a3192f15SSatish Balay       }
60a3192f15SSatish Balay     }
61218c64b6SSatish Balay     /* expand the Index Set */
62218c64b6SSatish Balay     for (j = 0; j < isz; j++) {
6326fbe8dcSKarl Rupp       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64218c64b6SSatish Balay     }
659566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
66a3192f15SSatish Balay   }
679566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table));
689566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
699566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
703a40ed3dSBarry Smith   PetscFunctionReturn(0);
71a3192f15SSatish Balay }
721c351548SSatish Balay 
739371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
74736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
75690b6cddSBarry Smith   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
76690b6cddSBarry Smith   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
775d0c19d7SBarry Smith   const PetscInt *irow, *icol;
785d0c19d7SBarry Smith   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
79690b6cddSBarry Smith   PetscInt       *aj = a->j, *ai = a->i;
803f1db9ecSBarry Smith   MatScalar      *mat_a;
81736121d4SSatish Balay   Mat             C;
826041f1b1SToby Isaac   PetscBool       flag;
83736121d4SSatish Balay 
843a40ed3dSBarry Smith   PetscFunctionBegin;
859566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
869566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
879566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
889566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
89736121d4SSatish Balay 
909566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
91736121d4SSatish Balay   ssmap = smap;
929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
93736121d4SSatish Balay   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94736121d4SSatish Balay   /* determine lens of each row */
95736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
96736121d4SSatish Balay     kstart  = ai[irow[i]];
97736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
98736121d4SSatish Balay     lens[i] = 0;
99736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
10026fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
101736121d4SSatish Balay     }
102736121d4SSatish Balay   }
103736121d4SSatish Balay   /* Create and fill new matrix */
104736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
105736121d4SSatish Balay     c = (Mat_SeqBAIJ *)((*B)->data);
106736121d4SSatish Balay 
107aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1089566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
10928b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1109566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
111736121d4SSatish Balay     C = *B;
1123a40ed3dSBarry Smith   } else {
1139566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1149566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1159566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1169566063dSJacob Faibussowitsch     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
117736121d4SSatish Balay   }
118736121d4SSatish Balay   c = (Mat_SeqBAIJ *)(C->data);
119736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
120736121d4SSatish Balay     row      = irow[i];
121736121d4SSatish Balay     kstart   = ai[row];
122736121d4SSatish Balay     kend     = kstart + a->ilen[row];
123736121d4SSatish Balay     mat_i    = c->i[i];
124d29f2997SMatthew Woehlke     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125d29f2997SMatthew Woehlke     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126736121d4SSatish Balay     mat_ilen = c->ilen + i;
127736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
128736121d4SSatish Balay       if ((tcol = ssmap[a->j[k]])) {
129736121d4SSatish Balay         *mat_j++ = tcol - 1;
1309566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
131549d3d68SSatish Balay         mat_a += bs2;
132736121d4SSatish Balay         (*mat_ilen)++;
133736121d4SSatish Balay       }
134736121d4SSatish Balay     }
135736121d4SSatish Balay   }
136cdc6f3adSToby Isaac   /* sort */
137d29f2997SMatthew Woehlke   if (c->j && c->a) {
138cdc6f3adSToby Isaac     MatScalar *work;
1399566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
140cdc6f3adSToby Isaac     for (i = 0; i < nrows; i++) {
141cdc6f3adSToby Isaac       PetscInt ilen;
142cdc6f3adSToby Isaac       mat_i = c->i[i];
143cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
144cdc6f3adSToby Isaac       mat_a = c->a + mat_i * bs2;
145cdc6f3adSToby Isaac       ilen  = c->ilen[i];
1469566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
147cdc6f3adSToby Isaac     }
1489566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
149cdc6f3adSToby Isaac   }
150218c64b6SSatish Balay 
151736121d4SSatish Balay   /* Free work space */
1529566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1539566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1549566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1559566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1569566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
157736121d4SSatish Balay 
1589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
159736121d4SSatish Balay   *B = C;
1603a40ed3dSBarry Smith   PetscFunctionReturn(0);
161736121d4SSatish Balay }
162736121d4SSatish Balay 
1639371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
164218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
165218c64b6SSatish Balay   IS              is1, is2;
166afebec48SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
1675d0c19d7SBarry Smith   const PetscInt *irow, *icol;
168218c64b6SSatish Balay 
1693a40ed3dSBarry Smith   PetscFunctionBegin;
1709566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1719566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1729566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1739566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
174218c64b6SSatish Balay 
175218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
176218c64b6SSatish Balay    and form the IS with compressed IS */
177f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs, a->nbs);
1789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
1799566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
180218c64b6SSatish Balay   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
1819371c9d4SSatish Balay   for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); }
1826041f1b1SToby Isaac   count = 0;
1836041f1b1SToby Isaac   for (i = 0; i < nrows; i++) {
184afebec48SHong Zhang     j = irow[i] / bs;
1856041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
186218c64b6SSatish Balay   }
1879566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
188218c64b6SSatish Balay 
1899566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
190218c64b6SSatish Balay   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
1919371c9d4SSatish Balay   for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); }
1926041f1b1SToby Isaac   count = 0;
1936041f1b1SToby Isaac   for (i = 0; i < ncols; i++) {
194afebec48SHong Zhang     j = icol[i] / bs;
1956041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
1966041f1b1SToby Isaac   }
1979566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
1989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
1999566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2009566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
201218c64b6SSatish Balay 
2029566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
2039566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2049566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
2053a40ed3dSBarry Smith   PetscFunctionReturn(0);
206218c64b6SSatish Balay }
207218c64b6SSatish Balay 
2089371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) {
20916b64355SHong Zhang   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
2105c39f6d9SHong Zhang   Mat_SubSppt *submatj = c->submatis1;
21116b64355SHong Zhang 
21216b64355SHong Zhang   PetscFunctionBegin;
2139566063dSJacob Faibussowitsch   PetscCall((*submatj->destroy)(C));
2149566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrix_Private(submatj));
21516b64355SHong Zhang   PetscFunctionReturn(0);
21616b64355SHong Zhang }
21716b64355SHong Zhang 
21889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
2199371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) {
22086e85357SHong Zhang   PetscInt     i;
22186e85357SHong Zhang   Mat          C;
22286e85357SHong Zhang   Mat_SeqBAIJ *c;
22386e85357SHong Zhang   Mat_SubSppt *submatj;
22486e85357SHong Zhang 
22586e85357SHong Zhang   PetscFunctionBegin;
22686e85357SHong Zhang   for (i = 0; i < n; i++) {
22786e85357SHong Zhang     C       = (*mat)[i];
22886e85357SHong Zhang     c       = (Mat_SeqBAIJ *)C->data;
22986e85357SHong Zhang     submatj = c->submatis1;
23086e85357SHong Zhang     if (submatj) {
2317daefbafSJunchao Zhang       if (--((PetscObject)C)->refct <= 0) {
23226cc229bSBarry Smith         PetscCall(PetscFree(C->factorprefix));
2339566063dSJacob Faibussowitsch         PetscCall((*submatj->destroy)(C));
2349566063dSJacob Faibussowitsch         PetscCall(MatDestroySubMatrix_Private(submatj));
2359566063dSJacob Faibussowitsch         PetscCall(PetscFree(C->defaultvectype));
2369566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->rmap));
2379566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->cmap));
2389566063dSJacob Faibussowitsch         PetscCall(PetscHeaderDestroy(&C));
2397daefbafSJunchao Zhang       }
24086e85357SHong Zhang     } else {
2419566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&C));
24286e85357SHong Zhang     }
24386e85357SHong Zhang   }
2447daefbafSJunchao Zhang 
2457daefbafSJunchao Zhang   /* Destroy Dummy submatrices created for reuse */
2469566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
2477daefbafSJunchao Zhang 
2489566063dSJacob Faibussowitsch   PetscCall(PetscFree(*mat));
24986e85357SHong Zhang   PetscFunctionReturn(0);
25086e85357SHong Zhang }
25186e85357SHong Zhang 
2529371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
253690b6cddSBarry Smith   PetscInt i;
254736121d4SSatish Balay 
2553a40ed3dSBarry Smith   PetscFunctionBegin;
256*48a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
257736121d4SSatish Balay 
258*48a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2593a40ed3dSBarry Smith   PetscFunctionReturn(0);
260736121d4SSatish Balay }
261218c64b6SSatish Balay 
2622d61bbb3SSatish Balay /* -------------------------------------------------------*/
2632d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2642d61bbb3SSatish Balay /* -------------------------------------------------------*/
2652d61bbb3SSatish Balay 
2669371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) {
2672d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
268d9fead3dSBarry Smith   PetscScalar       *z, sum;
269d9fead3dSBarry Smith   const PetscScalar *x;
270d9fead3dSBarry Smith   const MatScalar   *v;
2717c565772SBarry Smith   PetscInt           mbs, i, n;
2720298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
273ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2742d61bbb3SSatish Balay 
2752d61bbb3SSatish Balay   PetscFunctionBegin;
2769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2779566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &z));
2782d61bbb3SSatish Balay 
27926e093fcSHong Zhang   if (usecprow) {
28026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
28126e093fcSHong Zhang     ii   = a->compressedrow.i;
2827b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
2839566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(z, a->mbs));
28426e093fcSHong Zhang   } else {
28526e093fcSHong Zhang     mbs = a->mbs;
2862d61bbb3SSatish Balay     ii  = a->i;
28726e093fcSHong Zhang   }
2882d61bbb3SSatish Balay 
2892d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
290ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
291ee54c7eeSHong Zhang     v   = a->a + ii[0];
292ee54c7eeSHong Zhang     idx = a->j + ii[0];
293ee54c7eeSHong Zhang     ii++;
294444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
295444d8c10SJed Brown     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2962d61bbb3SSatish Balay     sum = 0.0;
2972162cab8SBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
29826e093fcSHong Zhang     if (usecprow) {
2997b2bb3b9SHong Zhang       z[ridx[i]] = sum;
30026e093fcSHong Zhang     } else {
3012d61bbb3SSatish Balay       z[i] = sum;
3022d61bbb3SSatish Balay     }
30326e093fcSHong Zhang   }
3049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &z));
3069566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
3072d61bbb3SSatish Balay   PetscFunctionReturn(0);
3082d61bbb3SSatish Balay }
3092d61bbb3SSatish Balay 
3109371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) {
3112d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
312f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, *zarray;
313d9fead3dSBarry Smith   const PetscScalar *x, *xb;
31487828ca2SBarry Smith   PetscScalar        x1, x2;
315d9fead3dSBarry Smith   const MatScalar   *v;
3167c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
317ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
3182d61bbb3SSatish Balay 
3192d61bbb3SSatish Balay   PetscFunctionBegin;
3209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3222d61bbb3SSatish Balay 
3232d61bbb3SSatish Balay   idx = a->j;
3242d61bbb3SSatish Balay   v   = a->a;
32526e093fcSHong Zhang   if (usecprow) {
32626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
32726e093fcSHong Zhang     ii   = a->compressedrow.i;
3287b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3299566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
33026e093fcSHong Zhang   } else {
33126e093fcSHong Zhang     mbs = a->mbs;
3322d61bbb3SSatish Balay     ii  = a->i;
33326e093fcSHong Zhang     z   = zarray;
33426e093fcSHong Zhang   }
3352d61bbb3SSatish Balay 
3362d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3379371c9d4SSatish Balay     n = ii[1] - ii[0];
3389371c9d4SSatish Balay     ii++;
3399371c9d4SSatish Balay     sum1 = 0.0;
3409371c9d4SSatish Balay     sum2 = 0.0;
341444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
342444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3432d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
3449371c9d4SSatish Balay       xb = x + 2 * (*idx++);
3459371c9d4SSatish Balay       x1 = xb[0];
3469371c9d4SSatish Balay       x2 = xb[1];
3472d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
3482d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
3492d61bbb3SSatish Balay       v += 4;
3502d61bbb3SSatish Balay     }
3517b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2 * ridx[i];
3529371c9d4SSatish Balay     z[0] = sum1;
3539371c9d4SSatish Balay     z[1] = sum2;
35426e093fcSHong Zhang     if (!usecprow) z += 2;
3552d61bbb3SSatish Balay   }
3569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
3592d61bbb3SSatish Balay   PetscFunctionReturn(0);
3602d61bbb3SSatish Balay }
3612d61bbb3SSatish Balay 
3629371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) {
3632d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
364f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
365d9fead3dSBarry Smith   const PetscScalar *x, *xb;
366d9fead3dSBarry Smith   const MatScalar   *v;
3677c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
368ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
36926e093fcSHong Zhang 
370b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
371fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb)
372fee21e36SBarry Smith #endif
373fee21e36SBarry Smith 
3742d61bbb3SSatish Balay   PetscFunctionBegin;
3759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3772d61bbb3SSatish Balay 
3782d61bbb3SSatish Balay   idx = a->j;
3792d61bbb3SSatish Balay   v   = a->a;
38026e093fcSHong Zhang   if (usecprow) {
38126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
38226e093fcSHong Zhang     ii   = a->compressedrow.i;
3837b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3849566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
38526e093fcSHong Zhang   } else {
38626e093fcSHong Zhang     mbs = a->mbs;
3872d61bbb3SSatish Balay     ii  = a->i;
38826e093fcSHong Zhang     z   = zarray;
38926e093fcSHong Zhang   }
3902d61bbb3SSatish Balay 
3912d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3929371c9d4SSatish Balay     n = ii[1] - ii[0];
3939371c9d4SSatish Balay     ii++;
3949371c9d4SSatish Balay     sum1 = 0.0;
3959371c9d4SSatish Balay     sum2 = 0.0;
3969371c9d4SSatish Balay     sum3 = 0.0;
397444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
398444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3992d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
40026fbe8dcSKarl Rupp       xb = x + 3 * (*idx++);
40126fbe8dcSKarl Rupp       x1 = xb[0];
40226fbe8dcSKarl Rupp       x2 = xb[1];
40326fbe8dcSKarl Rupp       x3 = xb[2];
40426fbe8dcSKarl Rupp 
4052d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
4062d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
4072d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
4082d61bbb3SSatish Balay       v += 9;
4092d61bbb3SSatish Balay     }
4107b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3 * ridx[i];
4119371c9d4SSatish Balay     z[0] = sum1;
4129371c9d4SSatish Balay     z[1] = sum2;
4139371c9d4SSatish Balay     z[2] = sum3;
41426e093fcSHong Zhang     if (!usecprow) z += 3;
4152d61bbb3SSatish Balay   }
4169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
4192d61bbb3SSatish Balay   PetscFunctionReturn(0);
4202d61bbb3SSatish Balay }
4212d61bbb3SSatish Balay 
4229371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) {
4232d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
424f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
425d9fead3dSBarry Smith   const PetscScalar *x, *xb;
426d9fead3dSBarry Smith   const MatScalar   *v;
4277c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
428ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4292d61bbb3SSatish Balay 
4302d61bbb3SSatish Balay   PetscFunctionBegin;
4319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4332d61bbb3SSatish Balay 
4342d61bbb3SSatish Balay   idx = a->j;
4352d61bbb3SSatish Balay   v   = a->a;
43626e093fcSHong Zhang   if (usecprow) {
43726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
43826e093fcSHong Zhang     ii   = a->compressedrow.i;
4397b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
4409566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
44126e093fcSHong Zhang   } else {
44226e093fcSHong Zhang     mbs = a->mbs;
4432d61bbb3SSatish Balay     ii  = a->i;
44426e093fcSHong Zhang     z   = zarray;
44526e093fcSHong Zhang   }
4462d61bbb3SSatish Balay 
4472d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
44826fbe8dcSKarl Rupp     n = ii[1] - ii[0];
44926fbe8dcSKarl Rupp     ii++;
45026fbe8dcSKarl Rupp     sum1 = 0.0;
45126fbe8dcSKarl Rupp     sum2 = 0.0;
45226fbe8dcSKarl Rupp     sum3 = 0.0;
45326fbe8dcSKarl Rupp     sum4 = 0.0;
45426fbe8dcSKarl Rupp 
455444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
456444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4572d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
4582d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
4599371c9d4SSatish Balay       x1 = xb[0];
4609371c9d4SSatish Balay       x2 = xb[1];
4619371c9d4SSatish Balay       x3 = xb[2];
4629371c9d4SSatish Balay       x4 = xb[3];
4632d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
4642d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
4652d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
4662d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
4672d61bbb3SSatish Balay       v += 16;
4682d61bbb3SSatish Balay     }
4697b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4 * ridx[i];
4709371c9d4SSatish Balay     z[0] = sum1;
4719371c9d4SSatish Balay     z[1] = sum2;
4729371c9d4SSatish Balay     z[2] = sum3;
4739371c9d4SSatish Balay     z[3] = sum4;
47426e093fcSHong Zhang     if (!usecprow) z += 4;
4752d61bbb3SSatish Balay   }
4769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
4792d61bbb3SSatish Balay   PetscFunctionReturn(0);
4802d61bbb3SSatish Balay }
4812d61bbb3SSatish Balay 
4829371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) {
4832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
484f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
485d9fead3dSBarry Smith   const PetscScalar *xb, *x;
486d9fead3dSBarry Smith   const MatScalar   *v;
4870298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
4887c565772SBarry Smith   PetscInt           mbs, i, j, n;
489ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4902d61bbb3SSatish Balay 
491433994e6SBarry Smith   PetscFunctionBegin;
4929566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4942d61bbb3SSatish Balay 
4952d61bbb3SSatish Balay   idx = a->j;
4962d61bbb3SSatish Balay   v   = a->a;
49726e093fcSHong Zhang   if (usecprow) {
49826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
49926e093fcSHong Zhang     ii   = a->compressedrow.i;
5007b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5019566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
50226e093fcSHong Zhang   } else {
50326e093fcSHong Zhang     mbs = a->mbs;
5042d61bbb3SSatish Balay     ii  = a->i;
50526e093fcSHong Zhang     z   = zarray;
50626e093fcSHong Zhang   }
5072d61bbb3SSatish Balay 
5082d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
5099371c9d4SSatish Balay     n = ii[1] - ii[0];
5109371c9d4SSatish Balay     ii++;
5119371c9d4SSatish Balay     sum1 = 0.0;
5129371c9d4SSatish Balay     sum2 = 0.0;
5139371c9d4SSatish Balay     sum3 = 0.0;
5149371c9d4SSatish Balay     sum4 = 0.0;
5159371c9d4SSatish Balay     sum5 = 0.0;
516444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
517444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
5182d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
5192d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
5209371c9d4SSatish Balay       x1 = xb[0];
5219371c9d4SSatish Balay       x2 = xb[1];
5229371c9d4SSatish Balay       x3 = xb[2];
5239371c9d4SSatish Balay       x4 = xb[3];
5249371c9d4SSatish Balay       x5 = xb[4];
5252d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
5262d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
5272d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
5282d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
5292d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
5302d61bbb3SSatish Balay       v += 25;
5312d61bbb3SSatish Balay     }
5327b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5 * ridx[i];
5339371c9d4SSatish Balay     z[0] = sum1;
5349371c9d4SSatish Balay     z[1] = sum2;
5359371c9d4SSatish Balay     z[2] = sum3;
5369371c9d4SSatish Balay     z[3] = sum4;
5379371c9d4SSatish Balay     z[4] = sum5;
53826e093fcSHong Zhang     if (!usecprow) z += 5;
5392d61bbb3SSatish Balay   }
5409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
5429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
5432d61bbb3SSatish Balay   PetscFunctionReturn(0);
5442d61bbb3SSatish Balay }
5452d61bbb3SSatish Balay 
5469371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) {
54715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
548f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
549d9fead3dSBarry Smith   const PetscScalar *x, *xb;
55026e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
551d9fead3dSBarry Smith   const MatScalar   *v;
5527c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
553ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
55415091d37SBarry Smith 
555433994e6SBarry Smith   PetscFunctionBegin;
5569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
55815091d37SBarry Smith 
55915091d37SBarry Smith   idx = a->j;
56015091d37SBarry Smith   v   = a->a;
56126e093fcSHong Zhang   if (usecprow) {
56226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
56326e093fcSHong Zhang     ii   = a->compressedrow.i;
5647b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5659566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
56626e093fcSHong Zhang   } else {
56726e093fcSHong Zhang     mbs = a->mbs;
56815091d37SBarry Smith     ii  = a->i;
56926e093fcSHong Zhang     z   = zarray;
57026e093fcSHong Zhang   }
57115091d37SBarry Smith 
57215091d37SBarry Smith   for (i = 0; i < mbs; i++) {
57326fbe8dcSKarl Rupp     n = ii[1] - ii[0];
57426fbe8dcSKarl Rupp     ii++;
57526fbe8dcSKarl Rupp     sum1 = 0.0;
57626fbe8dcSKarl Rupp     sum2 = 0.0;
57726fbe8dcSKarl Rupp     sum3 = 0.0;
57826fbe8dcSKarl Rupp     sum4 = 0.0;
57926fbe8dcSKarl Rupp     sum5 = 0.0;
58026fbe8dcSKarl Rupp     sum6 = 0.0;
58126fbe8dcSKarl Rupp 
582444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
583444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
58415091d37SBarry Smith     for (j = 0; j < n; j++) {
58515091d37SBarry Smith       xb = x + 6 * (*idx++);
5869371c9d4SSatish Balay       x1 = xb[0];
5879371c9d4SSatish Balay       x2 = xb[1];
5889371c9d4SSatish Balay       x3 = xb[2];
5899371c9d4SSatish Balay       x4 = xb[3];
5909371c9d4SSatish Balay       x5 = xb[4];
5919371c9d4SSatish Balay       x6 = xb[5];
59215091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
59315091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
59415091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
59515091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
59615091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
59715091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
59815091d37SBarry Smith       v += 36;
59915091d37SBarry Smith     }
6007b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6 * ridx[i];
6019371c9d4SSatish Balay     z[0] = sum1;
6029371c9d4SSatish Balay     z[1] = sum2;
6039371c9d4SSatish Balay     z[2] = sum3;
6049371c9d4SSatish Balay     z[3] = sum4;
6059371c9d4SSatish Balay     z[4] = sum5;
6069371c9d4SSatish Balay     z[5] = sum6;
60726e093fcSHong Zhang     if (!usecprow) z += 6;
60815091d37SBarry Smith   }
60915091d37SBarry Smith 
6109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
61315091d37SBarry Smith   PetscFunctionReturn(0);
61415091d37SBarry Smith }
6158ab949d8SShri Abhyankar 
6169371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) {
6172d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
618f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
619d9fead3dSBarry Smith   const PetscScalar *x, *xb;
62026e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
621d9fead3dSBarry Smith   const MatScalar   *v;
6227c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
623ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
6242d61bbb3SSatish Balay 
625433994e6SBarry Smith   PetscFunctionBegin;
6269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
6282d61bbb3SSatish Balay 
6292d61bbb3SSatish Balay   idx = a->j;
6302d61bbb3SSatish Balay   v   = a->a;
63126e093fcSHong Zhang   if (usecprow) {
63226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
63326e093fcSHong Zhang     ii   = a->compressedrow.i;
6347b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
6359566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
63626e093fcSHong Zhang   } else {
63726e093fcSHong Zhang     mbs = a->mbs;
6382d61bbb3SSatish Balay     ii  = a->i;
63926e093fcSHong Zhang     z   = zarray;
64026e093fcSHong Zhang   }
6412d61bbb3SSatish Balay 
6422d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
64326fbe8dcSKarl Rupp     n = ii[1] - ii[0];
64426fbe8dcSKarl Rupp     ii++;
64526fbe8dcSKarl Rupp     sum1 = 0.0;
64626fbe8dcSKarl Rupp     sum2 = 0.0;
64726fbe8dcSKarl Rupp     sum3 = 0.0;
64826fbe8dcSKarl Rupp     sum4 = 0.0;
64926fbe8dcSKarl Rupp     sum5 = 0.0;
65026fbe8dcSKarl Rupp     sum6 = 0.0;
65126fbe8dcSKarl Rupp     sum7 = 0.0;
65226fbe8dcSKarl Rupp 
653444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
654444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6552d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
6562d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
6579371c9d4SSatish Balay       x1 = xb[0];
6589371c9d4SSatish Balay       x2 = xb[1];
6599371c9d4SSatish Balay       x3 = xb[2];
6609371c9d4SSatish Balay       x4 = xb[3];
6619371c9d4SSatish Balay       x5 = xb[4];
6629371c9d4SSatish Balay       x6 = xb[5];
6639371c9d4SSatish Balay       x7 = xb[6];
6642d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
6652d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
6662d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
6672d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
6682d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
6692d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
6702d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
6712d61bbb3SSatish Balay       v += 49;
6722d61bbb3SSatish Balay     }
6737b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7 * ridx[i];
6749371c9d4SSatish Balay     z[0] = sum1;
6759371c9d4SSatish Balay     z[1] = sum2;
6769371c9d4SSatish Balay     z[2] = sum3;
6779371c9d4SSatish Balay     z[3] = sum4;
6789371c9d4SSatish Balay     z[4] = sum5;
6799371c9d4SSatish Balay     z[5] = sum6;
6809371c9d4SSatish Balay     z[6] = sum7;
68126e093fcSHong Zhang     if (!usecprow) z += 7;
6822d61bbb3SSatish Balay   }
6832d61bbb3SSatish Balay 
6849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
6872d61bbb3SSatish Balay   PetscFunctionReturn(0);
6882d61bbb3SSatish Balay }
6892d61bbb3SSatish Balay 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_9_AVX2 - Forms zz = A * xx for a SeqBAIJ matrix with 9x9 blocks
  using AVX2/FMA intrinsics.

  For each block row the referenced 9-entry pieces of xx are first gathered
  into the scratch buffer a->mult_work (allocated here on first use), then
  every 9x9 block (81 consecutive values, one 9-entry column after another) is
  applied one column at a time.

  The nine results of a block row live in three vector accumulators:
    z0 - rows 0..3, z1 - rows 4..7, z2 lane 0 - row 8.
  Lanes 1..3 of z2 pick up values from the following column and are never
  stored; only lane 0 is written back through the masked store.

  With compressed-row storage only the block rows listed in
  compressedrow.rindex are visited; the output array is zeroed up front.
*/
PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;

  __m256d a0, a1, a2, a3, a4, a5;
  __m256d w0, w1, w2, w3;
  __m256d z0, z1, z2;
  /* Selects lane 0 only. The masked load/store test just the top bit of each 64-bit
     lane, so -1 is used instead of (1LL << 63), which shifts into the sign bit of a
     signed type and is undefined behaviour in C. */
  __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  /* scratch space for the gathered pieces of x; created once and kept on the matrix */
  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;

    /* gather the bs entries of x belonging to each block column of this row */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    z0 = _mm256_setzero_pd();
    z1 = _mm256_setzero_pd();
    z2 = _mm256_setzero_pd();

    for (j = 0; j < n; j++) {
      /* first column of a */
      w0 = _mm256_set1_pd(work[j * 9]);
      a0 = _mm256_loadu_pd(&v[j * 81]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
      z2 = _mm256_fmadd_pd(a2, w0, z2);

      /* second column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 1]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* third column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 2]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
      z0 = _mm256_fmadd_pd(a3, w2, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
      z1 = _mm256_fmadd_pd(a4, w2, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
      z2 = _mm256_fmadd_pd(a5, w2, z2);

      /* fourth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 3]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
      z0 = _mm256_fmadd_pd(a0, w3, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
      z1 = _mm256_fmadd_pd(a1, w3, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
      z2 = _mm256_fmadd_pd(a2, w3, z2);

      /* fifth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 4]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
      z0 = _mm256_fmadd_pd(a3, w0, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
      z1 = _mm256_fmadd_pd(a4, w0, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
      z2 = _mm256_fmadd_pd(a5, w0, z2);

      /* sixth column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 5]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* seventh column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 6]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
      z0 = _mm256_fmadd_pd(a0, w2, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
      z1 = _mm256_fmadd_pd(a1, w2, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
      z2 = _mm256_fmadd_pd(a2, w2, z2);

      /* eighth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 7]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
      z0 = _mm256_fmadd_pd(a3, w3, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
      z1 = _mm256_fmadd_pd(a4, w3, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
      z2 = _mm256_fmadd_pd(a5, w3, z2);

      /* ninth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 8]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      /* masked load: a full 4-wide load at offset 80 would read past the end of this block */
      a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
      z2 = _mm256_fmadd_pd(a2, w0, z2);
    }

    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    _mm256_maskstore_pd(&z[8], mask1, z2); /* only row 8; lanes 1..3 of z2 are discarded */

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}
#endif
84096e086a2SDaniel Kokron 
8419371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) {
842ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
843f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
844ebada01fSBarry Smith   const PetscScalar *x, *xb;
845ebada01fSBarry Smith   PetscScalar       *zarray, xv;
846ebada01fSBarry Smith   const MatScalar   *v;
847ebada01fSBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
848ebada01fSBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
849ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
850ebada01fSBarry Smith 
851ebada01fSBarry Smith   PetscFunctionBegin;
8529566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
854ebada01fSBarry Smith 
855ebada01fSBarry Smith   v = a->a;
856ebada01fSBarry Smith   if (usecprow) {
857ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
858ebada01fSBarry Smith     ii   = a->compressedrow.i;
859ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
8609566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
861ebada01fSBarry Smith   } else {
862ebada01fSBarry Smith     mbs = a->mbs;
863ebada01fSBarry Smith     ii  = a->i;
864ebada01fSBarry Smith     z   = zarray;
865ebada01fSBarry Smith   }
866ebada01fSBarry Smith 
867ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
868ebada01fSBarry Smith     n     = ii[i + 1] - ii[i];
869ebada01fSBarry Smith     idx   = ij + ii[i];
8709371c9d4SSatish Balay     sum1  = 0.0;
8719371c9d4SSatish Balay     sum2  = 0.0;
8729371c9d4SSatish Balay     sum3  = 0.0;
8739371c9d4SSatish Balay     sum4  = 0.0;
8749371c9d4SSatish Balay     sum5  = 0.0;
8759371c9d4SSatish Balay     sum6  = 0.0;
8769371c9d4SSatish Balay     sum7  = 0.0;
8779371c9d4SSatish Balay     sum8  = 0.0;
8789371c9d4SSatish Balay     sum9  = 0.0;
8799371c9d4SSatish Balay     sum10 = 0.0;
8809371c9d4SSatish Balay     sum11 = 0.0;
881ebada01fSBarry Smith 
882ebada01fSBarry Smith     for (j = 0; j < n; j++) {
883ebada01fSBarry Smith       xb = x + 11 * (idx[j]);
884ebada01fSBarry Smith 
885ebada01fSBarry Smith       for (k = 0; k < 11; k++) {
886ebada01fSBarry Smith         xv = xb[k];
887ebada01fSBarry Smith         sum1 += v[0] * xv;
888ebada01fSBarry Smith         sum2 += v[1] * xv;
889ebada01fSBarry Smith         sum3 += v[2] * xv;
890ebada01fSBarry Smith         sum4 += v[3] * xv;
891ebada01fSBarry Smith         sum5 += v[4] * xv;
892ebada01fSBarry Smith         sum6 += v[5] * xv;
893ebada01fSBarry Smith         sum7 += v[6] * xv;
894ebada01fSBarry Smith         sum8 += v[7] * xv;
895ebada01fSBarry Smith         sum9 += v[8] * xv;
896ebada01fSBarry Smith         sum10 += v[9] * xv;
897ebada01fSBarry Smith         sum11 += v[10] * xv;
898ebada01fSBarry Smith         v += 11;
899ebada01fSBarry Smith       }
900ebada01fSBarry Smith     }
901ebada01fSBarry Smith     if (usecprow) z = zarray + 11 * ridx[i];
9029371c9d4SSatish Balay     z[0]  = sum1;
9039371c9d4SSatish Balay     z[1]  = sum2;
9049371c9d4SSatish Balay     z[2]  = sum3;
9059371c9d4SSatish Balay     z[3]  = sum4;
9069371c9d4SSatish Balay     z[4]  = sum5;
9079371c9d4SSatish Balay     z[5]  = sum6;
9089371c9d4SSatish Balay     z[6]  = sum7;
9099371c9d4SSatish Balay     z[7]  = sum8;
9109371c9d4SSatish Balay     z[8]  = sum9;
9119371c9d4SSatish Balay     z[9]  = sum10;
9129371c9d4SSatish Balay     z[10] = sum11;
913ebada01fSBarry Smith 
914ebada01fSBarry Smith     if (!usecprow) z += 11;
915ebada01fSBarry Smith   }
916ebada01fSBarry Smith 
9179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
9199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
920ebada01fSBarry Smith   PetscFunctionReturn(0);
921ebada01fSBarry Smith }
922ebada01fSBarry Smith 
9236679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
9249371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) {
9256679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
9266679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
9276679dcc1SBarry Smith   const PetscScalar *x, *xb;
9286679dcc1SBarry Smith   PetscScalar       *zarray, xv;
9296679dcc1SBarry Smith   const MatScalar   *v;
9306679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
9316679dcc1SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
9326679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
9336679dcc1SBarry Smith 
9346679dcc1SBarry Smith   PetscFunctionBegin;
9359566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
9376679dcc1SBarry Smith 
9386679dcc1SBarry Smith   v = a->a;
9396679dcc1SBarry Smith   if (usecprow) {
9406679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
9416679dcc1SBarry Smith     ii   = a->compressedrow.i;
9426679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
9439566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
9446679dcc1SBarry Smith   } else {
9456679dcc1SBarry Smith     mbs = a->mbs;
9466679dcc1SBarry Smith     ii  = a->i;
9476679dcc1SBarry Smith     z   = zarray;
9486679dcc1SBarry Smith   }
9496679dcc1SBarry Smith 
9506679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
9516679dcc1SBarry Smith     n     = ii[i + 1] - ii[i];
9526679dcc1SBarry Smith     idx   = ij + ii[i];
9539371c9d4SSatish Balay     sum1  = 0.0;
9549371c9d4SSatish Balay     sum2  = 0.0;
9559371c9d4SSatish Balay     sum3  = 0.0;
9569371c9d4SSatish Balay     sum4  = 0.0;
9579371c9d4SSatish Balay     sum5  = 0.0;
9589371c9d4SSatish Balay     sum6  = 0.0;
9599371c9d4SSatish Balay     sum7  = 0.0;
9609371c9d4SSatish Balay     sum8  = 0.0;
9619371c9d4SSatish Balay     sum9  = 0.0;
9629371c9d4SSatish Balay     sum10 = 0.0;
9639371c9d4SSatish Balay     sum11 = 0.0;
9649371c9d4SSatish Balay     sum12 = 0.0;
9656679dcc1SBarry Smith 
9666679dcc1SBarry Smith     for (j = 0; j < n; j++) {
9676679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
9686679dcc1SBarry Smith 
9696679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
9706679dcc1SBarry Smith         xv = xb[k];
9716679dcc1SBarry Smith         sum1 += v[0] * xv;
9726679dcc1SBarry Smith         sum2 += v[1] * xv;
9736679dcc1SBarry Smith         sum3 += v[2] * xv;
9746679dcc1SBarry Smith         sum4 += v[3] * xv;
9756679dcc1SBarry Smith         sum5 += v[4] * xv;
9766679dcc1SBarry Smith         sum6 += v[5] * xv;
9776679dcc1SBarry Smith         sum7 += v[6] * xv;
9786679dcc1SBarry Smith         sum8 += v[7] * xv;
9796679dcc1SBarry Smith         sum9 += v[8] * xv;
9806679dcc1SBarry Smith         sum10 += v[9] * xv;
9816679dcc1SBarry Smith         sum11 += v[10] * xv;
9826679dcc1SBarry Smith         sum12 += v[11] * xv;
9836679dcc1SBarry Smith         v += 12;
9846679dcc1SBarry Smith       }
9856679dcc1SBarry Smith     }
9866679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
9879371c9d4SSatish Balay     z[0]  = sum1;
9889371c9d4SSatish Balay     z[1]  = sum2;
9899371c9d4SSatish Balay     z[2]  = sum3;
9909371c9d4SSatish Balay     z[3]  = sum4;
9919371c9d4SSatish Balay     z[4]  = sum5;
9929371c9d4SSatish Balay     z[5]  = sum6;
9939371c9d4SSatish Balay     z[6]  = sum7;
9949371c9d4SSatish Balay     z[7]  = sum8;
9959371c9d4SSatish Balay     z[8]  = sum9;
9969371c9d4SSatish Balay     z[9]  = sum10;
9979371c9d4SSatish Balay     z[10] = sum11;
9989371c9d4SSatish Balay     z[11] = sum12;
9996679dcc1SBarry Smith     if (!usecprow) z += 12;
10006679dcc1SBarry Smith   }
10019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
10039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10046679dcc1SBarry Smith   PetscFunctionReturn(0);
10056679dcc1SBarry Smith }
10066679dcc1SBarry Smith 
10079371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) {
10086679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
10096679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
10106679dcc1SBarry Smith   const PetscScalar *x, *xb;
10116679dcc1SBarry Smith   PetscScalar       *zarray, *yarray, xv;
10126679dcc1SBarry Smith   const MatScalar   *v;
10136679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
10146679dcc1SBarry Smith   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
10156679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
10166679dcc1SBarry Smith 
10176679dcc1SBarry Smith   PetscFunctionBegin;
10189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
10206679dcc1SBarry Smith 
10216679dcc1SBarry Smith   v = a->a;
10226679dcc1SBarry Smith   if (usecprow) {
1023*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
10246679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
10256679dcc1SBarry Smith     ii   = a->compressedrow.i;
10266679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
10276679dcc1SBarry Smith   } else {
10286679dcc1SBarry Smith     ii = a->i;
10296679dcc1SBarry Smith     y  = yarray;
10306679dcc1SBarry Smith     z  = zarray;
10316679dcc1SBarry Smith   }
10326679dcc1SBarry Smith 
10336679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
10346679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
10356679dcc1SBarry Smith     idx = ij + ii[i];
10366679dcc1SBarry Smith 
10376679dcc1SBarry Smith     if (usecprow) {
10386679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
10396679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
10406679dcc1SBarry Smith     }
10419371c9d4SSatish Balay     sum1  = y[0];
10429371c9d4SSatish Balay     sum2  = y[1];
10439371c9d4SSatish Balay     sum3  = y[2];
10449371c9d4SSatish Balay     sum4  = y[3];
10459371c9d4SSatish Balay     sum5  = y[4];
10469371c9d4SSatish Balay     sum6  = y[5];
10479371c9d4SSatish Balay     sum7  = y[6];
10489371c9d4SSatish Balay     sum8  = y[7];
10499371c9d4SSatish Balay     sum9  = y[8];
10509371c9d4SSatish Balay     sum10 = y[9];
10519371c9d4SSatish Balay     sum11 = y[10];
10529371c9d4SSatish Balay     sum12 = y[11];
10536679dcc1SBarry Smith 
10546679dcc1SBarry Smith     for (j = 0; j < n; j++) {
10556679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
10566679dcc1SBarry Smith 
10576679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
10586679dcc1SBarry Smith         xv = xb[k];
10596679dcc1SBarry Smith         sum1 += v[0] * xv;
10606679dcc1SBarry Smith         sum2 += v[1] * xv;
10616679dcc1SBarry Smith         sum3 += v[2] * xv;
10626679dcc1SBarry Smith         sum4 += v[3] * xv;
10636679dcc1SBarry Smith         sum5 += v[4] * xv;
10646679dcc1SBarry Smith         sum6 += v[5] * xv;
10656679dcc1SBarry Smith         sum7 += v[6] * xv;
10666679dcc1SBarry Smith         sum8 += v[7] * xv;
10676679dcc1SBarry Smith         sum9 += v[8] * xv;
10686679dcc1SBarry Smith         sum10 += v[9] * xv;
10696679dcc1SBarry Smith         sum11 += v[10] * xv;
10706679dcc1SBarry Smith         sum12 += v[11] * xv;
10716679dcc1SBarry Smith         v += 12;
10726679dcc1SBarry Smith       }
10736679dcc1SBarry Smith     }
10746679dcc1SBarry Smith 
10759371c9d4SSatish Balay     z[0]  = sum1;
10769371c9d4SSatish Balay     z[1]  = sum2;
10779371c9d4SSatish Balay     z[2]  = sum3;
10789371c9d4SSatish Balay     z[3]  = sum4;
10799371c9d4SSatish Balay     z[4]  = sum5;
10809371c9d4SSatish Balay     z[5]  = sum6;
10819371c9d4SSatish Balay     z[6]  = sum7;
10829371c9d4SSatish Balay     z[7]  = sum8;
10839371c9d4SSatish Balay     z[8]  = sum9;
10849371c9d4SSatish Balay     z[9]  = sum10;
10859371c9d4SSatish Balay     z[10] = sum11;
10869371c9d4SSatish Balay     z[11] = sum12;
10876679dcc1SBarry Smith     if (!usecprow) {
10886679dcc1SBarry Smith       y += 12;
10896679dcc1SBarry Smith       z += 12;
10906679dcc1SBarry Smith     }
10916679dcc1SBarry Smith   }
10929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
10949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10956679dcc1SBarry Smith   PetscFunctionReturn(0);
10966679dcc1SBarry Smith }
10976679dcc1SBarry Smith 
10986679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
10999371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) {
11006679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
11016679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
11026679dcc1SBarry Smith   const PetscScalar *x, *xb;
11036679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray;
11046679dcc1SBarry Smith   const MatScalar   *v;
11056679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
11066679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
11076679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
11086679dcc1SBarry Smith 
11096679dcc1SBarry Smith   PetscFunctionBegin;
11109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
11126679dcc1SBarry Smith 
11136679dcc1SBarry Smith   v = a->a;
11146679dcc1SBarry Smith   if (usecprow) {
11156679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
11166679dcc1SBarry Smith     ii   = a->compressedrow.i;
11176679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
11189566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
11196679dcc1SBarry Smith   } else {
11206679dcc1SBarry Smith     mbs = a->mbs;
11216679dcc1SBarry Smith     ii  = a->i;
11226679dcc1SBarry Smith     z   = zarray;
11236679dcc1SBarry Smith   }
11246679dcc1SBarry Smith 
11256679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
11266679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
11276679dcc1SBarry Smith     idx = ij + ii[i];
11286679dcc1SBarry Smith 
11296679dcc1SBarry Smith     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
11306679dcc1SBarry Smith     for (j = 0; j < n; j++) {
11316679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
11329371c9d4SSatish Balay       x1 = xb[0];
11339371c9d4SSatish Balay       x2 = xb[1];
11349371c9d4SSatish Balay       x3 = xb[2];
11359371c9d4SSatish Balay       x4 = xb[3];
11366679dcc1SBarry Smith 
11376679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11386679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11396679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11406679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11416679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11426679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11436679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11446679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11456679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11466679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11476679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11486679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11496679dcc1SBarry Smith       v += 48;
11506679dcc1SBarry Smith 
11519371c9d4SSatish Balay       x1 = xb[4];
11529371c9d4SSatish Balay       x2 = xb[5];
11539371c9d4SSatish Balay       x3 = xb[6];
11549371c9d4SSatish Balay       x4 = xb[7];
11556679dcc1SBarry Smith 
11566679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11576679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11586679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11596679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11606679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11616679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11626679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11636679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11646679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11656679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11666679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11676679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11686679dcc1SBarry Smith       v += 48;
11696679dcc1SBarry Smith 
11709371c9d4SSatish Balay       x1 = xb[8];
11719371c9d4SSatish Balay       x2 = xb[9];
11729371c9d4SSatish Balay       x3 = xb[10];
11739371c9d4SSatish Balay       x4 = xb[11];
11746679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11756679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11766679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11776679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11786679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11796679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11806679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11816679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11826679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11836679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11846679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11856679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11866679dcc1SBarry Smith       v += 48;
11876679dcc1SBarry Smith     }
11886679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
11899371c9d4SSatish Balay     z[0]  = sum1;
11909371c9d4SSatish Balay     z[1]  = sum2;
11919371c9d4SSatish Balay     z[2]  = sum3;
11929371c9d4SSatish Balay     z[3]  = sum4;
11939371c9d4SSatish Balay     z[4]  = sum5;
11949371c9d4SSatish Balay     z[5]  = sum6;
11959371c9d4SSatish Balay     z[6]  = sum7;
11969371c9d4SSatish Balay     z[7]  = sum8;
11979371c9d4SSatish Balay     z[8]  = sum9;
11989371c9d4SSatish Balay     z[9]  = sum10;
11999371c9d4SSatish Balay     z[10] = sum11;
12009371c9d4SSatish Balay     z[11] = sum12;
12016679dcc1SBarry Smith     if (!usecprow) z += 12;
12026679dcc1SBarry Smith   }
12039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
12059566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
12066679dcc1SBarry Smith   PetscFunctionReturn(0);
12076679dcc1SBarry Smith }
12086679dcc1SBarry Smith 
12096679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
12109371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) {
12116679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
12126679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
12136679dcc1SBarry Smith   const PetscScalar *x, *xb;
12146679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
12156679dcc1SBarry Smith   const MatScalar   *v;
12166679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
12176679dcc1SBarry Smith   PetscInt           mbs      = a->mbs, i, j, n;
12186679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
12196679dcc1SBarry Smith 
12206679dcc1SBarry Smith   PetscFunctionBegin;
12219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
12229566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
12236679dcc1SBarry Smith 
12246679dcc1SBarry Smith   v = a->a;
12256679dcc1SBarry Smith   if (usecprow) {
1226*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
12276679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
12286679dcc1SBarry Smith     ii   = a->compressedrow.i;
12296679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
12306679dcc1SBarry Smith   } else {
12316679dcc1SBarry Smith     ii = a->i;
12326679dcc1SBarry Smith     y  = yarray;
12336679dcc1SBarry Smith     z  = zarray;
12346679dcc1SBarry Smith   }
12356679dcc1SBarry Smith 
12366679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
12376679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
12386679dcc1SBarry Smith     idx = ij + ii[i];
12396679dcc1SBarry Smith 
12406679dcc1SBarry Smith     if (usecprow) {
12416679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
12426679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
12436679dcc1SBarry Smith     }
12449371c9d4SSatish Balay     sum1  = y[0];
12459371c9d4SSatish Balay     sum2  = y[1];
12469371c9d4SSatish Balay     sum3  = y[2];
12479371c9d4SSatish Balay     sum4  = y[3];
12489371c9d4SSatish Balay     sum5  = y[4];
12499371c9d4SSatish Balay     sum6  = y[5];
12509371c9d4SSatish Balay     sum7  = y[6];
12519371c9d4SSatish Balay     sum8  = y[7];
12529371c9d4SSatish Balay     sum9  = y[8];
12539371c9d4SSatish Balay     sum10 = y[9];
12549371c9d4SSatish Balay     sum11 = y[10];
12559371c9d4SSatish Balay     sum12 = y[11];
12566679dcc1SBarry Smith 
12576679dcc1SBarry Smith     for (j = 0; j < n; j++) {
12586679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
12599371c9d4SSatish Balay       x1 = xb[0];
12609371c9d4SSatish Balay       x2 = xb[1];
12619371c9d4SSatish Balay       x3 = xb[2];
12629371c9d4SSatish Balay       x4 = xb[3];
12636679dcc1SBarry Smith 
12646679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12656679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12666679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12676679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12686679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12696679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12706679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12716679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12726679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12736679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12746679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12756679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12766679dcc1SBarry Smith       v += 48;
12776679dcc1SBarry Smith 
12789371c9d4SSatish Balay       x1 = xb[4];
12799371c9d4SSatish Balay       x2 = xb[5];
12809371c9d4SSatish Balay       x3 = xb[6];
12819371c9d4SSatish Balay       x4 = xb[7];
12826679dcc1SBarry Smith 
12836679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12846679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12856679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12866679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12876679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12886679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12896679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12906679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12916679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12926679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12936679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12946679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12956679dcc1SBarry Smith       v += 48;
12966679dcc1SBarry Smith 
12979371c9d4SSatish Balay       x1 = xb[8];
12989371c9d4SSatish Balay       x2 = xb[9];
12999371c9d4SSatish Balay       x3 = xb[10];
13009371c9d4SSatish Balay       x4 = xb[11];
13016679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13026679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13036679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13046679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13056679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13066679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13076679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13086679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13096679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13106679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13116679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13126679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13136679dcc1SBarry Smith       v += 48;
13146679dcc1SBarry Smith     }
13159371c9d4SSatish Balay     z[0]  = sum1;
13169371c9d4SSatish Balay     z[1]  = sum2;
13179371c9d4SSatish Balay     z[2]  = sum3;
13189371c9d4SSatish Balay     z[3]  = sum4;
13199371c9d4SSatish Balay     z[4]  = sum5;
13209371c9d4SSatish Balay     z[5]  = sum6;
13219371c9d4SSatish Balay     z[6]  = sum7;
13229371c9d4SSatish Balay     z[7]  = sum8;
13239371c9d4SSatish Balay     z[8]  = sum9;
13249371c9d4SSatish Balay     z[9]  = sum10;
13259371c9d4SSatish Balay     z[10] = sum11;
13269371c9d4SSatish Balay     z[11] = sum12;
13276679dcc1SBarry Smith     if (!usecprow) {
13286679dcc1SBarry Smith       y += 12;
13296679dcc1SBarry Smith       z += 12;
13306679dcc1SBarry Smith     }
13316679dcc1SBarry Smith   }
13329566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
13339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
13349566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
13356679dcc1SBarry Smith   PetscFunctionReturn(0);
13366679dcc1SBarry Smith }
13376679dcc1SBarry Smith 
13386679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
13399371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) {
13406679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
13416679dcc1SBarry Smith   PetscScalar       *z = NULL, *zarray;
13426679dcc1SBarry Smith   const PetscScalar *x, *work;
13436679dcc1SBarry Smith   const MatScalar   *v = a->a;
13446679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
13456679dcc1SBarry Smith   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
13466679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
13476679dcc1SBarry Smith   const PetscInt     bs = 12, bs2 = 144;
13486679dcc1SBarry Smith 
13496679dcc1SBarry Smith   __m256d a0, a1, a2, a3, a4, a5;
13506679dcc1SBarry Smith   __m256d w0, w1, w2, w3;
13516679dcc1SBarry Smith   __m256d z0, z1, z2;
13526679dcc1SBarry Smith 
13536679dcc1SBarry Smith   PetscFunctionBegin;
13549566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
13559566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
13566679dcc1SBarry Smith 
13576679dcc1SBarry Smith   if (usecprow) {
13586679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
13596679dcc1SBarry Smith     ii   = a->compressedrow.i;
13606679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
13619566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
13626679dcc1SBarry Smith   } else {
13636679dcc1SBarry Smith     mbs = a->mbs;
13646679dcc1SBarry Smith     ii  = a->i;
13656679dcc1SBarry Smith     z   = zarray;
13666679dcc1SBarry Smith   }
13676679dcc1SBarry Smith 
13686679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
13699371c9d4SSatish Balay     z0 = _mm256_setzero_pd();
13709371c9d4SSatish Balay     z1 = _mm256_setzero_pd();
13719371c9d4SSatish Balay     z2 = _mm256_setzero_pd();
13726679dcc1SBarry Smith 
13739371c9d4SSatish Balay     n = ii[1] - ii[0];
13749371c9d4SSatish Balay     ii++;
13756679dcc1SBarry Smith     for (j = 0; j < n; j++) {
13766679dcc1SBarry Smith       work = x + bs * (*idx++);
13776679dcc1SBarry Smith 
13786679dcc1SBarry Smith       /* first column of a */
13796679dcc1SBarry Smith       w0 = _mm256_set1_pd(work[0]);
13809371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 0);
13819371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
13829371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 4);
13839371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
13849371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 8);
13859371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
13866679dcc1SBarry Smith 
13876679dcc1SBarry Smith       /* second column of a */
13886679dcc1SBarry Smith       w1 = _mm256_set1_pd(work[1]);
13899371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 12);
13909371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w1, z0);
13919371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 16);
13929371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w1, z1);
13939371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 20);
13949371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w1, z2);
13956679dcc1SBarry Smith 
13966679dcc1SBarry Smith       /* third column of a */
13976679dcc1SBarry Smith       w2 = _mm256_set1_pd(work[2]);
13989371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 24);
13999371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
14009371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 28);
14019371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
14029371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 32);
14039371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
14046679dcc1SBarry Smith 
14056679dcc1SBarry Smith       /* fourth column of a */
14066679dcc1SBarry Smith       w3 = _mm256_set1_pd(work[3]);
14079371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 36);
14089371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
14099371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 40);
14109371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
14119371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 44);
14129371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
14136679dcc1SBarry Smith 
14146679dcc1SBarry Smith       /* fifth column of a */
14156679dcc1SBarry Smith       w0 = _mm256_set1_pd(work[4]);
14169371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 48);
14179371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
14189371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 52);
14199371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
14209371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 56);
14219371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
14226679dcc1SBarry Smith 
14236679dcc1SBarry Smith       /* sixth column of a */
14246679dcc1SBarry Smith       w1 = _mm256_set1_pd(work[5]);
14259371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 60);
14269371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w1, z0);
14279371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 64);
14289371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w1, z1);
14299371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 68);
14309371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w1, z2);
14316679dcc1SBarry Smith 
14326679dcc1SBarry Smith       /* seventh column of a */
14336679dcc1SBarry Smith       w2 = _mm256_set1_pd(work[6]);
14349371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 72);
14359371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
14369371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 76);
14379371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
14389371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 80);
14399371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
14406679dcc1SBarry Smith 
14416aad120cSJose E. Roman       /* eighth column of a */
14426679dcc1SBarry Smith       w3 = _mm256_set1_pd(work[7]);
14439371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 84);
14449371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
14459371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 88);
14469371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
14479371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 92);
14489371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
14496679dcc1SBarry Smith 
14506679dcc1SBarry Smith       /* ninth column of a */
14516679dcc1SBarry Smith       w0 = _mm256_set1_pd(work[8]);
14529371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 96);
14539371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
14549371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 100);
14559371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
14569371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 104);
14579371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
14586679dcc1SBarry Smith 
14596679dcc1SBarry Smith       /* tenth column of a */
14606679dcc1SBarry Smith       w1 = _mm256_set1_pd(work[9]);
14619371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 108);
14629371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w1, z0);
14639371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 112);
14649371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w1, z1);
14659371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 116);
14669371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w1, z2);
14676679dcc1SBarry Smith 
14686679dcc1SBarry Smith       /* eleventh column of a */
14696679dcc1SBarry Smith       w2 = _mm256_set1_pd(work[10]);
14709371c9d4SSatish Balay       a0 = _mm256_loadu_pd(v + 120);
14719371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
14729371c9d4SSatish Balay       a1 = _mm256_loadu_pd(v + 124);
14739371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
14749371c9d4SSatish Balay       a2 = _mm256_loadu_pd(v + 128);
14759371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
14766679dcc1SBarry Smith 
14776679dcc1SBarry Smith       /* twelveth column of a */
14786679dcc1SBarry Smith       w3 = _mm256_set1_pd(work[11]);
14799371c9d4SSatish Balay       a3 = _mm256_loadu_pd(v + 132);
14809371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
14819371c9d4SSatish Balay       a4 = _mm256_loadu_pd(v + 136);
14829371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
14839371c9d4SSatish Balay       a5 = _mm256_loadu_pd(v + 140);
14849371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
14856679dcc1SBarry Smith 
14866679dcc1SBarry Smith       v += bs2;
14876679dcc1SBarry Smith     }
14886679dcc1SBarry Smith     if (usecprow) z = zarray + bs * ridx[i];
14899371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
14909371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
14919371c9d4SSatish Balay     _mm256_storeu_pd(&z[8], z2);
14926679dcc1SBarry Smith     if (!usecprow) z += bs;
14936679dcc1SBarry Smith   }
14949566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
14959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
14969566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
14976679dcc1SBarry Smith   PetscFunctionReturn(0);
14986679dcc1SBarry Smith }
14996679dcc1SBarry Smith #endif
15006679dcc1SBarry Smith 
15018ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1502832cc040SShri Abhyankar /* Default MatMult for block size 15 */
15039371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) {
15048ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1505f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
15068ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
150753ef36baSBarry Smith   PetscScalar       *zarray, xv;
15088ab949d8SShri Abhyankar   const MatScalar   *v;
15098ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
15107c565772SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1511ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
15128ab949d8SShri Abhyankar 
15138ab949d8SShri Abhyankar   PetscFunctionBegin;
15149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
15159566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
15168ab949d8SShri Abhyankar 
15178ab949d8SShri Abhyankar   v = a->a;
15188ab949d8SShri Abhyankar   if (usecprow) {
15198ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15208ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15218ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15229566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
15238ab949d8SShri Abhyankar   } else {
15248ab949d8SShri Abhyankar     mbs = a->mbs;
15258ab949d8SShri Abhyankar     ii  = a->i;
15268ab949d8SShri Abhyankar     z   = zarray;
15278ab949d8SShri Abhyankar   }
15288ab949d8SShri Abhyankar 
15298ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
15308ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
15318ab949d8SShri Abhyankar     idx   = ij + ii[i];
15329371c9d4SSatish Balay     sum1  = 0.0;
15339371c9d4SSatish Balay     sum2  = 0.0;
15349371c9d4SSatish Balay     sum3  = 0.0;
15359371c9d4SSatish Balay     sum4  = 0.0;
15369371c9d4SSatish Balay     sum5  = 0.0;
15379371c9d4SSatish Balay     sum6  = 0.0;
15389371c9d4SSatish Balay     sum7  = 0.0;
15399371c9d4SSatish Balay     sum8  = 0.0;
15409371c9d4SSatish Balay     sum9  = 0.0;
15419371c9d4SSatish Balay     sum10 = 0.0;
15429371c9d4SSatish Balay     sum11 = 0.0;
15439371c9d4SSatish Balay     sum12 = 0.0;
15449371c9d4SSatish Balay     sum13 = 0.0;
15459371c9d4SSatish Balay     sum14 = 0.0;
15469371c9d4SSatish Balay     sum15 = 0.0;
15478ab949d8SShri Abhyankar 
15488ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
15498ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
15508ab949d8SShri Abhyankar 
15518ab949d8SShri Abhyankar       for (k = 0; k < 15; k++) {
155253ef36baSBarry Smith         xv = xb[k];
155353ef36baSBarry Smith         sum1 += v[0] * xv;
155453ef36baSBarry Smith         sum2 += v[1] * xv;
155553ef36baSBarry Smith         sum3 += v[2] * xv;
155653ef36baSBarry Smith         sum4 += v[3] * xv;
155753ef36baSBarry Smith         sum5 += v[4] * xv;
155853ef36baSBarry Smith         sum6 += v[5] * xv;
155953ef36baSBarry Smith         sum7 += v[6] * xv;
156053ef36baSBarry Smith         sum8 += v[7] * xv;
156153ef36baSBarry Smith         sum9 += v[8] * xv;
156253ef36baSBarry Smith         sum10 += v[9] * xv;
156353ef36baSBarry Smith         sum11 += v[10] * xv;
156453ef36baSBarry Smith         sum12 += v[11] * xv;
156553ef36baSBarry Smith         sum13 += v[12] * xv;
156653ef36baSBarry Smith         sum14 += v[13] * xv;
156753ef36baSBarry Smith         sum15 += v[14] * xv;
15688ab949d8SShri Abhyankar         v += 15;
15698ab949d8SShri Abhyankar       }
15708ab949d8SShri Abhyankar     }
15718ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
15729371c9d4SSatish Balay     z[0]  = sum1;
15739371c9d4SSatish Balay     z[1]  = sum2;
15749371c9d4SSatish Balay     z[2]  = sum3;
15759371c9d4SSatish Balay     z[3]  = sum4;
15769371c9d4SSatish Balay     z[4]  = sum5;
15779371c9d4SSatish Balay     z[5]  = sum6;
15789371c9d4SSatish Balay     z[6]  = sum7;
15799371c9d4SSatish Balay     z[7]  = sum8;
15809371c9d4SSatish Balay     z[8]  = sum9;
15819371c9d4SSatish Balay     z[9]  = sum10;
15829371c9d4SSatish Balay     z[10] = sum11;
15839371c9d4SSatish Balay     z[11] = sum12;
15849371c9d4SSatish Balay     z[12] = sum13;
15859371c9d4SSatish Balay     z[13] = sum14;
15869371c9d4SSatish Balay     z[14] = sum15;
15878ab949d8SShri Abhyankar 
15888ab949d8SShri Abhyankar     if (!usecprow) z += 15;
15898ab949d8SShri Abhyankar   }
15908ab949d8SShri Abhyankar 
15919566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
15929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
15939566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
15948ab949d8SShri Abhyankar   PetscFunctionReturn(0);
15958ab949d8SShri Abhyankar }
15968ab949d8SShri Abhyankar 
15978ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
15989371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) {
15998ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1600f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
16018ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
16020b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, *zarray;
16038ab949d8SShri Abhyankar   const MatScalar   *v;
16048ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
16057c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1606ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
16078ab949d8SShri Abhyankar 
16088ab949d8SShri Abhyankar   PetscFunctionBegin;
16099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
16109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
16118ab949d8SShri Abhyankar 
16128ab949d8SShri Abhyankar   v = a->a;
16138ab949d8SShri Abhyankar   if (usecprow) {
16148ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
16158ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
16168ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
16179566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
16188ab949d8SShri Abhyankar   } else {
16198ab949d8SShri Abhyankar     mbs = a->mbs;
16208ab949d8SShri Abhyankar     ii  = a->i;
16218ab949d8SShri Abhyankar     z   = zarray;
16228ab949d8SShri Abhyankar   }
16238ab949d8SShri Abhyankar 
16248ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
16258ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
16268ab949d8SShri Abhyankar     idx   = ij + ii[i];
16279371c9d4SSatish Balay     sum1  = 0.0;
16289371c9d4SSatish Balay     sum2  = 0.0;
16299371c9d4SSatish Balay     sum3  = 0.0;
16309371c9d4SSatish Balay     sum4  = 0.0;
16319371c9d4SSatish Balay     sum5  = 0.0;
16329371c9d4SSatish Balay     sum6  = 0.0;
16339371c9d4SSatish Balay     sum7  = 0.0;
16349371c9d4SSatish Balay     sum8  = 0.0;
16359371c9d4SSatish Balay     sum9  = 0.0;
16369371c9d4SSatish Balay     sum10 = 0.0;
16379371c9d4SSatish Balay     sum11 = 0.0;
16389371c9d4SSatish Balay     sum12 = 0.0;
16399371c9d4SSatish Balay     sum13 = 0.0;
16409371c9d4SSatish Balay     sum14 = 0.0;
16419371c9d4SSatish Balay     sum15 = 0.0;
16428ab949d8SShri Abhyankar 
16438ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
16448ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
16459371c9d4SSatish Balay       x1 = xb[0];
16469371c9d4SSatish Balay       x2 = xb[1];
16479371c9d4SSatish Balay       x3 = xb[2];
16489371c9d4SSatish Balay       x4 = xb[3];
16498ab949d8SShri Abhyankar 
16508ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16518ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16528ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16538ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16548ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16558ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16568ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16578ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16588ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16598ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16608ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16618ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16628ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16638ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16648ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16658ab949d8SShri Abhyankar 
16668ab949d8SShri Abhyankar       v += 60;
16678ab949d8SShri Abhyankar 
16689371c9d4SSatish Balay       x1 = xb[4];
16699371c9d4SSatish Balay       x2 = xb[5];
16709371c9d4SSatish Balay       x3 = xb[6];
16719371c9d4SSatish Balay       x4 = xb[7];
16728ab949d8SShri Abhyankar 
16738ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16748ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16758ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16768ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16778ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16788ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16798ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16808ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16818ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16828ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16838ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16848ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16858ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16868ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16878ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16888ab949d8SShri Abhyankar       v += 60;
16898ab949d8SShri Abhyankar 
16909371c9d4SSatish Balay       x1 = xb[8];
16919371c9d4SSatish Balay       x2 = xb[9];
16929371c9d4SSatish Balay       x3 = xb[10];
16939371c9d4SSatish Balay       x4 = xb[11];
16940b8f6341SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16950b8f6341SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16960b8f6341SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16970b8f6341SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16980b8f6341SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16990b8f6341SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17000b8f6341SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17010b8f6341SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17020b8f6341SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17030b8f6341SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17040b8f6341SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17050b8f6341SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17060b8f6341SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17070b8f6341SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17080b8f6341SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17090b8f6341SShri Abhyankar       v += 60;
17100b8f6341SShri Abhyankar 
17119371c9d4SSatish Balay       x1 = xb[12];
17129371c9d4SSatish Balay       x2 = xb[13];
17139371c9d4SSatish Balay       x3 = xb[14];
17148ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
17158ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
17168ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
17178ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
17188ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
17198ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
17208ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
17218ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
17228ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
17238ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
17248ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
17258ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
17268ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
17278ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
17288ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
17298ab949d8SShri Abhyankar       v += 45;
17308ab949d8SShri Abhyankar     }
17318ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
17329371c9d4SSatish Balay     z[0]  = sum1;
17339371c9d4SSatish Balay     z[1]  = sum2;
17349371c9d4SSatish Balay     z[2]  = sum3;
17359371c9d4SSatish Balay     z[3]  = sum4;
17369371c9d4SSatish Balay     z[4]  = sum5;
17379371c9d4SSatish Balay     z[5]  = sum6;
17389371c9d4SSatish Balay     z[6]  = sum7;
17399371c9d4SSatish Balay     z[7]  = sum8;
17409371c9d4SSatish Balay     z[8]  = sum9;
17419371c9d4SSatish Balay     z[9]  = sum10;
17429371c9d4SSatish Balay     z[10] = sum11;
17439371c9d4SSatish Balay     z[11] = sum12;
17449371c9d4SSatish Balay     z[12] = sum13;
17459371c9d4SSatish Balay     z[13] = sum14;
17469371c9d4SSatish Balay     z[14] = sum15;
17478ab949d8SShri Abhyankar 
17488ab949d8SShri Abhyankar     if (!usecprow) z += 15;
17498ab949d8SShri Abhyankar   }
17508ab949d8SShri Abhyankar 
17519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
17529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
17539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
17548ab949d8SShri Abhyankar   PetscFunctionReturn(0);
17558ab949d8SShri Abhyankar }
17568ab949d8SShri Abhyankar 
17578ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
17589371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) {
17598ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1760f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
17618ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
17620b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
17638ab949d8SShri Abhyankar   const MatScalar   *v;
17648ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
17657c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1766ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
17678ab949d8SShri Abhyankar 
17688ab949d8SShri Abhyankar   PetscFunctionBegin;
17699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
17709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
17718ab949d8SShri Abhyankar 
17728ab949d8SShri Abhyankar   v = a->a;
17738ab949d8SShri Abhyankar   if (usecprow) {
17748ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
17758ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
17768ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
17779566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
17788ab949d8SShri Abhyankar   } else {
17798ab949d8SShri Abhyankar     mbs = a->mbs;
17808ab949d8SShri Abhyankar     ii  = a->i;
17818ab949d8SShri Abhyankar     z   = zarray;
17828ab949d8SShri Abhyankar   }
17838ab949d8SShri Abhyankar 
17848ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
17858ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
17868ab949d8SShri Abhyankar     idx   = ij + ii[i];
17879371c9d4SSatish Balay     sum1  = 0.0;
17889371c9d4SSatish Balay     sum2  = 0.0;
17899371c9d4SSatish Balay     sum3  = 0.0;
17909371c9d4SSatish Balay     sum4  = 0.0;
17919371c9d4SSatish Balay     sum5  = 0.0;
17929371c9d4SSatish Balay     sum6  = 0.0;
17939371c9d4SSatish Balay     sum7  = 0.0;
17949371c9d4SSatish Balay     sum8  = 0.0;
17959371c9d4SSatish Balay     sum9  = 0.0;
17969371c9d4SSatish Balay     sum10 = 0.0;
17979371c9d4SSatish Balay     sum11 = 0.0;
17989371c9d4SSatish Balay     sum12 = 0.0;
17999371c9d4SSatish Balay     sum13 = 0.0;
18009371c9d4SSatish Balay     sum14 = 0.0;
18019371c9d4SSatish Balay     sum15 = 0.0;
18028ab949d8SShri Abhyankar 
18038ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
18048ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
18059371c9d4SSatish Balay       x1 = xb[0];
18069371c9d4SSatish Balay       x2 = xb[1];
18079371c9d4SSatish Balay       x3 = xb[2];
18089371c9d4SSatish Balay       x4 = xb[3];
18099371c9d4SSatish Balay       x5 = xb[4];
18109371c9d4SSatish Balay       x6 = xb[5];
18119371c9d4SSatish Balay       x7 = xb[6];
18120b8f6341SShri Abhyankar       x8 = xb[7];
18138ab949d8SShri Abhyankar 
18148ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
18158ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
18168ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
18178ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
18188ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
18198ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
18208ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
18218ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
18228ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
18238ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
18248ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
18258ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
18268ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
18278ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
18288ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
18298ab949d8SShri Abhyankar       v += 120;
18308ab949d8SShri Abhyankar 
18319371c9d4SSatish Balay       x1 = xb[8];
18329371c9d4SSatish Balay       x2 = xb[9];
18339371c9d4SSatish Balay       x3 = xb[10];
18349371c9d4SSatish Balay       x4 = xb[11];
18359371c9d4SSatish Balay       x5 = xb[12];
18369371c9d4SSatish Balay       x6 = xb[13];
18379371c9d4SSatish Balay       x7 = xb[14];
18380b8f6341SShri Abhyankar 
18398ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
18408ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
18418ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
18428ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
18438ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
18448ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
18458ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
18468ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
18478ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
18488ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
18498ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
18508ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
18518ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
18528ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
18538ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
18548ab949d8SShri Abhyankar       v += 105;
18558ab949d8SShri Abhyankar     }
18568ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
18579371c9d4SSatish Balay     z[0]  = sum1;
18589371c9d4SSatish Balay     z[1]  = sum2;
18599371c9d4SSatish Balay     z[2]  = sum3;
18609371c9d4SSatish Balay     z[3]  = sum4;
18619371c9d4SSatish Balay     z[4]  = sum5;
18629371c9d4SSatish Balay     z[5]  = sum6;
18639371c9d4SSatish Balay     z[6]  = sum7;
18649371c9d4SSatish Balay     z[7]  = sum8;
18659371c9d4SSatish Balay     z[8]  = sum9;
18669371c9d4SSatish Balay     z[9]  = sum10;
18679371c9d4SSatish Balay     z[10] = sum11;
18689371c9d4SSatish Balay     z[11] = sum12;
18699371c9d4SSatish Balay     z[12] = sum13;
18709371c9d4SSatish Balay     z[13] = sum14;
18719371c9d4SSatish Balay     z[14] = sum15;
18728ab949d8SShri Abhyankar 
18738ab949d8SShri Abhyankar     if (!usecprow) z += 15;
18748ab949d8SShri Abhyankar   }
18758ab949d8SShri Abhyankar 
18769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
18779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
18789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
18798ab949d8SShri Abhyankar   PetscFunctionReturn(0);
18808ab949d8SShri Abhyankar }
18818ab949d8SShri Abhyankar 
18828ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
18839371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
18848ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1885f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
18868ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
18878ab949d8SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
18888ab949d8SShri Abhyankar   const MatScalar   *v;
18898ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
18907c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1891ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
18928ab949d8SShri Abhyankar 
18938ab949d8SShri Abhyankar   PetscFunctionBegin;
18949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
18959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
18968ab949d8SShri Abhyankar 
18978ab949d8SShri Abhyankar   v = a->a;
18988ab949d8SShri Abhyankar   if (usecprow) {
18998ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
19008ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
19018ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
19029566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
19038ab949d8SShri Abhyankar   } else {
19048ab949d8SShri Abhyankar     mbs = a->mbs;
19058ab949d8SShri Abhyankar     ii  = a->i;
19068ab949d8SShri Abhyankar     z   = zarray;
19078ab949d8SShri Abhyankar   }
19088ab949d8SShri Abhyankar 
19098ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
19108ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
19118ab949d8SShri Abhyankar     idx   = ij + ii[i];
19129371c9d4SSatish Balay     sum1  = 0.0;
19139371c9d4SSatish Balay     sum2  = 0.0;
19149371c9d4SSatish Balay     sum3  = 0.0;
19159371c9d4SSatish Balay     sum4  = 0.0;
19169371c9d4SSatish Balay     sum5  = 0.0;
19179371c9d4SSatish Balay     sum6  = 0.0;
19189371c9d4SSatish Balay     sum7  = 0.0;
19199371c9d4SSatish Balay     sum8  = 0.0;
19209371c9d4SSatish Balay     sum9  = 0.0;
19219371c9d4SSatish Balay     sum10 = 0.0;
19229371c9d4SSatish Balay     sum11 = 0.0;
19239371c9d4SSatish Balay     sum12 = 0.0;
19249371c9d4SSatish Balay     sum13 = 0.0;
19259371c9d4SSatish Balay     sum14 = 0.0;
19269371c9d4SSatish Balay     sum15 = 0.0;
19278ab949d8SShri Abhyankar 
19288ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
19298ab949d8SShri Abhyankar       xb  = x + 15 * (idx[j]);
19309371c9d4SSatish Balay       x1  = xb[0];
19319371c9d4SSatish Balay       x2  = xb[1];
19329371c9d4SSatish Balay       x3  = xb[2];
19339371c9d4SSatish Balay       x4  = xb[3];
19349371c9d4SSatish Balay       x5  = xb[4];
19359371c9d4SSatish Balay       x6  = xb[5];
19369371c9d4SSatish Balay       x7  = xb[6];
19379371c9d4SSatish Balay       x8  = xb[7];
19389371c9d4SSatish Balay       x9  = xb[8];
19399371c9d4SSatish Balay       x10 = xb[9];
19409371c9d4SSatish Balay       x11 = xb[10];
19419371c9d4SSatish Balay       x12 = xb[11];
19429371c9d4SSatish Balay       x13 = xb[12];
19439371c9d4SSatish Balay       x14 = xb[13];
19449371c9d4SSatish Balay       x15 = xb[14];
19458ab949d8SShri Abhyankar 
19468ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
19478ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
19488ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
19498ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
19508ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
19518ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
19528ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
19538ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
19548ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
19558ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
19568ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
19578ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
19588ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
19598ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
19608ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
19618ab949d8SShri Abhyankar       v += 225;
19628ab949d8SShri Abhyankar     }
19638ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
19649371c9d4SSatish Balay     z[0]  = sum1;
19659371c9d4SSatish Balay     z[1]  = sum2;
19669371c9d4SSatish Balay     z[2]  = sum3;
19679371c9d4SSatish Balay     z[3]  = sum4;
19689371c9d4SSatish Balay     z[4]  = sum5;
19699371c9d4SSatish Balay     z[5]  = sum6;
19709371c9d4SSatish Balay     z[6]  = sum7;
19719371c9d4SSatish Balay     z[7]  = sum8;
19729371c9d4SSatish Balay     z[8]  = sum9;
19739371c9d4SSatish Balay     z[9]  = sum10;
19749371c9d4SSatish Balay     z[10] = sum11;
19759371c9d4SSatish Balay     z[11] = sum12;
19769371c9d4SSatish Balay     z[12] = sum13;
19779371c9d4SSatish Balay     z[13] = sum14;
19789371c9d4SSatish Balay     z[14] = sum15;
19798ab949d8SShri Abhyankar 
19808ab949d8SShri Abhyankar     if (!usecprow) z += 15;
19818ab949d8SShri Abhyankar   }
19828ab949d8SShri Abhyankar 
19839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
19849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
19859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
19868ab949d8SShri Abhyankar   PetscFunctionReturn(0);
19878ab949d8SShri Abhyankar }
19888ab949d8SShri Abhyankar 
19893f1db9ecSBarry Smith /*
19903f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
19913f1db9ecSBarry Smith */
19929371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
19932d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1994f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
1995d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1996d9ca1df4SBarry Smith   const MatScalar   *v;
1997d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1998d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
1999d9ca1df4SBarry Smith   PetscInt           ncols, k;
2000ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20012d61bbb3SSatish Balay 
20022d61bbb3SSatish Balay   PetscFunctionBegin;
20039566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
20052d61bbb3SSatish Balay 
20062d61bbb3SSatish Balay   idx = a->j;
20072d61bbb3SSatish Balay   v   = a->a;
200826e093fcSHong Zhang   if (usecprow) {
200926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
201026e093fcSHong Zhang     ii   = a->compressedrow.i;
20117b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
20129566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
201326e093fcSHong Zhang   } else {
201426e093fcSHong Zhang     mbs = a->mbs;
20152d61bbb3SSatish Balay     ii  = a->i;
201626e093fcSHong Zhang     z   = zarray;
201726e093fcSHong Zhang   }
2018218c64b6SSatish Balay 
20192d61bbb3SSatish Balay   if (!a->mult_work) {
2020d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
20219566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
20222d61bbb3SSatish Balay   }
20232d61bbb3SSatish Balay   work = a->mult_work;
20242d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
20259371c9d4SSatish Balay     n = ii[1] - ii[0];
20269371c9d4SSatish Balay     ii++;
20272d61bbb3SSatish Balay     ncols = n * bs;
20282d61bbb3SSatish Balay     workt = work;
20292d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
20302d61bbb3SSatish Balay       xb = x + bs * (*idx++);
20312d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
20322d61bbb3SSatish Balay       workt += bs;
20332d61bbb3SSatish Balay     }
20347b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
203596b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
20362d61bbb3SSatish Balay     v += n * bs2;
203726e093fcSHong Zhang     if (!usecprow) z += bs;
20382d61bbb3SSatish Balay   }
20399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20419566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
20422d61bbb3SSatish Balay   PetscFunctionReturn(0);
20432d61bbb3SSatish Balay }
20442d61bbb3SSatish Balay 
20459371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
20462d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2047122f12eaSBarry Smith   const PetscScalar *x;
2048122f12eaSBarry Smith   PetscScalar       *y, *z, sum;
2049122f12eaSBarry Smith   const MatScalar   *v;
20507c565772SBarry Smith   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2051122f12eaSBarry Smith   const PetscInt    *idx, *ii;
2052ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20532d61bbb3SSatish Balay 
20542d61bbb3SSatish Balay   PetscFunctionBegin;
20559566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
20572d61bbb3SSatish Balay 
20582d61bbb3SSatish Balay   idx = a->j;
20592d61bbb3SSatish Balay   v   = a->a;
206026e093fcSHong Zhang   if (usecprow) {
2061*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
206226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
206326e093fcSHong Zhang     ii   = a->compressedrow.i;
20647b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
206526e093fcSHong Zhang   } else {
20662d61bbb3SSatish Balay     ii = a->i;
206726e093fcSHong Zhang   }
20682d61bbb3SSatish Balay 
20692d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2070122f12eaSBarry Smith     n = ii[1] - ii[0];
2071122f12eaSBarry Smith     ii++;
207226e093fcSHong Zhang     if (!usecprow) {
2073122f12eaSBarry Smith       sum = y[i];
2074122f12eaSBarry Smith     } else {
2075122f12eaSBarry Smith       sum = y[ridx[i]];
2076122f12eaSBarry Smith     }
2077444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2078444d8c10SJed Brown     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2079122f12eaSBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
2080122f12eaSBarry Smith     v += n;
2081122f12eaSBarry Smith     idx += n;
2082122f12eaSBarry Smith     if (usecprow) {
2083122f12eaSBarry Smith       z[ridx[i]] = sum;
2084122f12eaSBarry Smith     } else {
2085122f12eaSBarry Smith       z[i] = sum;
208626e093fcSHong Zhang     }
20872d61bbb3SSatish Balay   }
20889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
20909566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
20912d61bbb3SSatish Balay   PetscFunctionReturn(0);
20922d61bbb3SSatish Balay }
20932d61bbb3SSatish Balay 
20949371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
20952d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2096f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2097d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
209826e093fcSHong Zhang   PetscScalar        x1, x2, *yarray, *zarray;
2099d9ca1df4SBarry Smith   const MatScalar   *v;
2100d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, j;
2101d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2102ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21032d61bbb3SSatish Balay 
21042d61bbb3SSatish Balay   PetscFunctionBegin;
21059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21069566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21072d61bbb3SSatish Balay 
21082d61bbb3SSatish Balay   idx = a->j;
21092d61bbb3SSatish Balay   v   = a->a;
211026e093fcSHong Zhang   if (usecprow) {
2111*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
211226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
211326e093fcSHong Zhang     ii   = a->compressedrow.i;
21147b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
211526e093fcSHong Zhang   } else {
21162d61bbb3SSatish Balay     ii = a->i;
211726e093fcSHong Zhang     y  = yarray;
211826e093fcSHong Zhang     z  = zarray;
211926e093fcSHong Zhang   }
21202d61bbb3SSatish Balay 
21212d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21229371c9d4SSatish Balay     n = ii[1] - ii[0];
21239371c9d4SSatish Balay     ii++;
212426e093fcSHong Zhang     if (usecprow) {
21257b2bb3b9SHong Zhang       z = zarray + 2 * ridx[i];
21267b2bb3b9SHong Zhang       y = yarray + 2 * ridx[i];
212726e093fcSHong Zhang     }
21289371c9d4SSatish Balay     sum1 = y[0];
21299371c9d4SSatish Balay     sum2 = y[1];
2130444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2131444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21322d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
213326fbe8dcSKarl Rupp       xb = x + 2 * (*idx++);
213426fbe8dcSKarl Rupp       x1 = xb[0];
213526fbe8dcSKarl Rupp       x2 = xb[1];
213626fbe8dcSKarl Rupp 
21372d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
21382d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
21392d61bbb3SSatish Balay       v += 4;
21402d61bbb3SSatish Balay     }
21419371c9d4SSatish Balay     z[0] = sum1;
21429371c9d4SSatish Balay     z[1] = sum2;
214326e093fcSHong Zhang     if (!usecprow) {
21449371c9d4SSatish Balay       z += 2;
21459371c9d4SSatish Balay       y += 2;
21462d61bbb3SSatish Balay     }
214726e093fcSHong Zhang   }
21489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
21509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * a->nz));
21512d61bbb3SSatish Balay   PetscFunctionReturn(0);
21522d61bbb3SSatish Balay }
21532d61bbb3SSatish Balay 
21549371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
21552d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2156f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2157d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2158d9ca1df4SBarry Smith   const MatScalar   *v;
2159d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2160d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2161ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21622d61bbb3SSatish Balay 
21632d61bbb3SSatish Balay   PetscFunctionBegin;
21649566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21662d61bbb3SSatish Balay 
21672d61bbb3SSatish Balay   idx = a->j;
21682d61bbb3SSatish Balay   v   = a->a;
216926e093fcSHong Zhang   if (usecprow) {
2170*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
217126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
217226e093fcSHong Zhang     ii   = a->compressedrow.i;
21737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
217426e093fcSHong Zhang   } else {
21752d61bbb3SSatish Balay     ii = a->i;
217626e093fcSHong Zhang     y  = yarray;
217726e093fcSHong Zhang     z  = zarray;
217826e093fcSHong Zhang   }
21792d61bbb3SSatish Balay 
21802d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21819371c9d4SSatish Balay     n = ii[1] - ii[0];
21829371c9d4SSatish Balay     ii++;
218326e093fcSHong Zhang     if (usecprow) {
21847b2bb3b9SHong Zhang       z = zarray + 3 * ridx[i];
21857b2bb3b9SHong Zhang       y = yarray + 3 * ridx[i];
218626e093fcSHong Zhang     }
21879371c9d4SSatish Balay     sum1 = y[0];
21889371c9d4SSatish Balay     sum2 = y[1];
21899371c9d4SSatish Balay     sum3 = y[2];
2190444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2191444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21922d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
21939371c9d4SSatish Balay       xb = x + 3 * (*idx++);
21949371c9d4SSatish Balay       x1 = xb[0];
21959371c9d4SSatish Balay       x2 = xb[1];
21969371c9d4SSatish Balay       x3 = xb[2];
21972d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
21982d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
21992d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
22002d61bbb3SSatish Balay       v += 9;
22012d61bbb3SSatish Balay     }
22029371c9d4SSatish Balay     z[0] = sum1;
22039371c9d4SSatish Balay     z[1] = sum2;
22049371c9d4SSatish Balay     z[2] = sum3;
220526e093fcSHong Zhang     if (!usecprow) {
22069371c9d4SSatish Balay       z += 3;
22079371c9d4SSatish Balay       y += 3;
22082d61bbb3SSatish Balay     }
220926e093fcSHong Zhang   }
22109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz));
22132d61bbb3SSatish Balay   PetscFunctionReturn(0);
22142d61bbb3SSatish Balay }
22152d61bbb3SSatish Balay 
22169371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
22172d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2218f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2219d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2220d9ca1df4SBarry Smith   const MatScalar   *v;
2221d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2222d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2223ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22242d61bbb3SSatish Balay 
22252d61bbb3SSatish Balay   PetscFunctionBegin;
22269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22282d61bbb3SSatish Balay 
22292d61bbb3SSatish Balay   idx = a->j;
22302d61bbb3SSatish Balay   v   = a->a;
223126e093fcSHong Zhang   if (usecprow) {
2232*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
223326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
223426e093fcSHong Zhang     ii   = a->compressedrow.i;
22357b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
223626e093fcSHong Zhang   } else {
22372d61bbb3SSatish Balay     ii = a->i;
223826e093fcSHong Zhang     y  = yarray;
223926e093fcSHong Zhang     z  = zarray;
224026e093fcSHong Zhang   }
22412d61bbb3SSatish Balay 
22422d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22439371c9d4SSatish Balay     n = ii[1] - ii[0];
22449371c9d4SSatish Balay     ii++;
224526e093fcSHong Zhang     if (usecprow) {
22467b2bb3b9SHong Zhang       z = zarray + 4 * ridx[i];
22477b2bb3b9SHong Zhang       y = yarray + 4 * ridx[i];
224826e093fcSHong Zhang     }
22499371c9d4SSatish Balay     sum1 = y[0];
22509371c9d4SSatish Balay     sum2 = y[1];
22519371c9d4SSatish Balay     sum3 = y[2];
22529371c9d4SSatish Balay     sum4 = y[3];
2253444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2254444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22552d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22562d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
22579371c9d4SSatish Balay       x1 = xb[0];
22589371c9d4SSatish Balay       x2 = xb[1];
22599371c9d4SSatish Balay       x3 = xb[2];
22609371c9d4SSatish Balay       x4 = xb[3];
22612d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
22622d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
22632d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
22642d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
22652d61bbb3SSatish Balay       v += 16;
22662d61bbb3SSatish Balay     }
22679371c9d4SSatish Balay     z[0] = sum1;
22689371c9d4SSatish Balay     z[1] = sum2;
22699371c9d4SSatish Balay     z[2] = sum3;
22709371c9d4SSatish Balay     z[3] = sum4;
227126e093fcSHong Zhang     if (!usecprow) {
22729371c9d4SSatish Balay       z += 4;
22739371c9d4SSatish Balay       y += 4;
22742d61bbb3SSatish Balay     }
227526e093fcSHong Zhang   }
22769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz));
22792d61bbb3SSatish Balay   PetscFunctionReturn(0);
22802d61bbb3SSatish Balay }
22812d61bbb3SSatish Balay 
22829371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
22832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2284f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2285d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
228626e093fcSHong Zhang   PetscScalar       *yarray, *zarray;
2287d9ca1df4SBarry Smith   const MatScalar   *v;
2288d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2289d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2290ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22912d61bbb3SSatish Balay 
22922d61bbb3SSatish Balay   PetscFunctionBegin;
22939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22952d61bbb3SSatish Balay 
22962d61bbb3SSatish Balay   idx = a->j;
22972d61bbb3SSatish Balay   v   = a->a;
229826e093fcSHong Zhang   if (usecprow) {
2299*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
230026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
230126e093fcSHong Zhang     ii   = a->compressedrow.i;
23027b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
230326e093fcSHong Zhang   } else {
23042d61bbb3SSatish Balay     ii = a->i;
230526e093fcSHong Zhang     y  = yarray;
230626e093fcSHong Zhang     z  = zarray;
230726e093fcSHong Zhang   }
23082d61bbb3SSatish Balay 
23092d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
23109371c9d4SSatish Balay     n = ii[1] - ii[0];
23119371c9d4SSatish Balay     ii++;
231226e093fcSHong Zhang     if (usecprow) {
23137b2bb3b9SHong Zhang       z = zarray + 5 * ridx[i];
23147b2bb3b9SHong Zhang       y = yarray + 5 * ridx[i];
231526e093fcSHong Zhang     }
23169371c9d4SSatish Balay     sum1 = y[0];
23179371c9d4SSatish Balay     sum2 = y[1];
23189371c9d4SSatish Balay     sum3 = y[2];
23199371c9d4SSatish Balay     sum4 = y[3];
23209371c9d4SSatish Balay     sum5 = y[4];
2321444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2322444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
23232d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
23242d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
23259371c9d4SSatish Balay       x1 = xb[0];
23269371c9d4SSatish Balay       x2 = xb[1];
23279371c9d4SSatish Balay       x3 = xb[2];
23289371c9d4SSatish Balay       x4 = xb[3];
23299371c9d4SSatish Balay       x5 = xb[4];
23302d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
23312d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
23322d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
23332d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
23342d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
23352d61bbb3SSatish Balay       v += 25;
23362d61bbb3SSatish Balay     }
23379371c9d4SSatish Balay     z[0] = sum1;
23389371c9d4SSatish Balay     z[1] = sum2;
23399371c9d4SSatish Balay     z[2] = sum3;
23409371c9d4SSatish Balay     z[3] = sum4;
23419371c9d4SSatish Balay     z[4] = sum5;
234226e093fcSHong Zhang     if (!usecprow) {
23439371c9d4SSatish Balay       z += 5;
23449371c9d4SSatish Balay       y += 5;
23452d61bbb3SSatish Balay     }
234626e093fcSHong Zhang   }
23479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz));
23502d61bbb3SSatish Balay   PetscFunctionReturn(0);
23512d61bbb3SSatish Balay }
2352c2916339SPierre Jolivet 
23539371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
235415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2355f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2356d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
235726e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2358d9ca1df4SBarry Smith   const MatScalar   *v;
2359d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2360d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2361ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
236215091d37SBarry Smith 
236315091d37SBarry Smith   PetscFunctionBegin;
23649566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
236615091d37SBarry Smith 
236715091d37SBarry Smith   idx = a->j;
236815091d37SBarry Smith   v   = a->a;
236926e093fcSHong Zhang   if (usecprow) {
2370*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
237126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
237226e093fcSHong Zhang     ii   = a->compressedrow.i;
23737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
237426e093fcSHong Zhang   } else {
237515091d37SBarry Smith     ii = a->i;
237626e093fcSHong Zhang     y  = yarray;
237726e093fcSHong Zhang     z  = zarray;
237826e093fcSHong Zhang   }
237915091d37SBarry Smith 
238015091d37SBarry Smith   for (i = 0; i < mbs; i++) {
23819371c9d4SSatish Balay     n = ii[1] - ii[0];
23829371c9d4SSatish Balay     ii++;
238326e093fcSHong Zhang     if (usecprow) {
23847b2bb3b9SHong Zhang       z = zarray + 6 * ridx[i];
23857b2bb3b9SHong Zhang       y = yarray + 6 * ridx[i];
238626e093fcSHong Zhang     }
23879371c9d4SSatish Balay     sum1 = y[0];
23889371c9d4SSatish Balay     sum2 = y[1];
23899371c9d4SSatish Balay     sum3 = y[2];
23909371c9d4SSatish Balay     sum4 = y[3];
23919371c9d4SSatish Balay     sum5 = y[4];
23929371c9d4SSatish Balay     sum6 = y[5];
2393444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2394444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
239515091d37SBarry Smith     for (j = 0; j < n; j++) {
23963b95cb0eSSatish Balay       xb = x + 6 * (*idx++);
23979371c9d4SSatish Balay       x1 = xb[0];
23989371c9d4SSatish Balay       x2 = xb[1];
23999371c9d4SSatish Balay       x3 = xb[2];
24009371c9d4SSatish Balay       x4 = xb[3];
24019371c9d4SSatish Balay       x5 = xb[4];
24029371c9d4SSatish Balay       x6 = xb[5];
240315091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
240415091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
240515091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
240615091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
240715091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
240815091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
240915091d37SBarry Smith       v += 36;
241015091d37SBarry Smith     }
24119371c9d4SSatish Balay     z[0] = sum1;
24129371c9d4SSatish Balay     z[1] = sum2;
24139371c9d4SSatish Balay     z[2] = sum3;
24149371c9d4SSatish Balay     z[3] = sum4;
24159371c9d4SSatish Balay     z[4] = sum5;
24169371c9d4SSatish Balay     z[5] = sum6;
241726e093fcSHong Zhang     if (!usecprow) {
24189371c9d4SSatish Balay       z += 6;
24199371c9d4SSatish Balay       y += 6;
242015091d37SBarry Smith     }
242126e093fcSHong Zhang   }
24229566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
24239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
24249566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz));
242515091d37SBarry Smith   PetscFunctionReturn(0);
242615091d37SBarry Smith }
24272d61bbb3SSatish Balay 
24289371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
24292d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2430f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2431d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
243226e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2433d9ca1df4SBarry Smith   const MatScalar   *v;
2434d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2435d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2436ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
24372d61bbb3SSatish Balay 
24382d61bbb3SSatish Balay   PetscFunctionBegin;
24399566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
24409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
24412d61bbb3SSatish Balay 
24422d61bbb3SSatish Balay   idx = a->j;
24432d61bbb3SSatish Balay   v   = a->a;
244426e093fcSHong Zhang   if (usecprow) {
2445*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
244626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
244726e093fcSHong Zhang     ii   = a->compressedrow.i;
24487b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
244926e093fcSHong Zhang   } else {
24502d61bbb3SSatish Balay     ii = a->i;
245126e093fcSHong Zhang     y  = yarray;
245226e093fcSHong Zhang     z  = zarray;
245326e093fcSHong Zhang   }
24542d61bbb3SSatish Balay 
24552d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
24569371c9d4SSatish Balay     n = ii[1] - ii[0];
24579371c9d4SSatish Balay     ii++;
245826e093fcSHong Zhang     if (usecprow) {
24597b2bb3b9SHong Zhang       z = zarray + 7 * ridx[i];
24607b2bb3b9SHong Zhang       y = yarray + 7 * ridx[i];
246126e093fcSHong Zhang     }
24629371c9d4SSatish Balay     sum1 = y[0];
24639371c9d4SSatish Balay     sum2 = y[1];
24649371c9d4SSatish Balay     sum3 = y[2];
24659371c9d4SSatish Balay     sum4 = y[3];
24669371c9d4SSatish Balay     sum5 = y[4];
24679371c9d4SSatish Balay     sum6 = y[5];
24689371c9d4SSatish Balay     sum7 = y[6];
2469444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2470444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
24712d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
24722d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
24739371c9d4SSatish Balay       x1 = xb[0];
24749371c9d4SSatish Balay       x2 = xb[1];
24759371c9d4SSatish Balay       x3 = xb[2];
24769371c9d4SSatish Balay       x4 = xb[3];
24779371c9d4SSatish Balay       x5 = xb[4];
24789371c9d4SSatish Balay       x6 = xb[5];
24799371c9d4SSatish Balay       x7 = xb[6];
24802d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
24812d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
24822d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
24832d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
24842d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
24852d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
24862d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
24872d61bbb3SSatish Balay       v += 49;
24882d61bbb3SSatish Balay     }
24899371c9d4SSatish Balay     z[0] = sum1;
24909371c9d4SSatish Balay     z[1] = sum2;
24919371c9d4SSatish Balay     z[2] = sum3;
24929371c9d4SSatish Balay     z[3] = sum4;
24939371c9d4SSatish Balay     z[4] = sum5;
24949371c9d4SSatish Balay     z[5] = sum6;
24959371c9d4SSatish Balay     z[6] = sum7;
249626e093fcSHong Zhang     if (!usecprow) {
24979371c9d4SSatish Balay       z += 7;
24989371c9d4SSatish Balay       y += 7;
24992d61bbb3SSatish Balay     }
250026e093fcSHong Zhang   }
25019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
25029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
25039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz));
25042d61bbb3SSatish Balay   PetscFunctionReturn(0);
25052d61bbb3SSatish Balay }
2506218c64b6SSatish Balay 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMultAdd_SeqBAIJ_9_AVX2 - Multiply-add kernel for 9x9 blocks written with AVX2/FMA
  intrinsics: zz is first set to yy, then every visited block row of zz accumulates the sum,
  over the blocks of that row, of (9x9 block) * (9-entry segment of xx).

  Input Parameters:
+ A  - the matrix; A->data is accessed as a Mat_SeqBAIJ
. xx - vector whose 9-entry segments are multiplied by the blocks
- yy - vector that is added to the product (copied into zz with VecCopy() up front)

  Output Parameter:
. zz - result vector

  Notes:
  A block is read as 81 consecutive values, 9 per column. The 9 results of a block row are held
  in three 256-bit accumulators: z0 (entries 0-3), z1 (entries 4-7) and z2, of which only
  element 0 (entry 8) is ever written back; the other elements of z2 carry values that are
  discarded.

  The x segments needed by a block row are first gathered contiguously into a->mult_work, which
  is allocated here on first use.

  Only compiled when the guard above holds (AVX2 + FMA, real double scalars, 32-bit indices).
*/
PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, j, n;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;
  const PetscInt    *idx, *ii, *ridx = NULL;
  const PetscInt     bs = 9, bs2 = 81;

  __m256d a0, a1, a2, a3, a4, a5;
  __m256d w0, w1, w2, w3;
  __m256d z0, z1, z2;
  /*
     Selects element 0 only (the last argument of _mm256_set_epi64x() is element 0). The masked
     load/store intrinsics inspect just the most significant bit of each 64-bit element, so -1LL
     selects the same element as the sign-bit-only pattern while avoiding the undefined
     behaviour of shifting a 1 into the sign bit of a signed long long (1LL << 63).
  */
  __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  /* zz starts as yy; the loop below accumulates A*xx on top of it in place */
  PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    /* visit only the block rows recorded in the compressed row structure */
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;

    /* gather the x segments of this block row contiguously: work[j*9 .. j*9+8] belongs to block j */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    /* current values of the 9 result entries; z[8] is broadcast, only element 0 of z2 is stored later */
    z0 = _mm256_loadu_pd(&z[0]);
    z1 = _mm256_loadu_pd(&z[4]);
    z2 = _mm256_set1_pd(z[8]);

    for (j = 0; j < n; j++) {
      /* first column of a */
      w0 = _mm256_set1_pd(work[j * 9]);
      a0 = _mm256_loadu_pd(&v[j * 81]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
      z2 = _mm256_fmadd_pd(a2, w0, z2);

      /* second column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 1]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* third column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 2]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
      z0 = _mm256_fmadd_pd(a3, w2, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
      z1 = _mm256_fmadd_pd(a4, w2, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
      z2 = _mm256_fmadd_pd(a5, w2, z2);

      /* fourth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 3]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
      z0 = _mm256_fmadd_pd(a0, w3, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
      z1 = _mm256_fmadd_pd(a1, w3, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
      z2 = _mm256_fmadd_pd(a2, w3, z2);

      /* fifth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 4]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
      z0 = _mm256_fmadd_pd(a3, w0, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
      z1 = _mm256_fmadd_pd(a4, w0, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
      z2 = _mm256_fmadd_pd(a5, w0, z2);

      /* sixth column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 5]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* seventh column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 6]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
      z0 = _mm256_fmadd_pd(a0, w2, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
      z1 = _mm256_fmadd_pd(a1, w2, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
      z2 = _mm256_fmadd_pd(a2, w2, z2);

      /* eighth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 7]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
      z0 = _mm256_fmadd_pd(a3, w3, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
      z1 = _mm256_fmadd_pd(a4, w3, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
      z2 = _mm256_fmadd_pd(a5, w3, z2);

      /* ninth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 8]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      /* v[j*81 + 80] is the last value of the block: a masked load reads that single element only */
      a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
      z2 = _mm256_fmadd_pd(a2, w0, z2);
    }

    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    /* write back z[8] only, leaving the memory after it (the next block row) untouched */
    _mm256_maskstore_pd(&z[8], mask1, z2);

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &zarray));
  PetscCall(PetscLogFlops(162.0 * a->nz)); /* 81 multiplies + 81 adds per stored block */
  PetscFunctionReturn(0);
}
#endif
265796e086a2SDaniel Kokron 
26589371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
2659ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2660f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2661ebada01fSBarry Smith   const PetscScalar *x, *xb;
2662ebada01fSBarry Smith   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2663ebada01fSBarry Smith   const MatScalar   *v;
2664ebada01fSBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2665ebada01fSBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2666ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2667ebada01fSBarry Smith 
2668ebada01fSBarry Smith   PetscFunctionBegin;
26699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
26709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2671ebada01fSBarry Smith 
2672ebada01fSBarry Smith   idx = a->j;
2673ebada01fSBarry Smith   v   = a->a;
2674ebada01fSBarry Smith   if (usecprow) {
2675*48a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2676ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
2677ebada01fSBarry Smith     ii   = a->compressedrow.i;
2678ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
2679ebada01fSBarry Smith   } else {
2680ebada01fSBarry Smith     ii = a->i;
2681ebada01fSBarry Smith     y  = yarray;
2682ebada01fSBarry Smith     z  = zarray;
2683ebada01fSBarry Smith   }
2684ebada01fSBarry Smith 
2685ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
26869371c9d4SSatish Balay     n = ii[1] - ii[0];
26879371c9d4SSatish Balay     ii++;
2688ebada01fSBarry Smith     if (usecprow) {
2689ebada01fSBarry Smith       z = zarray + 11 * ridx[i];
2690ebada01fSBarry Smith       y = yarray + 11 * ridx[i];
2691ebada01fSBarry Smith     }
26929371c9d4SSatish Balay     sum1  = y[0];
26939371c9d4SSatish Balay     sum2  = y[1];
26949371c9d4SSatish Balay     sum3  = y[2];
26959371c9d4SSatish Balay     sum4  = y[3];
26969371c9d4SSatish Balay     sum5  = y[4];
26979371c9d4SSatish Balay     sum6  = y[5];
26989371c9d4SSatish Balay     sum7  = y[6];
26999371c9d4SSatish Balay     sum8  = y[7];
27009371c9d4SSatish Balay     sum9  = y[8];
27019371c9d4SSatish Balay     sum10 = y[9];
27029371c9d4SSatish Balay     sum11 = y[10];
2703ebada01fSBarry Smith     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2704ebada01fSBarry Smith     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2705ebada01fSBarry Smith     for (j = 0; j < n; j++) {
2706ebada01fSBarry Smith       xb  = x + 11 * (*idx++);
27079371c9d4SSatish Balay       x1  = xb[0];
27089371c9d4SSatish Balay       x2  = xb[1];
27099371c9d4SSatish Balay       x3  = xb[2];
27109371c9d4SSatish Balay       x4  = xb[3];
27119371c9d4SSatish Balay       x5  = xb[4];
27129371c9d4SSatish Balay       x6  = xb[5];
27139371c9d4SSatish Balay       x7  = xb[6];
27149371c9d4SSatish Balay       x8  = xb[7];
27159371c9d4SSatish Balay       x9  = xb[8];
27169371c9d4SSatish Balay       x10 = xb[9];
27179371c9d4SSatish Balay       x11 = xb[10];
2718ebada01fSBarry Smith       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2719ebada01fSBarry Smith       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2720ebada01fSBarry Smith       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2721ebada01fSBarry Smith       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2722ebada01fSBarry Smith       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2723ebada01fSBarry Smith       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2724ebada01fSBarry Smith       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2725ebada01fSBarry Smith       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2726ebada01fSBarry Smith       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2727ebada01fSBarry Smith       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2728ebada01fSBarry Smith       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2729ebada01fSBarry Smith       v += 121;
2730ebada01fSBarry Smith     }
27319371c9d4SSatish Balay     z[0]  = sum1;
27329371c9d4SSatish Balay     z[1]  = sum2;
27339371c9d4SSatish Balay     z[2]  = sum3;
27349371c9d4SSatish Balay     z[3]  = sum4;
27359371c9d4SSatish Balay     z[4]  = sum5;
27369371c9d4SSatish Balay     z[5]  = sum6;
27379371c9d4SSatish Balay     z[6]  = sum7;
27389371c9d4SSatish Balay     z[7]  = sum8;
27399371c9d4SSatish Balay     z[8]  = sum9;
27409371c9d4SSatish Balay     z[9]  = sum10;
27419371c9d4SSatish Balay     z[10] = sum11;
2742ebada01fSBarry Smith     if (!usecprow) {
27439371c9d4SSatish Balay       z += 11;
27449371c9d4SSatish Balay       y += 11;
2745ebada01fSBarry Smith     }
2746ebada01fSBarry Smith   }
27479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
27489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
27499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz));
2750ebada01fSBarry Smith   PetscFunctionReturn(0);
2751ebada01fSBarry Smith }
2752ebada01fSBarry Smith 
27539371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
27542d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2755f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2756d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2757d9ca1df4SBarry Smith   const MatScalar   *v;
2758d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2759d9ca1df4SBarry Smith   PetscInt           ncols, k;
2760d9ca1df4SBarry Smith   const PetscInt    *ridx     = NULL, *idx, *ii;
2761ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2762218c64b6SSatish Balay 
27632d61bbb3SSatish Balay   PetscFunctionBegin;
27649566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
27659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
27669566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
27672d61bbb3SSatish Balay 
27682d61bbb3SSatish Balay   idx = a->j;
27692d61bbb3SSatish Balay   v   = a->a;
277026e093fcSHong Zhang   if (usecprow) {
277126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
277226e093fcSHong Zhang     ii   = a->compressedrow.i;
27737b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
277426e093fcSHong Zhang   } else {
277526e093fcSHong Zhang     mbs = a->mbs;
27762d61bbb3SSatish Balay     ii  = a->i;
277726e093fcSHong Zhang     z   = zarray;
277826e093fcSHong Zhang   }
27792d61bbb3SSatish Balay 
27802d61bbb3SSatish Balay   if (!a->mult_work) {
2781d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
27829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
27832d61bbb3SSatish Balay   }
27842d61bbb3SSatish Balay   work = a->mult_work;
27852d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
27869371c9d4SSatish Balay     n = ii[1] - ii[0];
27879371c9d4SSatish Balay     ii++;
27882d61bbb3SSatish Balay     ncols = n * bs;
27892d61bbb3SSatish Balay     workt = work;
27902d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
27912d61bbb3SSatish Balay       xb = x + bs * (*idx++);
27922d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
27932d61bbb3SSatish Balay       workt += bs;
27942d61bbb3SSatish Balay     }
27957b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
279696b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
27972d61bbb3SSatish Balay     v += n * bs2;
279826fbe8dcSKarl Rupp     if (!usecprow) z += bs;
279926e093fcSHong Zhang   }
28009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
28019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
28029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
28032d61bbb3SSatish Balay   PetscFunctionReturn(0);
28042d61bbb3SSatish Balay }
28052d61bbb3SSatish Balay 
28069371c9d4SSatish Balay PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2807547795f9SHong Zhang   PetscScalar zero = 0.0;
2808547795f9SHong Zhang 
2809547795f9SHong Zhang   PetscFunctionBegin;
28109566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28119566063dSJacob Faibussowitsch   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812547795f9SHong Zhang   PetscFunctionReturn(0);
2813547795f9SHong Zhang }
2814547795f9SHong Zhang 
28159371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
28163447b6efSHong Zhang   PetscScalar zero = 0.0;
28172d61bbb3SSatish Balay 
28182d61bbb3SSatish Balay   PetscFunctionBegin;
28199566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28209566063dSJacob Faibussowitsch   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
28212d61bbb3SSatish Balay   PetscFunctionReturn(0);
28222d61bbb3SSatish Balay }
28232d61bbb3SSatish Balay 
28249371c9d4SSatish Balay PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2825547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2826b8c08b77SHong Zhang   PetscScalar       *z, x1, x2, x3, x4, x5;
2827d9ca1df4SBarry Smith   const PetscScalar *x, *xb = NULL;
2828d9ca1df4SBarry Smith   const MatScalar   *v;
2829b8c08b77SHong Zhang   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2830d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2831547795f9SHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2832ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
2833547795f9SHong Zhang 
2834547795f9SHong Zhang   PetscFunctionBegin;
28359566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
28369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28379566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
2838547795f9SHong Zhang 
2839547795f9SHong Zhang   idx = a->j;
2840547795f9SHong Zhang   v   = a->a;
2841547795f9SHong Zhang   if (usecprow) {
2842547795f9SHong Zhang     mbs  = cprow.nrows;
2843547795f9SHong Zhang     ii   = cprow.i;
2844547795f9SHong Zhang     ridx = cprow.rindex;
2845547795f9SHong Zhang   } else {
2846547795f9SHong Zhang     mbs = a->mbs;
2847547795f9SHong Zhang     ii  = a->i;
2848547795f9SHong Zhang     xb  = x;
2849547795f9SHong Zhang   }
2850547795f9SHong Zhang 
2851547795f9SHong Zhang   switch (bs) {
2852547795f9SHong Zhang   case 1:
2853547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2854547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2855547795f9SHong Zhang       x1 = xb[0];
2856547795f9SHong Zhang       ib = idx + ii[0];
28579371c9d4SSatish Balay       n  = ii[1] - ii[0];
28589371c9d4SSatish Balay       ii++;
2859547795f9SHong Zhang       for (j = 0; j < n; j++) {
2860547795f9SHong Zhang         rval = ib[j];
2861547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2862547795f9SHong Zhang         v++;
2863547795f9SHong Zhang       }
2864547795f9SHong Zhang       if (!usecprow) xb++;
2865547795f9SHong Zhang     }
2866547795f9SHong Zhang     break;
2867547795f9SHong Zhang   case 2:
2868547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2869547795f9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
28709371c9d4SSatish Balay       x1 = xb[0];
28719371c9d4SSatish Balay       x2 = xb[1];
2872547795f9SHong Zhang       ib = idx + ii[0];
28739371c9d4SSatish Balay       n  = ii[1] - ii[0];
28749371c9d4SSatish Balay       ii++;
2875547795f9SHong Zhang       for (j = 0; j < n; j++) {
2876547795f9SHong Zhang         rval = ib[j] * 2;
2877547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878547795f9SHong Zhang         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879547795f9SHong Zhang         v += 4;
2880547795f9SHong Zhang       }
2881547795f9SHong Zhang       if (!usecprow) xb += 2;
2882547795f9SHong Zhang     }
2883547795f9SHong Zhang     break;
2884547795f9SHong Zhang   case 3:
2885547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2886547795f9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
28879371c9d4SSatish Balay       x1 = xb[0];
28889371c9d4SSatish Balay       x2 = xb[1];
28899371c9d4SSatish Balay       x3 = xb[2];
2890547795f9SHong Zhang       ib = idx + ii[0];
28919371c9d4SSatish Balay       n  = ii[1] - ii[0];
28929371c9d4SSatish Balay       ii++;
2893547795f9SHong Zhang       for (j = 0; j < n; j++) {
2894547795f9SHong Zhang         rval = ib[j] * 3;
2895547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896547795f9SHong Zhang         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897547795f9SHong Zhang         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898547795f9SHong Zhang         v += 9;
2899547795f9SHong Zhang       }
2900547795f9SHong Zhang       if (!usecprow) xb += 3;
2901547795f9SHong Zhang     }
2902547795f9SHong Zhang     break;
2903547795f9SHong Zhang   case 4:
2904547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2905547795f9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
29069371c9d4SSatish Balay       x1 = xb[0];
29079371c9d4SSatish Balay       x2 = xb[1];
29089371c9d4SSatish Balay       x3 = xb[2];
29099371c9d4SSatish Balay       x4 = xb[3];
2910547795f9SHong Zhang       ib = idx + ii[0];
29119371c9d4SSatish Balay       n  = ii[1] - ii[0];
29129371c9d4SSatish Balay       ii++;
2913547795f9SHong Zhang       for (j = 0; j < n; j++) {
2914547795f9SHong Zhang         rval = ib[j] * 4;
2915547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916547795f9SHong Zhang         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917547795f9SHong Zhang         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918547795f9SHong Zhang         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919547795f9SHong Zhang         v += 16;
2920547795f9SHong Zhang       }
2921547795f9SHong Zhang       if (!usecprow) xb += 4;
2922547795f9SHong Zhang     }
2923547795f9SHong Zhang     break;
2924547795f9SHong Zhang   case 5:
2925547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2926547795f9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
29279371c9d4SSatish Balay       x1 = xb[0];
29289371c9d4SSatish Balay       x2 = xb[1];
29299371c9d4SSatish Balay       x3 = xb[2];
29309371c9d4SSatish Balay       x4 = xb[3];
29319371c9d4SSatish Balay       x5 = xb[4];
2932547795f9SHong Zhang       ib = idx + ii[0];
29339371c9d4SSatish Balay       n  = ii[1] - ii[0];
29349371c9d4SSatish Balay       ii++;
2935547795f9SHong Zhang       for (j = 0; j < n; j++) {
2936547795f9SHong Zhang         rval = ib[j] * 5;
2937547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938547795f9SHong Zhang         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939547795f9SHong Zhang         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940547795f9SHong Zhang         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941547795f9SHong Zhang         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942547795f9SHong Zhang         v += 25;
2943547795f9SHong Zhang       }
2944547795f9SHong Zhang       if (!usecprow) xb += 5;
2945547795f9SHong Zhang     }
2946547795f9SHong Zhang     break;
29479371c9d4SSatish Balay   default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2948968ae2c8SSatish Balay #if 0
2949968ae2c8SSatish Balay     {
2950b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
2951b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
2952d9ca1df4SBarry Smith       const PetscScalar *xtmp;
2953547795f9SHong Zhang       if (!a->mult_work) {
2954547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
29559566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2956547795f9SHong Zhang       }
2957547795f9SHong Zhang       work = a->mult_work;
2958547795f9SHong Zhang       xtmp = x;
2959547795f9SHong Zhang       for (i=0; i<mbs; i++) {
2960547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
2961547795f9SHong Zhang         ncols = n*bs;
29629566063dSJacob Faibussowitsch         PetscCall(PetscArrayzero(work,ncols));
296326fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
296496b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2965547795f9SHong Zhang         v += n*bs2;
2966547795f9SHong Zhang         if (!usecprow) xtmp += bs;
2967547795f9SHong Zhang         workt = work;
2968547795f9SHong Zhang         for (j=0; j<n; j++) {
2969547795f9SHong Zhang           zb = z + bs*(*idx++);
2970547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2971547795f9SHong Zhang           workt += bs;
2972547795f9SHong Zhang         }
2973547795f9SHong Zhang       }
2974547795f9SHong Zhang     }
2975968ae2c8SSatish Balay #endif
2976547795f9SHong Zhang   }
29779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
29789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
29799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2980547795f9SHong Zhang   PetscFunctionReturn(0);
2981547795f9SHong Zhang }
2982547795f9SHong Zhang 
29839371c9d4SSatish Balay PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
29842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2985d9ca1df4SBarry Smith   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2986f4259b30SLisandro Dalcin   const PetscScalar *x, *xb = NULL;
2987d9ca1df4SBarry Smith   const MatScalar   *v;
2988d9ca1df4SBarry Smith   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2989d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
29903447b6efSHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2991ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
29922d61bbb3SSatish Balay 
29932d61bbb3SSatish Balay   PetscFunctionBegin;
29949566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
29959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
29969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
29972d61bbb3SSatish Balay 
29982d61bbb3SSatish Balay   idx = a->j;
29992d61bbb3SSatish Balay   v   = a->a;
30003447b6efSHong Zhang   if (usecprow) {
30013447b6efSHong Zhang     mbs  = cprow.nrows;
30023447b6efSHong Zhang     ii   = cprow.i;
30037b2bb3b9SHong Zhang     ridx = cprow.rindex;
30043447b6efSHong Zhang   } else {
30053447b6efSHong Zhang     mbs = a->mbs;
30062d61bbb3SSatish Balay     ii  = a->i;
3007f1af5d2fSBarry Smith     xb  = x;
30083447b6efSHong Zhang   }
30092d61bbb3SSatish Balay 
30102d61bbb3SSatish Balay   switch (bs) {
30112d61bbb3SSatish Balay   case 1:
30122d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30137b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
3014f1af5d2fSBarry Smith       x1 = xb[0];
30153447b6efSHong Zhang       ib = idx + ii[0];
30169371c9d4SSatish Balay       n  = ii[1] - ii[0];
30179371c9d4SSatish Balay       ii++;
30182d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30192d61bbb3SSatish Balay         rval = ib[j];
3020f1af5d2fSBarry Smith         z[rval] += *v * x1;
3021f1af5d2fSBarry Smith         v++;
30222d61bbb3SSatish Balay       }
30233447b6efSHong Zhang       if (!usecprow) xb++;
30242d61bbb3SSatish Balay     }
30252d61bbb3SSatish Balay     break;
30262d61bbb3SSatish Balay   case 2:
30272d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30287b2bb3b9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
30299371c9d4SSatish Balay       x1 = xb[0];
30309371c9d4SSatish Balay       x2 = xb[1];
30313447b6efSHong Zhang       ib = idx + ii[0];
30329371c9d4SSatish Balay       n  = ii[1] - ii[0];
30339371c9d4SSatish Balay       ii++;
30342d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30352d61bbb3SSatish Balay         rval = ib[j] * 2;
30362d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2;
30372d61bbb3SSatish Balay         z[rval++] += v[2] * x1 + v[3] * x2;
30382d61bbb3SSatish Balay         v += 4;
30392d61bbb3SSatish Balay       }
30403447b6efSHong Zhang       if (!usecprow) xb += 2;
30412d61bbb3SSatish Balay     }
30422d61bbb3SSatish Balay     break;
30432d61bbb3SSatish Balay   case 3:
30442d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30457b2bb3b9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
30469371c9d4SSatish Balay       x1 = xb[0];
30479371c9d4SSatish Balay       x2 = xb[1];
30489371c9d4SSatish Balay       x3 = xb[2];
30493447b6efSHong Zhang       ib = idx + ii[0];
30509371c9d4SSatish Balay       n  = ii[1] - ii[0];
30519371c9d4SSatish Balay       ii++;
30522d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30532d61bbb3SSatish Balay         rval = ib[j] * 3;
30542d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
30552d61bbb3SSatish Balay         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
30562d61bbb3SSatish Balay         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
30572d61bbb3SSatish Balay         v += 9;
30582d61bbb3SSatish Balay       }
30593447b6efSHong Zhang       if (!usecprow) xb += 3;
30602d61bbb3SSatish Balay     }
30612d61bbb3SSatish Balay     break;
30622d61bbb3SSatish Balay   case 4:
30632d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30647b2bb3b9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
30659371c9d4SSatish Balay       x1 = xb[0];
30669371c9d4SSatish Balay       x2 = xb[1];
30679371c9d4SSatish Balay       x3 = xb[2];
30689371c9d4SSatish Balay       x4 = xb[3];
30693447b6efSHong Zhang       ib = idx + ii[0];
30709371c9d4SSatish Balay       n  = ii[1] - ii[0];
30719371c9d4SSatish Balay       ii++;
30722d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30732d61bbb3SSatish Balay         rval = ib[j] * 4;
30742d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
30752d61bbb3SSatish Balay         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
30762d61bbb3SSatish Balay         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
30772d61bbb3SSatish Balay         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
30782d61bbb3SSatish Balay         v += 16;
30792d61bbb3SSatish Balay       }
30803447b6efSHong Zhang       if (!usecprow) xb += 4;
30812d61bbb3SSatish Balay     }
30822d61bbb3SSatish Balay     break;
30832d61bbb3SSatish Balay   case 5:
30842d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30857b2bb3b9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
30869371c9d4SSatish Balay       x1 = xb[0];
30879371c9d4SSatish Balay       x2 = xb[1];
30889371c9d4SSatish Balay       x3 = xb[2];
30899371c9d4SSatish Balay       x4 = xb[3];
30909371c9d4SSatish Balay       x5 = xb[4];
30913447b6efSHong Zhang       ib = idx + ii[0];
30929371c9d4SSatish Balay       n  = ii[1] - ii[0];
30939371c9d4SSatish Balay       ii++;
30942d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30952d61bbb3SSatish Balay         rval = ib[j] * 5;
30962d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
30972d61bbb3SSatish Balay         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
30982d61bbb3SSatish Balay         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
30992d61bbb3SSatish Balay         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
31002d61bbb3SSatish Balay         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
31012d61bbb3SSatish Balay         v += 25;
31022d61bbb3SSatish Balay       }
31033447b6efSHong Zhang       if (!usecprow) xb += 5;
31042d61bbb3SSatish Balay     }
31052d61bbb3SSatish Balay     break;
3106f1af5d2fSBarry Smith   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3107690b6cddSBarry Smith     PetscInt           ncols, k;
3108d9ca1df4SBarry Smith     PetscScalar       *work, *workt;
3109d9ca1df4SBarry Smith     const PetscScalar *xtmp;
31102d61bbb3SSatish Balay     if (!a->mult_work) {
3111d0f46423SBarry Smith       k = PetscMax(A->rmap->n, A->cmap->n);
31129566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
31132d61bbb3SSatish Balay     }
31142d61bbb3SSatish Balay     work = a->mult_work;
31153447b6efSHong Zhang     xtmp = x;
31162d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31179371c9d4SSatish Balay       n = ii[1] - ii[0];
31189371c9d4SSatish Balay       ii++;
31192d61bbb3SSatish Balay       ncols = n * bs;
31209566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(work, ncols));
312126fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs * ridx[i];
312296b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
31232d61bbb3SSatish Balay       v += n * bs2;
31243447b6efSHong Zhang       if (!usecprow) xtmp += bs;
31252d61bbb3SSatish Balay       workt = work;
31262d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31272d61bbb3SSatish Balay         zb = z + bs * (*idx++);
31282d61bbb3SSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
31292d61bbb3SSatish Balay         workt += bs;
31302d61bbb3SSatish Balay       }
31312d61bbb3SSatish Balay     }
31322d61bbb3SSatish Balay   }
31332d61bbb3SSatish Balay   }
31349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
31359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
31369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
31372d61bbb3SSatish Balay   PetscFunctionReturn(0);
31382d61bbb3SSatish Balay }
31392d61bbb3SSatish Balay 
31409371c9d4SSatish Balay PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) {
31412d61bbb3SSatish Balay   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3142690b6cddSBarry Smith   PetscInt     totalnz = a->bs2 * a->nz;
3143f4df32b1SMatthew Knepley   PetscScalar  oalpha  = alpha;
3144c5df96a5SBarry Smith   PetscBLASInt one     = 1, tnz;
31452d61bbb3SSatish Balay 
31462d61bbb3SSatish Balay   PetscFunctionBegin;
31479566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3148792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
31499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
31502d61bbb3SSatish Balay   PetscFunctionReturn(0);
31512d61bbb3SSatish Balay }
31522d61bbb3SSatish Balay 
31539371c9d4SSatish Balay PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) {
31542d61bbb3SSatish Balay   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
31553f1db9ecSBarry Smith   MatScalar   *v   = a->a;
3156329f5518SBarry Smith   PetscReal    sum = 0.0;
3157d0f46423SBarry Smith   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
31582d61bbb3SSatish Balay 
31592d61bbb3SSatish Balay   PetscFunctionBegin;
31602d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
3161570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
3162570b7f6dSBarry Smith     PetscBLASInt one = 1, cnt = bs2 * nz;
3163792fecdfSBarry Smith     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3164570b7f6dSBarry Smith #else
31652d61bbb3SSatish Balay     for (i = 0; i < bs2 * nz; i++) {
31669371c9d4SSatish Balay       sum += PetscRealPart(PetscConj(*v) * (*v));
31679371c9d4SSatish Balay       v++;
31682d61bbb3SSatish Balay     }
3169570b7f6dSBarry Smith #endif
31708f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
31719566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
31728a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
31738a62d963SHong Zhang     PetscReal *tmp;
31748a62d963SHong Zhang     PetscInt  *bcol = a->j;
31759566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
31768a62d963SHong Zhang     for (i = 0; i < nz; i++) {
31778a62d963SHong Zhang       for (j = 0; j < bs; j++) {
31788a62d963SHong Zhang         k1 = bs * (*bcol) + j; /* column index */
31798a62d963SHong Zhang         for (k = 0; k < bs; k++) {
31809371c9d4SSatish Balay           tmp[k1] += PetscAbsScalar(*v);
31819371c9d4SSatish Balay           v++;
31828a62d963SHong Zhang         }
31838a62d963SHong Zhang       }
31848a62d963SHong Zhang       bcol++;
31858a62d963SHong Zhang     }
31868a62d963SHong Zhang     *norm = 0.0;
3187d0f46423SBarry Smith     for (j = 0; j < A->cmap->n; j++) {
31888a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
31898a62d963SHong Zhang     }
31909566063dSJacob Faibussowitsch     PetscCall(PetscFree(tmp));
31919566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3192596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
3193596552b5SBarry Smith     *norm = 0.0;
3194596552b5SBarry Smith     for (k = 0; k < bs; k++) {
319574f84c7bSSatish Balay       for (j = 0; j < a->mbs; j++) {
3196596552b5SBarry Smith         v   = a->a + bs2 * a->i[j] + k;
3197596552b5SBarry Smith         sum = 0.0;
3198596552b5SBarry Smith         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
31990e90e235SBarry Smith           for (k1 = 0; k1 < bs; k1++) {
3200596552b5SBarry Smith             sum += PetscAbsScalar(*v);
3201596552b5SBarry Smith             v += bs;
32022d61bbb3SSatish Balay           }
32030e90e235SBarry Smith         }
3204596552b5SBarry Smith         if (sum > *norm) *norm = sum;
3205596552b5SBarry Smith       }
3206596552b5SBarry Smith     }
32079566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3208e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
32092d61bbb3SSatish Balay   PetscFunctionReturn(0);
32102d61bbb3SSatish Balay }
32112d61bbb3SSatish Balay 
32129371c9d4SSatish Balay PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) {
32132d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
32142d61bbb3SSatish Balay 
32152d61bbb3SSatish Balay   PetscFunctionBegin;
32162d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3217d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3218273d9f13SBarry Smith     *flg = PETSC_FALSE;
3219273d9f13SBarry Smith     PetscFunctionReturn(0);
32202d61bbb3SSatish Balay   }
32212d61bbb3SSatish Balay 
32222d61bbb3SSatish Balay   /* if the a->i are the same */
32239566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
322426fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
32252d61bbb3SSatish Balay 
32262d61bbb3SSatish Balay   /* if a->j are the same */
32279566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
322826fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
322926fbe8dcSKarl Rupp 
32302d61bbb3SSatish Balay   /* if a->a are the same */
32319566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
32322d61bbb3SSatish Balay   PetscFunctionReturn(0);
32332d61bbb3SSatish Balay }
32342d61bbb3SSatish Balay 
32359371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) {
32362d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3237690b6cddSBarry Smith   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
323887828ca2SBarry Smith   PetscScalar *x, zero = 0.0;
32393f1db9ecSBarry Smith   MatScalar   *aa, *aa_j;
32402d61bbb3SSatish Balay 
32412d61bbb3SSatish Balay   PetscFunctionBegin;
324228b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3243d0f46423SBarry Smith   bs   = A->rmap->bs;
32442d61bbb3SSatish Balay   aa   = a->a;
32452d61bbb3SSatish Balay   ai   = a->i;
32462d61bbb3SSatish Balay   aj   = a->j;
32472d61bbb3SSatish Balay   ambs = a->mbs;
32482d61bbb3SSatish Balay   bs2  = a->bs2;
32492d61bbb3SSatish Balay 
32509566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
32519566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
32529566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
325308401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
32542d61bbb3SSatish Balay   for (i = 0; i < ambs; i++) {
32552d61bbb3SSatish Balay     for (j = ai[i]; j < ai[i + 1]; j++) {
32562d61bbb3SSatish Balay       if (aj[j] == i) {
32572d61bbb3SSatish Balay         row  = i * bs;
32582d61bbb3SSatish Balay         aa_j = aa + j * bs2;
32592d61bbb3SSatish Balay         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
32602d61bbb3SSatish Balay         break;
32612d61bbb3SSatish Balay       }
32622d61bbb3SSatish Balay     }
32632d61bbb3SSatish Balay   }
32649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
32652d61bbb3SSatish Balay   PetscFunctionReturn(0);
32662d61bbb3SSatish Balay }
32672d61bbb3SSatish Balay 
32689371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) {
32692d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
327053ef36baSBarry Smith   const PetscScalar *l, *r, *li, *ri;
327153ef36baSBarry Smith   PetscScalar        x;
32723f1db9ecSBarry Smith   MatScalar         *aa, *v;
327353ef36baSBarry Smith   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
327453ef36baSBarry Smith   const PetscInt    *ai, *aj;
32752d61bbb3SSatish Balay 
32762d61bbb3SSatish Balay   PetscFunctionBegin;
32772d61bbb3SSatish Balay   ai  = a->i;
32782d61bbb3SSatish Balay   aj  = a->j;
32792d61bbb3SSatish Balay   aa  = a->a;
3280d0f46423SBarry Smith   m   = A->rmap->n;
3281d0f46423SBarry Smith   n   = A->cmap->n;
3282d0f46423SBarry Smith   bs  = A->rmap->bs;
32832d61bbb3SSatish Balay   mbs = a->mbs;
32842d61bbb3SSatish Balay   bs2 = a->bs2;
32852d61bbb3SSatish Balay   if (ll) {
32869566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(ll, &l));
32879566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(ll, &lm));
328808401ef6SPierre Jolivet     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
32892d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
32902d61bbb3SSatish Balay       M  = ai[i + 1] - ai[i];
32912d61bbb3SSatish Balay       li = l + i * bs;
32922d61bbb3SSatish Balay       v  = aa + bs2 * ai[i];
32932d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
32949371c9d4SSatish Balay         for (k = 0; k < bs2; k++) { (*v++) *= li[k % bs]; }
32952d61bbb3SSatish Balay       }
32962d61bbb3SSatish Balay     }
32979566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(ll, &l));
32989566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
32992d61bbb3SSatish Balay   }
33002d61bbb3SSatish Balay 
33012d61bbb3SSatish Balay   if (rr) {
33029566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(rr, &r));
33039566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(rr, &rn));
330408401ef6SPierre Jolivet     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
33052d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
330653ef36baSBarry Smith       iai = ai[i];
330753ef36baSBarry Smith       M   = ai[i + 1] - iai;
330853ef36baSBarry Smith       v   = aa + bs2 * iai;
33092d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
331053ef36baSBarry Smith         ri = r + bs * aj[iai + j];
33112d61bbb3SSatish Balay         for (k = 0; k < bs; k++) {
33122d61bbb3SSatish Balay           x = ri[k];
331353ef36baSBarry Smith           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
331453ef36baSBarry Smith           v += bs;
33152d61bbb3SSatish Balay         }
33162d61bbb3SSatish Balay       }
33172d61bbb3SSatish Balay     }
33189566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(rr, &r));
33199566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33202d61bbb3SSatish Balay   }
33212d61bbb3SSatish Balay   PetscFunctionReturn(0);
33222d61bbb3SSatish Balay }
33232d61bbb3SSatish Balay 
33249371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
33252d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33262d61bbb3SSatish Balay 
33272d61bbb3SSatish Balay   PetscFunctionBegin;
33282d61bbb3SSatish Balay   info->block_size   = a->bs2;
3329ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz;
33302d61bbb3SSatish Balay   info->nz_used      = a->bs2 * a->nz;
33313966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
33322d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
33338e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
33347adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
3335d5f3da31SBarry Smith   if (A->factortype) {
33362d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
33372d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
33382d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
33392d61bbb3SSatish Balay   } else {
33402d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
33412d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
33422d61bbb3SSatish Balay     info->factor_mallocs    = 0;
33432d61bbb3SSatish Balay   }
33442d61bbb3SSatish Balay   PetscFunctionReturn(0);
33452d61bbb3SSatish Balay }
33462d61bbb3SSatish Balay 
33479371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) {
33482d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33492d61bbb3SSatish Balay 
33502d61bbb3SSatish Balay   PetscFunctionBegin;
33519566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
33522d61bbb3SSatish Balay   PetscFunctionReturn(0);
33532d61bbb3SSatish Balay }
3354a001520aSPierre Jolivet 
33559371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
3356a001520aSPierre Jolivet   PetscFunctionBegin;
33579566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
33584222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3359a001520aSPierre Jolivet   PetscFunctionReturn(0);
3360a001520aSPierre Jolivet }
3361a001520aSPierre Jolivet 
33629371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
336374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3364f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1;
3365bcf10a7aSPierre Jolivet   const PetscScalar *xb;
336674eeabc5SPierre Jolivet   PetscScalar        x1;
336774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
336874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
336974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
337074eeabc5SPierre Jolivet 
337174eeabc5SPierre Jolivet   PetscFunctionBegin;
337274eeabc5SPierre Jolivet   idx = a->j;
337374eeabc5SPierre Jolivet   v   = a->a;
337474eeabc5SPierre Jolivet   if (usecprow) {
337574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
337674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
337774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
337874eeabc5SPierre Jolivet   } else {
337974eeabc5SPierre Jolivet     mbs = a->mbs;
338074eeabc5SPierre Jolivet     ii  = a->i;
338174eeabc5SPierre Jolivet     z   = c;
338274eeabc5SPierre Jolivet   }
338374eeabc5SPierre Jolivet 
338474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
33859371c9d4SSatish Balay     n = ii[1] - ii[0];
33869371c9d4SSatish Balay     ii++;
338774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
338874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
338974eeabc5SPierre Jolivet     if (usecprow) z = c + ridx[i];
339074eeabc5SPierre Jolivet     jj = idx;
339174eeabc5SPierre Jolivet     vv = v;
339274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
339374eeabc5SPierre Jolivet       idx  = jj;
339474eeabc5SPierre Jolivet       v    = vv;
339574eeabc5SPierre Jolivet       sum1 = 0.0;
339674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
33979371c9d4SSatish Balay         xb = b + (*idx++);
33989371c9d4SSatish Balay         x1 = xb[0 + k * bm];
339974eeabc5SPierre Jolivet         sum1 += v[0] * x1;
340074eeabc5SPierre Jolivet         v += 1;
340174eeabc5SPierre Jolivet       }
3402feb237baSPierre Jolivet       z[0 + k * cm] = sum1;
340374eeabc5SPierre Jolivet     }
340474eeabc5SPierre Jolivet     if (!usecprow) z += 1;
340574eeabc5SPierre Jolivet   }
340674eeabc5SPierre Jolivet   PetscFunctionReturn(0);
340774eeabc5SPierre Jolivet }
340874eeabc5SPierre Jolivet 
34099371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
34104b7054f4SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3411f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2;
3412bcf10a7aSPierre Jolivet   const PetscScalar *xb;
34134b7054f4SPierre Jolivet   PetscScalar        x1, x2;
34144b7054f4SPierre Jolivet   const MatScalar   *v, *vv;
34154b7054f4SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
34164b7054f4SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
34174b7054f4SPierre Jolivet 
34184b7054f4SPierre Jolivet   PetscFunctionBegin;
34194b7054f4SPierre Jolivet   idx = a->j;
34204b7054f4SPierre Jolivet   v   = a->a;
34214b7054f4SPierre Jolivet   if (usecprow) {
34224b7054f4SPierre Jolivet     mbs  = a->compressedrow.nrows;
34234b7054f4SPierre Jolivet     ii   = a->compressedrow.i;
34244b7054f4SPierre Jolivet     ridx = a->compressedrow.rindex;
34254b7054f4SPierre Jolivet   } else {
34264b7054f4SPierre Jolivet     mbs = a->mbs;
34274b7054f4SPierre Jolivet     ii  = a->i;
34284b7054f4SPierre Jolivet     z   = c;
34294b7054f4SPierre Jolivet   }
34304b7054f4SPierre Jolivet 
34314b7054f4SPierre Jolivet   for (i = 0; i < mbs; i++) {
34329371c9d4SSatish Balay     n = ii[1] - ii[0];
34339371c9d4SSatish Balay     ii++;
34344b7054f4SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
34354b7054f4SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
34364b7054f4SPierre Jolivet     if (usecprow) z = c + 2 * ridx[i];
34374b7054f4SPierre Jolivet     jj = idx;
34384b7054f4SPierre Jolivet     vv = v;
34394b7054f4SPierre Jolivet     for (k = 0; k < cn; k++) {
34404b7054f4SPierre Jolivet       idx  = jj;
34414b7054f4SPierre Jolivet       v    = vv;
34429371c9d4SSatish Balay       sum1 = 0.0;
34439371c9d4SSatish Balay       sum2 = 0.0;
34444b7054f4SPierre Jolivet       for (j = 0; j < n; j++) {
34459371c9d4SSatish Balay         xb = b + 2 * (*idx++);
34469371c9d4SSatish Balay         x1 = xb[0 + k * bm];
34479371c9d4SSatish Balay         x2 = xb[1 + k * bm];
34484b7054f4SPierre Jolivet         sum1 += v[0] * x1 + v[2] * x2;
34494b7054f4SPierre Jolivet         sum2 += v[1] * x1 + v[3] * x2;
34504b7054f4SPierre Jolivet         v += 4;
34514b7054f4SPierre Jolivet       }
34529371c9d4SSatish Balay       z[0 + k * cm] = sum1;
34539371c9d4SSatish Balay       z[1 + k * cm] = sum2;
34544b7054f4SPierre Jolivet     }
34554b7054f4SPierre Jolivet     if (!usecprow) z += 2;
34564b7054f4SPierre Jolivet   }
34574b7054f4SPierre Jolivet   PetscFunctionReturn(0);
34584b7054f4SPierre Jolivet }
34594b7054f4SPierre Jolivet 
34609371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
346174eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3462f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3;
3463bcf10a7aSPierre Jolivet   const PetscScalar *xb;
346474eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3;
346574eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
346674eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
346774eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
346874eeabc5SPierre Jolivet 
346974eeabc5SPierre Jolivet   PetscFunctionBegin;
347074eeabc5SPierre Jolivet   idx = a->j;
347174eeabc5SPierre Jolivet   v   = a->a;
347274eeabc5SPierre Jolivet   if (usecprow) {
347374eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
347474eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
347574eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
347674eeabc5SPierre Jolivet   } else {
347774eeabc5SPierre Jolivet     mbs = a->mbs;
347874eeabc5SPierre Jolivet     ii  = a->i;
347974eeabc5SPierre Jolivet     z   = c;
348074eeabc5SPierre Jolivet   }
348174eeabc5SPierre Jolivet 
348274eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
34839371c9d4SSatish Balay     n = ii[1] - ii[0];
34849371c9d4SSatish Balay     ii++;
348574eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
348674eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
348774eeabc5SPierre Jolivet     if (usecprow) z = c + 3 * ridx[i];
348874eeabc5SPierre Jolivet     jj = idx;
348974eeabc5SPierre Jolivet     vv = v;
349074eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
349174eeabc5SPierre Jolivet       idx  = jj;
349274eeabc5SPierre Jolivet       v    = vv;
34939371c9d4SSatish Balay       sum1 = 0.0;
34949371c9d4SSatish Balay       sum2 = 0.0;
34959371c9d4SSatish Balay       sum3 = 0.0;
349674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
34979371c9d4SSatish Balay         xb = b + 3 * (*idx++);
34989371c9d4SSatish Balay         x1 = xb[0 + k * bm];
34999371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35009371c9d4SSatish Balay         x3 = xb[2 + k * bm];
350174eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
350274eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
350374eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
350474eeabc5SPierre Jolivet         v += 9;
350574eeabc5SPierre Jolivet       }
35069371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35079371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35089371c9d4SSatish Balay       z[2 + k * cm] = sum3;
350974eeabc5SPierre Jolivet     }
351074eeabc5SPierre Jolivet     if (!usecprow) z += 3;
351174eeabc5SPierre Jolivet   }
351274eeabc5SPierre Jolivet   PetscFunctionReturn(0);
351374eeabc5SPierre Jolivet }
351474eeabc5SPierre Jolivet 
35159371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
351674eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3517f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3518bcf10a7aSPierre Jolivet   const PetscScalar *xb;
351974eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
352074eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
352174eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
352274eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
352374eeabc5SPierre Jolivet 
352474eeabc5SPierre Jolivet   PetscFunctionBegin;
352574eeabc5SPierre Jolivet   idx = a->j;
352674eeabc5SPierre Jolivet   v   = a->a;
352774eeabc5SPierre Jolivet   if (usecprow) {
352874eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
352974eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
353074eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
353174eeabc5SPierre Jolivet   } else {
353274eeabc5SPierre Jolivet     mbs = a->mbs;
353374eeabc5SPierre Jolivet     ii  = a->i;
353474eeabc5SPierre Jolivet     z   = c;
353574eeabc5SPierre Jolivet   }
353674eeabc5SPierre Jolivet 
353774eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35389371c9d4SSatish Balay     n = ii[1] - ii[0];
35399371c9d4SSatish Balay     ii++;
354074eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
354174eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
354274eeabc5SPierre Jolivet     if (usecprow) z = c + 4 * ridx[i];
354374eeabc5SPierre Jolivet     jj = idx;
354474eeabc5SPierre Jolivet     vv = v;
354574eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
354674eeabc5SPierre Jolivet       idx  = jj;
354774eeabc5SPierre Jolivet       v    = vv;
35489371c9d4SSatish Balay       sum1 = 0.0;
35499371c9d4SSatish Balay       sum2 = 0.0;
35509371c9d4SSatish Balay       sum3 = 0.0;
35519371c9d4SSatish Balay       sum4 = 0.0;
355274eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
35539371c9d4SSatish Balay         xb = b + 4 * (*idx++);
35549371c9d4SSatish Balay         x1 = xb[0 + k * bm];
35559371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35569371c9d4SSatish Balay         x3 = xb[2 + k * bm];
35579371c9d4SSatish Balay         x4 = xb[3 + k * bm];
355874eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
355974eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
356074eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
356174eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
356274eeabc5SPierre Jolivet         v += 16;
356374eeabc5SPierre Jolivet       }
35649371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35659371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35669371c9d4SSatish Balay       z[2 + k * cm] = sum3;
35679371c9d4SSatish Balay       z[3 + k * cm] = sum4;
356874eeabc5SPierre Jolivet     }
356974eeabc5SPierre Jolivet     if (!usecprow) z += 4;
357074eeabc5SPierre Jolivet   }
357174eeabc5SPierre Jolivet   PetscFunctionReturn(0);
357274eeabc5SPierre Jolivet }
357374eeabc5SPierre Jolivet 
35749371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
357574eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3576f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3577bcf10a7aSPierre Jolivet   const PetscScalar *xb;
357874eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
357974eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
358074eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
358174eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
358274eeabc5SPierre Jolivet 
358374eeabc5SPierre Jolivet   PetscFunctionBegin;
358474eeabc5SPierre Jolivet   idx = a->j;
358574eeabc5SPierre Jolivet   v   = a->a;
358674eeabc5SPierre Jolivet   if (usecprow) {
358774eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
358874eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
358974eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
359074eeabc5SPierre Jolivet   } else {
359174eeabc5SPierre Jolivet     mbs = a->mbs;
359274eeabc5SPierre Jolivet     ii  = a->i;
359374eeabc5SPierre Jolivet     z   = c;
359474eeabc5SPierre Jolivet   }
359574eeabc5SPierre Jolivet 
359674eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35979371c9d4SSatish Balay     n = ii[1] - ii[0];
35989371c9d4SSatish Balay     ii++;
359974eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
360074eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
360174eeabc5SPierre Jolivet     if (usecprow) z = c + 5 * ridx[i];
360274eeabc5SPierre Jolivet     jj = idx;
360374eeabc5SPierre Jolivet     vv = v;
360474eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
360574eeabc5SPierre Jolivet       idx  = jj;
360674eeabc5SPierre Jolivet       v    = vv;
36079371c9d4SSatish Balay       sum1 = 0.0;
36089371c9d4SSatish Balay       sum2 = 0.0;
36099371c9d4SSatish Balay       sum3 = 0.0;
36109371c9d4SSatish Balay       sum4 = 0.0;
36119371c9d4SSatish Balay       sum5 = 0.0;
361274eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36139371c9d4SSatish Balay         xb = b + 5 * (*idx++);
36149371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36159371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36169371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36179371c9d4SSatish Balay         x4 = xb[3 + k * bm];
36189371c9d4SSatish Balay         x5 = xb[4 + k * bm];
361974eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
362074eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
362174eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
362274eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
362374eeabc5SPierre Jolivet         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
362474eeabc5SPierre Jolivet         v += 25;
362574eeabc5SPierre Jolivet       }
36269371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36279371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36289371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36299371c9d4SSatish Balay       z[3 + k * cm] = sum4;
36309371c9d4SSatish Balay       z[4 + k * cm] = sum5;
363174eeabc5SPierre Jolivet     }
363274eeabc5SPierre Jolivet     if (!usecprow) z += 5;
363374eeabc5SPierre Jolivet   }
363474eeabc5SPierre Jolivet   PetscFunctionReturn(0);
363574eeabc5SPierre Jolivet }
363674eeabc5SPierre Jolivet 
36379371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
3638a001520aSPierre Jolivet   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3639a001520aSPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3640910cf402Sprj-   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3641bcf10a7aSPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3642a001520aSPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3643a001520aSPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
3644f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
3645a001520aSPierre Jolivet   PetscScalar     *c, *b;
3646a001520aSPierre Jolivet   const MatScalar *v;
3647a001520aSPierre Jolivet   const PetscInt  *idx, *ii, *ridx = NULL;
36484b7054f4SPierre Jolivet   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3649a001520aSPierre Jolivet   PetscBool        usecprow = a->compressedrow.use;
3650a001520aSPierre Jolivet 
3651a001520aSPierre Jolivet   PetscFunctionBegin;
3652a001520aSPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
365308401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
365408401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
365508401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3656a001520aSPierre Jolivet   b = bd->v;
3657*48a46eb9SPierre Jolivet   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
36589566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
365974eeabc5SPierre Jolivet   switch (bs) {
36609371c9d4SSatish Balay   case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
36619371c9d4SSatish Balay   case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
36629371c9d4SSatish Balay   case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
36639371c9d4SSatish Balay   case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
36649371c9d4SSatish Balay   case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
366574eeabc5SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
36669566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
36679566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
36689566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
36699566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
3670a001520aSPierre Jolivet     idx = a->j;
3671a001520aSPierre Jolivet     v   = a->a;
3672a001520aSPierre Jolivet     if (usecprow) {
3673a001520aSPierre Jolivet       mbs  = a->compressedrow.nrows;
3674a001520aSPierre Jolivet       ii   = a->compressedrow.i;
3675a001520aSPierre Jolivet       ridx = a->compressedrow.rindex;
3676a001520aSPierre Jolivet     } else {
3677a001520aSPierre Jolivet       mbs = a->mbs;
3678a001520aSPierre Jolivet       ii  = a->i;
3679a001520aSPierre Jolivet       z   = c;
3680a001520aSPierre Jolivet     }
3681a001520aSPierre Jolivet     for (i = 0; i < mbs; i++) {
36829371c9d4SSatish Balay       n = ii[1] - ii[0];
36839371c9d4SSatish Balay       ii++;
3684a001520aSPierre Jolivet       if (usecprow) z = c + bs * ridx[i];
36854b7054f4SPierre Jolivet       if (n) {
3686792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
36874b7054f4SPierre Jolivet         v += bs2;
36884b7054f4SPierre Jolivet       }
36894b7054f4SPierre Jolivet       for (j = 1; j < n; j++) {
3690792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3691a001520aSPierre Jolivet         v += bs2;
3692a001520aSPierre Jolivet       }
3693a001520aSPierre Jolivet       if (!usecprow) z += bs;
3694a001520aSPierre Jolivet     }
36954b7054f4SPierre Jolivet   }
36969566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
36979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3698a001520aSPierre Jolivet   PetscFunctionReturn(0);
3699a001520aSPierre Jolivet }
3700